Commit 0112b0f0 authored by chenzk

v1.0
# Copyright (c) 2020 Mobvoi Inc (Binbin Zhang)
# 2022 Ximalaya Inc (Yuguang Yang)
# 2024 Alibaba Inc
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from ESPnet(https://github.com/espnet/espnet)
# NeMo(https://github.com/NVIDIA/NeMo)
from typing import Union
import math
import warnings
import torch
from torch.optim.lr_scheduler import _LRScheduler
class WarmupLR(_LRScheduler):
"""The WarmupLR scheduler
This scheduler is almost same as NoamLR Scheduler except for following
difference:
NoamLR:
lr = optimizer.lr * model_size ** -0.5
* min(step ** -0.5, step * warmup_step ** -1.5)
WarmupLR:
lr = optimizer.lr * warmup_step ** 0.5
* min(step ** -0.5, step * warmup_step ** -1.5)
Note that the maximum lr equals to optimizer.lr in this scheduler.
"""
def __init__(
self,
optimizer: torch.optim.Optimizer,
warmup_steps: Union[int, float] = 25000,
last_epoch: int = -1,
):
self.warmup_steps = warmup_steps
# assign warmup_steps before super().__init__(), because step()
# is invoked inside __init__() and reads this field
super().__init__(optimizer, last_epoch)
def __repr__(self):
return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})"
def get_lr(self):
step_num = self.last_epoch + 1
if self.warmup_steps == 0:
return [lr * step_num**-0.5 for lr in self.base_lrs]
else:
return [
lr * self.warmup_steps**0.5 *
min(step_num**-0.5, step_num * self.warmup_steps**-1.5)
for lr in self.base_lrs
]
def set_step(self, step: int):
self.last_epoch = step
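# Hypothetical usage sketch (illustrative values, not from the original
# recipes): with a base lr of 1e-3 and warmup_steps=100, WarmupLR ramps
# linearly up to the base lr at step 100, then decays with step**-0.5.
def _demo_warmup_lr():
    model = torch.nn.Linear(4, 4)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = WarmupLR(optimizer, warmup_steps=100)
    lrs = []
    for _ in range(300):
        optimizer.step()
        scheduler.step()
        lrs.append(scheduler.get_last_lr()[0])
    return lrs  # peaks at ~1e-3 around step 100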
class WarmupPolicy(_LRScheduler):
"""Adds warmup kwargs and warmup logic to lr policy.
All arguments should be passed as kwargs for clarity.
Args:
warmup_steps: Number of training steps in warmup stage
warmup_ratio: Ratio of warmup steps to total steps
max_steps: Total number of steps while training or `None` for
infinite training
"""
def __init__(self,
optimizer,
*,
warmup_steps=None,
warmup_ratio=None,
max_steps=None,
min_lr=0.0,
last_epoch=-1):
assert not (warmup_steps is not None and warmup_ratio is not None),\
"Either use a particular number of steps or a ratio"
assert warmup_ratio is None or max_steps is not None, \
"If there is a ratio, there should be a total number of steps"
# It is necessary to assign all attributes *before* __init__,
# as the class is wrapped by an inner class.
self.max_steps = max_steps
if warmup_steps is not None:
self.warmup_steps = warmup_steps
elif warmup_ratio is not None:
self.warmup_steps = int(warmup_ratio * max_steps)
else:
self.warmup_steps = 0
self.min_lr = min_lr
super().__init__(optimizer, last_epoch)
def get_lr(self):
if not self._get_lr_called_within_step:
warnings.warn(
"To get the last learning rate computed "
"by the scheduler, please use `get_last_lr()`.",
UserWarning,
stacklevel=2)
step = self.last_epoch
if step <= self.warmup_steps and self.warmup_steps > 0:
return self._get_warmup_lr(step)
if step > self.max_steps:
return [self.min_lr for _ in self.base_lrs]
return self._get_lr(step)
def _get_warmup_lr(self, step):
lr_val = (step + 1) / (self.warmup_steps + 1)
return [initial_lr * lr_val for initial_lr in self.base_lrs]
def _get_lr(self, step):
"""Simple const lr policy"""
return self.base_lrs
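# Hypothetical sketch (illustrative numbers): warmup can be given either as
# an absolute step count or as a ratio of max_steps; both resolve to the same
# self.warmup_steps, and warmup_ratio requires max_steps to be set.
def _demo_warmup_policy():
    model = torch.nn.Linear(4, 4)
    opt_a = torch.optim.SGD(model.parameters(), lr=1e-3)
    by_steps = WarmupPolicy(opt_a, warmup_steps=1000, max_steps=10000)
    opt_b = torch.optim.SGD(model.parameters(), lr=1e-3)
    by_ratio = WarmupPolicy(opt_b, warmup_ratio=0.1, max_steps=10000)
    assert by_steps.warmup_steps == by_ratio.warmup_steps == 1000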
class SquareRootConstantPolicy(_LRScheduler):
"""Adds warmup kwargs and warmup logic to lr policy.
All arguments should be passed as kwargs for clarity.
Args:
warmup_steps: Number of training steps in warmup stage
warmup_ratio: Ratio of warmup steps to total steps
max_steps: Total number of steps while training or `None` for
infinite training
"""
def __init__(self,
optimizer,
*,
constant_steps=None,
constant_ratio=None,
max_steps=None,
min_lr=0.0,
last_epoch=-1):
assert not (constant_steps is not None
and constant_ratio is not None), \
"Either use a particular number of steps or a ratio"
assert constant_ratio is None or max_steps is not None, \
"If there is a ratio, there should be a total number of steps"
# It is necessary to assign all attributes *before* __init__,
# as the class is wrapped by an inner class.
self.max_steps = max_steps
if constant_steps is not None:
self.constant_steps = constant_steps
elif constant_ratio is not None:
self.constant_steps = int(constant_ratio * max_steps)
else:
self.constant_steps = 0
# use the resolved self.constant_steps and guard against a zero count
self.constant_lr = 1 / (self.constant_steps**0.5) if self.constant_steps > 0 else 0.0
self.min_lr = min_lr
super().__init__(optimizer, last_epoch)
def get_lr(self):
if not self._get_lr_called_within_step:
warnings.warn(
"To get the last learning rate computed "
"by the scheduler, please use `get_last_lr()`.",
UserWarning,
stacklevel=2)
step = self.last_epoch
if step <= self.constant_steps:
return [self.constant_lr for _ in self.base_lrs]
if step > self.max_steps:
return [self.min_lr for _ in self.base_lrs]
return self._get_lr(step)
def _get_lr(self, step):
"""Simple const lr policy"""
return self.base_lrs
class WarmupHoldPolicy(WarmupPolicy):
"""Variant of WarmupPolicy which maintains high
learning rate for a defined number of steps.
All arguments should be passed as kwargs for clarity.
Args:
warmup_steps: Number of training steps in warmup stage
warmup_ratio: Ratio of warmup steps to total steps
hold_steps: Number of training steps to
hold the learning rate after warm up
hold_ratio: Ratio of hold steps to total steps
max_steps: Total number of steps while training or `None` for
infinite training
"""
def __init__(
self,
optimizer,
*,
warmup_steps=None,
warmup_ratio=None,
hold_steps=None,
hold_ratio=None,
max_steps=None,
min_lr=0.0,
last_epoch=-1,
):
assert not (hold_steps is not None and hold_ratio is not None), \
"Either use a particular number of steps or a ratio"
assert hold_ratio is None or max_steps is not None, \
"If there is a ratio, there should be a total number of steps"
self.min_lr = min_lr
self._last_warmup_lr = 0.0
# Necessary to duplicate as class attributes are hidden in inner class
self.max_steps = max_steps
if warmup_steps is not None:
self.warmup_steps = warmup_steps
elif warmup_ratio is not None:
self.warmup_steps = int(warmup_ratio * max_steps)
else:
self.warmup_steps = 0
if hold_steps is not None:
self.hold_steps = hold_steps + self.warmup_steps
elif hold_ratio is not None:
self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps
else:
self.hold_steps = 0
super().__init__(
optimizer,
warmup_steps=warmup_steps,
warmup_ratio=warmup_ratio,
max_steps=max_steps,
last_epoch=last_epoch,
min_lr=min_lr,
)
def get_lr(self):
if not self._get_lr_called_within_step:
warnings.warn(
"To get the last learning rate computed by the scheduler,"
" "
"please use `get_last_lr()`.",
UserWarning,
stacklevel=2)
step = self.last_epoch
# Warmup phase
if step <= self.warmup_steps and self.warmup_steps > 0:
return self._get_warmup_lr(step)
# Hold phase
if (step >= self.warmup_steps) and (step < self.hold_steps):
return self.base_lrs
if step > self.max_steps:
return [self.min_lr for _ in self.base_lrs]
return self._get_lr(step)
class WarmupAnnealHoldPolicy(_LRScheduler):
"""Adds warmup kwargs and warmup logic to lr policy.
All arguments should be passed as kwargs for clarity.
Args:
warmup_steps: Number of training steps in warmup stage
warmup_ratio: Ratio of warmup steps to total steps
max_steps: Total number of steps while training or `None` for
infinite training
min_lr: Minimum lr to hold the learning rate after decay at.
constant_steps: Number of steps to keep lr constant at.
constant_ratio: Ratio of steps to keep lr constant.
"""
def __init__(
self,
optimizer,
*,
warmup_steps=None,
warmup_ratio=None,
constant_steps=None,
constant_ratio=None,
max_steps=None,
min_lr=0.0,
last_epoch=-1,
):
assert not (warmup_steps is not None
and warmup_ratio is not None), \
"Either use a particular number of steps or a ratio"
assert not (constant_steps is not None
and constant_ratio is not None), \
"Either use constant_steps or constant_ratio"
assert warmup_ratio is None or max_steps is not None, \
"If there is a ratio, there should be a total number of steps"
# It is necessary to assign all attributes *before* __init__,
# as the class is wrapped by an inner class.
self.max_steps = max_steps
if warmup_steps is not None:
self.warmup_steps = warmup_steps
elif warmup_ratio is not None:
self.warmup_steps = int(warmup_ratio * max_steps)
else:
self.warmup_steps = 0
if constant_steps is not None:
self.constant_steps = constant_steps
elif constant_ratio is not None:
self.constant_steps = int(constant_ratio * max_steps)
else:
self.constant_steps = 0
self.decay_steps = max_steps - (self.constant_steps +
self.warmup_steps)
self.min_lr = min_lr
super().__init__(optimizer, last_epoch)
def get_lr(self):
if not self._get_lr_called_within_step:
warnings.warn(
"To get the last learning rate computed "
"by the scheduler, please use `get_last_lr()`.",
UserWarning,
stacklevel=2)
step = self.last_epoch
# Warmup steps
if self.warmup_steps > 0 and step <= self.warmup_steps:
return self._get_warmup_lr(step)
# Constant steps after warmup and decay
if self.constant_steps > 0 and (
self.warmup_steps + self.decay_steps) < step <= self.max_steps:
return self._get_constant_lr(step)
# Min lr after max steps of updates
if step > self.max_steps:
return [self.min_lr for _ in self.base_lrs]
return self._get_lr(step)
def _get_warmup_lr(self, step):
lr_val = (step + 1) / (self.warmup_steps + 1)
return [initial_lr * lr_val for initial_lr in self.base_lrs]
def _get_constant_lr(self, step):
return [self.min_lr for _ in self.base_lrs]
def _get_lr(self, step):
"""Simple const lr policy"""
return self.base_lrs
def _squareroot_annealing(initial_lr, step, max_steps, min_lr):
mult = ((max_steps - step) / max_steps)**0.5
out_lr = initial_lr * mult
out_lr = max(out_lr, min_lr)
return out_lr
def _square_annealing(initial_lr, step, max_steps, min_lr):
mult = ((max_steps - step) / max_steps)**2
out_lr = initial_lr * mult
out_lr = max(out_lr, min_lr)
return out_lr
def _cosine_annealing(initial_lr, step, max_steps, min_lr):
mult = 0.5 * (1 + math.cos(math.pi * step / max_steps))
out_lr = (initial_lr - min_lr) * mult + min_lr
return out_lr
def _linear_warmup_with_cosine_annealing(max_lr, warmup_steps, step,
decay_steps, min_lr):
assert max_lr > min_lr
# Use linear warmup for the initial part.
if warmup_steps > 0 and step <= warmup_steps:
return max_lr * float(step) / float(warmup_steps)
# For any steps larger than `decay_steps`, use `min_lr`.
if step > warmup_steps + decay_steps:
return min_lr
# If we are done with the warmup period, use the decay style.
num_steps_ = step - warmup_steps
decay_steps_ = decay_steps
decay_ratio = float(num_steps_) / float(decay_steps_)
assert decay_ratio >= 0.0
assert decay_ratio <= 1.0
delta_lr = max_lr - min_lr
coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0)
return min_lr + coeff * delta_lr
def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle):
if cycle:
multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps)
decay_steps *= multiplier
else:
step = min(step, decay_steps)
p = step / decay_steps
lr = (initial_lr - min_lr) * math.pow(1.0 - p, power)
lr += min_lr
return lr
def _noam_hold_annealing(initial_lr, step, warmup_steps, hold_steps,
decay_rate, min_lr):
# hold_steps = total number of steps
# to hold the LR, not the warmup + hold steps.
T_warmup_decay = max(1, warmup_steps**decay_rate)
T_hold_decay = max(1, (step - hold_steps)**decay_rate)
lr = (initial_lr * T_warmup_decay) / T_hold_decay
lr = max(lr, min_lr)
return lr
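# Hypothetical numeric sketch of the decay helpers above (illustrative
# values): halfway through decay, cosine annealing sits exactly halfway
# between initial_lr and min_lr, and _poly_decay with power=1.0 reduces to
# plain linear decay.
def _demo_annealing_helpers():
    mid = _cosine_annealing(initial_lr=1e-3, step=500, max_steps=1000,
                            min_lr=1e-5)
    # cos(pi * 0.5) == 0, so mid == (1e-3 - 1e-5) * 0.5 + 1e-5 == 5.05e-4
    lin = _poly_decay(1e-3, step=250, decay_steps=1000, power=1.0,
                      min_lr=0.0, cycle=False)
    # (1 - 0.25) * 1e-3 == 7.5e-4
    return mid, lin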
class SquareAnnealing(WarmupPolicy):
def __init__(self,
optimizer,
*,
max_steps,
min_lr=1e-5,
last_epoch=-1,
**kwargs):
super().__init__(optimizer=optimizer,
max_steps=max_steps,
last_epoch=last_epoch,
min_lr=min_lr,
**kwargs)
def _get_lr(self, step):
new_lrs = [
_square_annealing(
initial_lr=initial_lr,
step=step - self.warmup_steps,
max_steps=self.max_steps - self.warmup_steps,
min_lr=self.min_lr,
) for initial_lr in self.base_lrs
]
return new_lrs
class SquareRootAnnealing(WarmupPolicy):
def __init__(self,
optimizer,
*,
max_steps,
min_lr=0,
last_epoch=-1,
**kwargs):
super().__init__(optimizer=optimizer,
max_steps=max_steps,
last_epoch=last_epoch,
min_lr=min_lr,
**kwargs)
def _get_lr(self, step):
new_lrs = [
_squareroot_annealing(initial_lr=initial_lr,
step=step,
max_steps=self.max_steps,
min_lr=self.min_lr)
for initial_lr in self.base_lrs
]
return new_lrs
class CosineAnnealing(WarmupAnnealHoldPolicy):
def __init__(self,
optimizer,
*,
max_steps,
min_lr=0,
last_epoch=-1,
**kwargs):
super().__init__(optimizer=optimizer,
max_steps=max_steps,
last_epoch=last_epoch,
min_lr=min_lr,
**kwargs)
def _get_lr(self, step):
for initial_lr in self.base_lrs:
if initial_lr < self.min_lr:
raise ValueError(
f"{self} received an initial learning rate "
f"that was lower than the minimum learning rate.")
if self.constant_steps is None or self.constant_steps == 0:
new_lrs = [
_cosine_annealing(
initial_lr=initial_lr,
step=step - self.warmup_steps,
max_steps=self.max_steps - self.warmup_steps,
min_lr=self.min_lr,
) for initial_lr in self.base_lrs
]
else:
new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step)
return new_lrs
def _get_warmup_lr(self, step):
if self.constant_steps is None or self.constant_steps == 0:
return super()._get_warmup_lr(step)
else:
# Use linear warmup for the initial part.
return self._get_linear_warmup_with_cosine_annealing_lr(step)
def _get_constant_lr(self, step):
# Only called when `constant_steps` > 0.
return self._get_linear_warmup_with_cosine_annealing_lr(step)
def _get_linear_warmup_with_cosine_annealing_lr(self, step):
# Cosine Schedule for Megatron LM,
# slightly different warmup schedule + constant LR at the end.
new_lrs = [
_linear_warmup_with_cosine_annealing(
max_lr=self.base_lrs[0],
warmup_steps=self.warmup_steps,
step=step,
decay_steps=self.decay_steps,
min_lr=self.min_lr,
) for _ in self.base_lrs
]
return new_lrs
class NoamAnnealing(_LRScheduler):
def __init__(self,
optimizer,
*,
d_model,
warmup_steps=None,
warmup_ratio=None,
max_steps=None,
min_lr=0.0,
last_epoch=-1):
self._normalize = d_model**(-0.5)
assert not (warmup_steps is not None and warmup_ratio is not None), \
"Either use a particular number of steps or a ratio"
assert warmup_ratio is None or max_steps is not None, \
"If there is a ratio, there should be a total number of steps"
# It is necessary to assign all attributes *before* __init__,
# as the class is wrapped by an inner class.
self.max_steps = max_steps
if warmup_steps is not None:
self.warmup_steps = warmup_steps
elif warmup_ratio is not None:
self.warmup_steps = int(warmup_ratio * max_steps)
else:
self.warmup_steps = 0
self.min_lr = min_lr
super().__init__(optimizer, last_epoch)
def get_lr(self):
if not self._get_lr_called_within_step:
warnings.warn(
"To get the last learning rate computed "
"by the scheduler, please use `get_last_lr()`.",
UserWarning,
stacklevel=2)
step = max(1, self.last_epoch)
for initial_lr in self.base_lrs:
if initial_lr < self.min_lr:
raise ValueError(
f"{self} received an initial learning rate "
f"that was lower than the minimum learning rate.")
new_lrs = [
self._noam_annealing(initial_lr=initial_lr, step=step)
for initial_lr in self.base_lrs
]
return new_lrs
def _noam_annealing(self, initial_lr, step):
if self.warmup_steps > 0:
mult = self._normalize * min(step**(-0.5),
step * (self.warmup_steps**(-1.5)))
else:
mult = self._normalize * step**(-0.5)
out_lr = initial_lr * mult
if step > self.warmup_steps:
out_lr = max(out_lr, self.min_lr)
return out_lr
class NoamHoldAnnealing(WarmupHoldPolicy):
def __init__(self,
optimizer,
*,
max_steps,
decay_rate=0.5,
min_lr=0.0,
last_epoch=-1,
**kwargs):
"""
From NeMo:
Implementation of the Noam Hold Annealing policy
from the Squeezeformer paper.
Unlike NoamAnnealing, the peak learning rate
can be explicitly set for this scheduler.
The schedule first performs linear warmup,
then holds the peak LR, then decays with some schedule for
the remainder of the steps.
Therefore the min_lr is still dependent
on the hyperparameters selected.
Its schedule is determined by three factors:
Warmup Steps: Initial stage, where linear warmup
occurs until the peak LR is reached. Unlike NoamAnnealing,
the peak LR is explicitly stated here instead of a scaling factor.
Hold Steps: Intermediate stage, where the peak LR
is maintained for some number of steps. In this region,
the high peak LR allows the model to converge faster
if training is stable. However, the high LR
may also cause instability during training.
Should usually be a significant fraction of training
steps (around 30-40% of all training steps).
Decay Steps: Final stage, where the LR rapidly decays
with some scaling rate (set by decay rate).
To attain Noam decay, use 0.5,
for Squeezeformer recommended decay, use 1.0.
The fast decay after prolonged high LR during
hold phase allows for rapid convergence.
References:
- [Squeezeformer:
An Efficient Transformer for Automatic Speech Recognition]
(https://arxiv.org/abs/2206.00888)
Args:
optimizer: Pytorch compatible Optimizer object.
warmup_steps: Number of training steps in warmup stage
warmup_ratio: Ratio of warmup steps to total steps
hold_steps: Number of training steps to
hold the learning rate after warm up
hold_ratio: Ratio of hold steps to total steps
max_steps: Total number of steps while training or `None` for
infinite training
decay_rate: Float value describing the polynomial decay
after the hold period. Default value
of 0.5 corresponds to Noam decay.
min_lr: Minimum learning rate.
"""
self.decay_rate = decay_rate
super().__init__(optimizer=optimizer,
max_steps=max_steps,
last_epoch=last_epoch,
min_lr=min_lr,
**kwargs)
def _get_lr(self, step):
if self.warmup_steps is None or self.warmup_steps == 0:
raise ValueError(
"Noam scheduler cannot be used without warmup steps")
if self.hold_steps > 0:
hold_steps = self.hold_steps - self.warmup_steps
else:
hold_steps = 0
new_lrs = [
_noam_hold_annealing(
initial_lr,
step=step,
warmup_steps=self.warmup_steps,
hold_steps=hold_steps,
decay_rate=self.decay_rate,
min_lr=self.min_lr,
) for initial_lr in self.base_lrs
]
return new_lrs
def set_step(self, step: int):
self.last_epoch = step
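# Hypothetical usage sketch (illustrative values): 10% of training as linear
# warmup to the peak lr, hold at the peak for the next 30%, then Noam-style
# decay (decay_rate=0.5) toward min_lr for the remaining steps.
def _demo_noam_hold_annealing():
    model = torch.nn.Linear(4, 4)
    opt = torch.optim.AdamW(model.parameters(), lr=2e-3)
    sched = NoamHoldAnnealing(opt, max_steps=10000, warmup_ratio=0.1,
                              hold_ratio=0.3, decay_rate=0.5, min_lr=1e-5)
    for _ in range(10000):
        opt.step()
        sched.step()
    return sched.get_last_lr()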
class ConstantLR(_LRScheduler):
"""The ConstantLR scheduler
This scheduler keeps a constant lr
"""
def __init__(
self,
optimizer: torch.optim.Optimizer,
):
super().__init__(optimizer)
def get_lr(self):
return self.base_lrs
def set_step(self, step: int):
self.last_epoch = step
import glob
import json
import os
import random
import sys
import time
import warnings
import matplotlib
import numpy as np
import torch
import yaml
from torch import distributed as dist
from torch.nn.utils import weight_norm
from torch.utils.tensorboard import SummaryWriter
matplotlib.use("Agg")
import matplotlib.pylab as plt
import re
import pathlib
def seed_everything(seed, cudnn_deterministic=False):
"""
Function that sets seed for pseudo-random number generators in:
pytorch, numpy, python.random
Args:
seed: the integer value seed for global random state
"""
if seed is not None:
# print(f"Global seed set to {seed}")
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# if cudnn_deterministic:
# torch.backends.cudnn.deterministic = True
# warnings.warn('You have chosen to seed training. '
# 'This will turn on the CUDNN deterministic setting, '
# 'which can slow down your training considerably! '
# 'You may see unexpected behavior when restarting '
# 'from checkpoints.')
def is_primary():
return get_rank() == 0
def get_rank():
if not dist.is_available():
return 0
if not dist.is_initialized():
return 0
return dist.get_rank()
def load_yaml_config(path):
with open(path) as f:
config = yaml.full_load(f)
return config
def save_config_to_yaml(config, path):
assert path.endswith('.yaml')
with open(path, 'w') as f:
f.write(yaml.dump(config))
f.close()
def save_dict_to_json(d, path, indent=None):
json.dump(d, open(path, 'w'), indent=indent)
def load_dict_from_json(path):
return json.load(open(path, 'r'))
def write_args(args, path):
args_dict = dict((name, getattr(args, name)) for name in dir(args)
if not name.startswith('_'))
with open(path, 'a') as args_file:
args_file.write('==> torch version: {}\n'.format(torch.__version__))
args_file.write(
'==> cudnn version: {}\n'.format(torch.backends.cudnn.version()))
args_file.write('==> Cmd:\n')
args_file.write(str(sys.argv))
args_file.write('\n==> args:\n')
for k, v in sorted(args_dict.items()):
args_file.write(' %s: %s\n' % (str(k), str(v)))
args_file.close()
class Logger(object):
def __init__(self, args):
self.args = args
self.save_dir = args.save_dir
self.is_primary = is_primary()
if self.is_primary:
os.makedirs(self.save_dir, exist_ok=True)
# save the args and config
self.config_dir = os.path.join(self.save_dir, 'configs')
os.makedirs(self.config_dir, exist_ok=True)
file_name = os.path.join(self.config_dir, 'args.txt')
write_args(args, file_name)
log_dir = os.path.join(self.save_dir, 'logs')
if not os.path.exists(log_dir):
os.makedirs(log_dir, exist_ok=True)
self.text_writer = open(os.path.join(log_dir, 'log.txt'), 'a')
if args.tensorboard:
self.log_info('using tensorboard')
self.tb_writer = SummaryWriter(log_dir=log_dir)
else:
self.tb_writer = None
def save_config(self, config):
if self.is_primary:
save_config_to_yaml(config,
os.path.join(self.config_dir, 'config.yaml'))
def log_info(self, info, check_primary=True):
if self.is_primary or (not check_primary):
print(info)
if self.is_primary:
info = str(info)
time_str = time.strftime('%Y-%m-%d-%H-%M')
info = '{}: {}'.format(time_str, info)
if not info.endswith('\n'):
info += '\n'
self.text_writer.write(info)
self.text_writer.flush()
def add_scalar(self, **kargs):
"""Log a scalar variable."""
if self.is_primary:
if self.tb_writer is not None:
self.tb_writer.add_scalar(**kargs)
def add_scalars(self, **kargs):
"""Log a scalar variable."""
if self.is_primary:
if self.tb_writer is not None:
self.tb_writer.add_scalars(**kargs)
def add_image(self, **kargs):
"""Log a scalar variable."""
if self.is_primary:
if self.tb_writer is not None:
self.tb_writer.add_image(**kargs)
def add_images(self, **kargs):
"""Log a scalar variable."""
if self.is_primary:
if self.tb_writer is not None:
self.tb_writer.add_images(**kargs)
def close(self):
if self.is_primary:
self.text_writer.close()
self.tb_writer.close()
def plot_spectrogram(spectrogram):
fig, ax = plt.subplots(figsize=(10, 2))
im = ax.imshow(
spectrogram, aspect="auto", origin="lower", interpolation='none')
plt.colorbar(im, ax=ax)
fig.canvas.draw()
plt.close()
return fig
def init_weights(m, mean=0.0, std=0.01):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
m.weight.data.normal_(mean, std)
def apply_weight_norm(m):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
weight_norm(m)
def get_padding(kernel_size, dilation=1):
return int((kernel_size * dilation - dilation) / 2)
def load_checkpoint(filepath, device):
assert os.path.isfile(filepath)
print("Loading '{}'".format(filepath))
checkpoint_dict = torch.load(filepath, map_location=device)
print("Complete.")
return checkpoint_dict
def save_checkpoint(filepath, obj, num_ckpt_keep=5):
name = re.match(r'(do|g)_\d+', pathlib.Path(filepath).name).group(1)
ckpts = sorted(pathlib.Path(filepath).parent.glob(f'{name}_*'))
if len(ckpts) > num_ckpt_keep:
[os.remove(c) for c in ckpts[:-num_ckpt_keep]]
print("Saving checkpoint to {}".format(filepath))
torch.save(obj, filepath)
print("Complete.")
def scan_checkpoint(cp_dir, prefix):
pattern = os.path.join(cp_dir, prefix + '????????')
cp_list = glob.glob(pattern)
if len(cp_list) == 0:
return None
return sorted(cp_list)[-1]
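# Hypothetical usage sketch (illustrative paths): checkpoints follow the
# naming convention '<prefix>_<8-digit step>' (e.g. 'g_00000042'), which is
# what the regex in save_checkpoint and the '????????' glob in
# scan_checkpoint both assume.
def _demo_checkpoint_rotation(cp_dir='exp/ckpts', step=42):
    os.makedirs(cp_dir, exist_ok=True)
    g_path = os.path.join(cp_dir, 'g_{:08d}'.format(step))
    save_checkpoint(g_path, {'step': step}, num_ckpt_keep=5)
    return scan_checkpoint(cp_dir, 'g_')  # latest generator checkpoint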
# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
# 2023 Horizon Inc. (authors: Xingchen Song)
# 2024 Alibaba Inc
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from contextlib import nullcontext
import logging
import os
import torch
import json
import re
import datetime
import yaml
import deepspeed
import torch.optim as optim
import torch.distributed as dist
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
from torch.nn.utils import clip_grad_norm_
from deepspeed.runtime.zero.stage_1_and_2 import estimate_zero2_model_states_mem_needs_all_live
from inspiremusic.dataset.dataset import Dataset
from inspiremusic.utils.scheduler import WarmupLR, NoamHoldAnnealing, ConstantLR
def init_distributed(args):
world_size = int(os.environ.get('WORLD_SIZE', 1))
local_rank = int(os.environ.get('LOCAL_RANK', 0))
rank = int(os.environ.get('RANK', 0))
logging.info('training on multiple gpus, this gpu {}'.format(local_rank) +
', rank {}, world_size {}'.format(rank, world_size))
if args.train_engine == 'torch_ddp':
torch.cuda.set_device(local_rank)
dist.init_process_group(args.dist_backend)
else:
deepspeed.init_distributed(dist_backend=args.dist_backend)
return world_size, local_rank, rank
def init_dataset_and_dataloader(args, configs):
gan = False
data_pipeline = configs['data_pipeline_gan'] if gan else configs['data_pipeline']
train_dataset = Dataset(args.train_data, data_pipeline=data_pipeline, mode='train', shuffle=True, partition=True)
cv_dataset = Dataset(args.cv_data, data_pipeline=data_pipeline, mode='train', shuffle=False, partition=False)
# do not use persistent_workers=True, as whisper tokenizer opens tiktoken file each time when the for loop starts
train_data_loader = DataLoader(train_dataset,
batch_size=None,
pin_memory=args.pin_memory,
num_workers=args.num_workers,
prefetch_factor=args.prefetch,
timeout=60)
cv_data_loader = DataLoader(cv_dataset,
batch_size=None,
pin_memory=args.pin_memory,
num_workers=args.num_workers,
prefetch_factor=args.prefetch,
timeout=60)
return train_dataset, cv_dataset, train_data_loader, cv_data_loader
def check_modify_and_save_config(args, configs):
if args.train_engine == "torch_ddp":
configs['train_conf']["dtype"] = 'fp32'
else:
with open(args.deepspeed_config, 'r') as fin:
ds_configs = json.load(fin)
if "fp16" in ds_configs and ds_configs["fp16"]["enabled"]:
configs['train_conf']["dtype"] = "fp16"
elif "bf16" in ds_configs and ds_configs["bf16"]["enabled"]:
configs['train_conf']["dtype"] = "bf16"
else:
configs['train_conf']["dtype"] = "fp32"
assert ds_configs["train_micro_batch_size_per_gpu"] == 1
# if use deepspeed, override ddp config
configs['train_conf']['save_per_step'] = int(configs['train_conf']['save_per_step'] *
configs['train_conf']['accum_grad'] / ds_configs["gradient_accumulation_steps"])
configs['train_conf']['accum_grad'] = ds_configs["gradient_accumulation_steps"]
configs['train_conf']['grad_clip'] = ds_configs["gradient_clipping"]
configs['train_conf']['log_interval'] = ds_configs["steps_per_print"]
return configs
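# Illustrative sketch (values are hypothetical) of the subset of the
# DeepSpeed JSON keys that check_modify_and_save_config reads and maps onto
# the ddp-style train_conf settings above:
_EXAMPLE_DS_CONFIG = {
    "train_micro_batch_size_per_gpu": 1,
    "gradient_accumulation_steps": 4,
    "gradient_clipping": 5.0,
    "steps_per_print": 100,
    "bf16": {"enabled": True},
}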
def wrap_cuda_model(args, model):
local_world_size = int(os.environ.get('LOCAL_WORLD_SIZE', 1))
world_size = int(os.environ.get('WORLD_SIZE', 1))
if args.train_engine == "torch_ddp": # native pytorch ddp
assert (torch.cuda.is_available())
model.cuda()
model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True)
else:
if int(os.environ.get('RANK', 0)) == 0:
logging.info("Estimating model states memory needs (zero2)...")
estimate_zero2_model_states_mem_needs_all_live(
model,
num_gpus_per_node=local_world_size,
num_nodes=world_size // local_world_size)
return model
def init_optimizer_and_scheduler(args, configs, model):
if configs['train_conf']['optim'] == 'adam':
optimizer = optim.Adam(model.parameters(), **configs['train_conf']['optim_conf'])
elif configs['train_conf']['optim'] == 'adamw':
optimizer = optim.AdamW(model.parameters(), **configs['train_conf']['optim_conf'])
else:
raise ValueError("unknown optimizer: " + configs['train_conf'])
if configs['train_conf']['scheduler'] == 'warmuplr':
scheduler_type = WarmupLR
scheduler = WarmupLR(optimizer, **configs['train_conf']['scheduler_conf'])
elif configs['train_conf']['scheduler'] == 'NoamHoldAnnealing':
scheduler_type = NoamHoldAnnealing
scheduler = NoamHoldAnnealing(optimizer, **configs['train_conf']['scheduler_conf'])
elif configs['train_conf']['scheduler'] == 'constantlr':
scheduler_type = ConstantLR
scheduler = ConstantLR(optimizer)
else:
raise ValueError("unknown scheduler: " + configs['train_conf'])
# with deepspeed, pass a scheduler factory so that deepspeed.initialize
# can construct the scheduler around its own optimizer
if args.train_engine == "deepspeed":
def scheduler(opt):
return scheduler_type(opt, **configs['train_conf']['scheduler_conf'])
model, optimizer, _, scheduler = deepspeed.initialize(
args=args,
model=model,
optimizer=None,
lr_scheduler=scheduler,
model_parameters=model.parameters())
return model, optimizer, scheduler
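# Illustrative sketch of the `configs['train_conf']` layout this function
# expects (hypothetical values, not from the original recipes):
_EXAMPLE_TRAIN_CONF = {
    'optim': 'adamw',
    'optim_conf': {'lr': 1e-4},
    'scheduler': 'warmuplr',
    'scheduler_conf': {'warmup_steps': 25000},
    'accum_grad': 2,
    'grad_clip': 5.0,
    'log_interval': 100,
    'save_per_step': 1000,
}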
def init_summarywriter(args):
writer = None
if int(os.environ.get('RANK', 0)) == 0:
os.makedirs(args.model_dir, exist_ok=True)
writer = SummaryWriter(args.tensorboard_dir)
return writer
def save_model(model, model_name, info_dict):
rank = int(os.environ.get('RANK', 0))
model_dir = info_dict["model_dir"]
save_model_path = os.path.join(model_dir, '{}.pt'.format(model_name))
if info_dict["train_engine"] == "torch_ddp":
if rank == 0:
torch.save(model.module.state_dict(), save_model_path)
else:
with torch.no_grad():
model.save_checkpoint(save_dir=model_dir,
tag=model_name,
client_state=info_dict)
if rank == 0:
info_path = re.sub('.pt$', '.yaml', save_model_path)
info_dict['save_time'] = datetime.datetime.now().strftime('%d/%m/%Y %H:%M:%S')
with open(info_path, 'w') as fout:
data = yaml.dump(info_dict)
fout.write(data)
logging.info('[Rank {}] Checkpoint: save to checkpoint {}'.format(rank, save_model_path))
def inspiremusic_join(group_join, info_dict):
world_size = int(os.environ.get('WORLD_SIZE', 1))
local_rank = int(os.environ.get('LOCAL_RANK', 0))
rank = int(os.environ.get('RANK', 0))
if info_dict["batch_idx"] != 0:
# try to join all ranks in both ddp and deepspeed modes, in case different ranks see uneven workloads
try:
dist.monitored_barrier(group=group_join,
timeout=group_join.options._timeout)
return False
except RuntimeError as e:
logging.info("Detected uneven workload distribution: {}\n".format(e) +
"Break current worker to manually join all workers, " +
"world_size {}, current rank {}, current local_rank {}\n".
format(world_size, rank, local_rank))
return True
else:
return False
def batch_forward(model, batch, info_dict, scaler):
device = int(os.environ.get('LOCAL_RANK', 0))
dtype = info_dict["dtype"]
if dtype == "fp16":
dtype = torch.float16
elif dtype == "bf16":
dtype = torch.bfloat16
else: # fp32
dtype = torch.float32
if info_dict['train_engine'] == 'torch_ddp':
autocast = torch.cuda.amp.autocast(enabled=scaler is not None)
else:
autocast = torch.cuda.amp.autocast(enabled=True, dtype=dtype, cache_enabled=False)
with autocast:
info_dict['loss_dict'] = model(batch, device)
return info_dict
def batch_backward(model, info_dict, scaler):
if info_dict["train_engine"] == "deepspeed":
scaled_loss = model.backward(info_dict['loss_dict']['loss'])
else:
scaled_loss = info_dict['loss_dict']['loss'] / info_dict['accum_grad']
if scaler is not None:
scaler.scale(scaled_loss).backward()
else:
scaled_loss.backward()
info_dict['loss_dict']['loss'] = scaled_loss
return info_dict
def update_parameter_and_lr(model, optimizer, scheduler, info_dict, scaler=None):
grad_norm = 0.0
if info_dict['train_engine'] == "deepspeed":
info_dict["is_gradient_accumulation_boundary"] = model.is_gradient_accumulation_boundary()
model.step()
grad_norm = model.get_global_grad_norm()
elif (info_dict['batch_idx'] + 1) % info_dict["accum_grad"] == 0:
if scaler is not None:
scaler.unscale_(optimizer) # Unscale gradients before clipping
grad_norm = clip_grad_norm_(model.parameters(), info_dict['grad_clip'])
scaler.step(optimizer)
scaler.update()
else:
grad_norm = clip_grad_norm_(model.parameters(), info_dict['grad_clip'])
if torch.isfinite(grad_norm):
optimizer.step()
optimizer.zero_grad()
scheduler.step()
info_dict["lr"] = optimizer.param_groups[0]['lr']
info_dict["grad_norm"] = grad_norm
return info_dict
def log_per_step(writer, info_dict):
tag = info_dict["tag"]
epoch = info_dict.get('epoch', 0)
step = info_dict["step"]
batch_idx = info_dict["batch_idx"]
loss_dict = info_dict['loss_dict']
rank = int(os.environ.get('RANK', 0))
# only rank 0 writes to tensorboard to avoid multi-process writes
if writer is not None:
if (info_dict['train_engine'] == 'deepspeed' and info_dict['is_gradient_accumulation_boundary'] is True) or \
(info_dict['train_engine'] == 'torch_ddp' and (info_dict['batch_idx'] + 1) % info_dict['accum_grad'] == 0):
for k in ['epoch', 'lr', 'grad_norm']:
writer.add_scalar('{}/{}'.format(tag, k), info_dict[k], step + 1)
for k, v in loss_dict.items():
writer.add_scalar('{}/{}'.format(tag, k), v, step + 1)
# TRAIN & CV, Shell log (stdout)
if (info_dict['batch_idx'] + 1) % info_dict['log_interval'] == 0:
log_str = '{} Batch {}/{} '.format(tag, epoch, batch_idx + 1)
for name, value in loss_dict.items():
log_str += '{} {:.6f} '.format(name, value.item())
if tag == "TRAIN":
log_str += 'lr {:.8f} grad_norm {:.6f}'.format(
info_dict["lr"], info_dict['grad_norm'])
log_str += ' rank {}'.format(rank)
logging.debug(log_str)
def log_per_save(writer, info_dict):
tag = info_dict["tag"]
epoch = info_dict["epoch"]
step = info_dict["step"]
loss_dict = info_dict["loss_dict"]
lr = info_dict['lr']
rank = int(os.environ.get('RANK', 0))
logging.info(
'Epoch {} Step {} CV info lr {} {} rank {}'.format(
epoch, step + 1, lr, ' '.join(['{}_{}'.format(k, v) for k, v in loss_dict.items()]), rank))
if writer is not None:
for k in ['epoch', 'lr']:
writer.add_scalar('{}/{}'.format(tag, k), info_dict[k], step + 1)
for k, v in loss_dict.items():
writer.add_scalar('{}/{}'.format(tag, k), v, step + 1)
import os
import sys
def align_trans_scp_file(trans, scp):
trans_dict = {}
with open(trans, 'r') as f:
for line in f:
sec = line.strip().split("\t")
trans_dict[sec[0]] = sec[1]
scp_dict = {}
with open(scp, 'r') as f:
for line in f:
sec = line.strip().split(" ")
scp_dict[sec[0]] = sec[1]
with open("text", "w") as f:
for k, v in scp_dict.items():
f.write("%s\t%s\n"%(k,trans_dict[k]))
if __name__ == '__main__':
trans = sys.argv[1]
scp = sys.argv[2]
align_trans_scp_file(trans, scp)
v0.1
from dataclasses import dataclass
import numpy as np
import torch
import torchaudio
from pytorch_lightning import LightningDataModule
from torch.utils.data import Dataset, DataLoader
import soundfile
# import librosa
import random
torch.set_num_threads(1)
@dataclass
class DataConfig:
filelist_path: str
sampling_rate: int
num_samples: int
batch_size: int
num_workers: int
def collate_fn(batch):
batch = [item for item in batch if item is not None]
return torch.stack(batch, dim=0)
class VocosDataModule(LightningDataModule):
def __init__(self, train_params: DataConfig, val_params: DataConfig):
super().__init__()
self.train_config = train_params
self.val_config = val_params
def _get_dataloder(self, cfg: DataConfig, train: bool):
dataset = VocosDataset(cfg, train=train)
dataloader = DataLoader(
dataset, batch_size=cfg.batch_size, num_workers=cfg.num_workers, shuffle=train, pin_memory=True, collate_fn=collate_fn
)
return dataloader
def train_dataloader(self) -> DataLoader:
return self._get_dataloder(self.train_config, train=True)
def val_dataloader(self) -> DataLoader:
return self._get_dataloder(self.val_config, train=False)
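# Hypothetical usage sketch (illustrative file paths): each DataConfig points
# at a plain-text filelist with one audio path per line, and the loaders then
# yield batches of shape (batch_size, num_samples).
def _demo_datamodule():
    train_cfg = DataConfig(filelist_path="filelists/train.txt",
                           sampling_rate=24000, num_samples=24000,
                           batch_size=16, num_workers=4)
    val_cfg = DataConfig(filelist_path="filelists/val.txt",
                         sampling_rate=24000, num_samples=24000,
                         batch_size=16, num_workers=4)
    return VocosDataModule(train_params=train_cfg, val_params=val_cfg)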
class VocosDataset(Dataset):
def __init__(self, cfg: DataConfig, train: bool):
with open(cfg.filelist_path) as f:
self.filelist = f.read().splitlines()
self.sampling_rate = cfg.sampling_rate
self.num_samples = cfg.num_samples
self.train = train
def __len__(self) -> int:
return len(self.filelist)
def __getitem__(self, index: int) -> torch.Tensor:
audio_path = self.filelist[index]
try:
y1, sr = soundfile.read(audio_path)
# y1, sr = librosa.load(audio_path,sr=None)
y = torch.tensor(y1).float().unsqueeze(0)
# if y.size(0) > 1:
# # mix to mono
# y = y.mean(dim=0, keepdim=True)
if y.ndim > 2:
# multi-channel file: keep one randomly chosen channel instead of mixing down to mono
random_channel = random.randint(0, y.size(-1) - 1)
y = y[:, :, random_channel]
gain = np.random.uniform(-1, -6) if self.train else -3
y, _ = torchaudio.sox_effects.apply_effects_tensor(y, sr, [["norm", f"{gain:.2f}"]])
if sr != self.sampling_rate:
y = torchaudio.functional.resample(y, orig_freq=sr, new_freq=self.sampling_rate)
if y.size(-1) < self.num_samples:
pad_length = self.num_samples - y.size(-1)
padding_tensor = y.repeat(1, 1 + pad_length // y.size(-1))
y = torch.cat((y, padding_tensor[:, :pad_length]), dim=1)
elif self.train:
start = np.random.randint(low=0, high=y.size(-1) - self.num_samples + 1)
y = y[:, start : start + self.num_samples]
else:
# During validation, take always the first segment for determinism
y = y[:, : self.num_samples]
return y[0]
except Exception as e:
print(f"Error processing file {audio_path} at index {index}: {e}")
# Either re-raise the exception here, or return None to mark the sample as invalid
return None
# def __getitem__(self, index: int) -> torch.Tensor:
# audio_path = self.filelist[index]
# try:
# y, sr = torchaudio.load(audio_path)
# if y.size(0) > 1:
# # randomly pick one channel
# random_channel = random.randint(0, y.size(0) - 1)
# y = y[random_channel, :].unsqueeze(0)  # keep the return shape (1, T)
# # gain = np.random.uniform(-1, -6) if self.train else -3
# # y, _ = torchaudio.sox_effects.apply_effects_tensor(y, sr, [["norm", f"{gain:.2f}"]])
# if sr != self.sampling_rate:
# y = torchaudio.functional.resample(y, orig_freq=sr, new_freq=self.sampling_rate)
# if y.size(-1) < self.num_samples:
# pad_length = self.num_samples - y.size(-1)
# padding_tensor = y.repeat(1, 1 + pad_length // y.size(-1))
# y = torch.cat((y, padding_tensor[:, :pad_length]), dim=1)
# elif self.train:
# start = np.random.randint(low=0, high=y.size(-1) - self.num_samples + 1)
# y = y[:, start: start + self.num_samples]
# else:
# # During validation, take always the first segment for determinism
# y = y[:, :self.num_samples]
# return y[0]
# except Exception as e:
# print(f"Error processing file {audio_path} at index {index}: {e}")
# # Either re-raise the exception here, or return None to mark the sample as invalid
# return None
import torch
import torch.nn as nn
import torch.nn.functional as F
# from audiotools import AudioSignal
# from audiotools import ml
# from audiotools import STFTParams
from einops import rearrange
from torch.nn.utils import weight_norm
from collections import namedtuple
STFTParams = namedtuple(
"STFTParams",
["window_length", "hop_length", "window_type", "match_stride", "padding_type"],
)
STFTParams.__new__.__defaults__ = (None, None, None, None, None)
def WNConv1d(*args, **kwargs):
act = kwargs.pop("act", True)
conv = weight_norm(nn.Conv1d(*args, **kwargs))
if not act:
return conv
return nn.Sequential(conv, nn.LeakyReLU(0.1))
def WNConv2d(*args, **kwargs):
act = kwargs.pop("act", True)
conv = weight_norm(nn.Conv2d(*args, **kwargs))
if not act:
return conv
return nn.Sequential(conv, nn.LeakyReLU(0.1))
class MPD(nn.Module):
def __init__(self, period):
super().__init__()
self.period = period
self.convs = nn.ModuleList(
[
WNConv2d(1, 32, (5, 1), (3, 1), padding=(2, 0)),
WNConv2d(32, 128, (5, 1), (3, 1), padding=(2, 0)),
WNConv2d(128, 512, (5, 1), (3, 1), padding=(2, 0)),
WNConv2d(512, 1024, (5, 1), (3, 1), padding=(2, 0)),
WNConv2d(1024, 1024, (5, 1), 1, padding=(2, 0)),
]
)
self.conv_post = WNConv2d(
1024, 1, kernel_size=(3, 1), padding=(1, 0), act=False
)
def pad_to_period(self, x):
t = x.shape[-1]
x = F.pad(x, (0, self.period - t % self.period), mode="reflect")
return x
def forward(self, x):
fmap = []
x = self.pad_to_period(x)
x = rearrange(x, "b c (l p) -> b c l p", p=self.period)
for layer in self.convs:
x = layer(x)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
return fmap
class MSD(nn.Module):
def __init__(self, rate: int = 1, sample_rate: int = 48000):
super().__init__()
self.convs = nn.ModuleList(
[
WNConv1d(1, 16, 15, 1, padding=7),
WNConv1d(16, 64, 41, 4, groups=4, padding=20),
WNConv1d(64, 256, 41, 4, groups=16, padding=20),
WNConv1d(256, 1024, 41, 4, groups=64, padding=20),
WNConv1d(1024, 1024, 41, 4, groups=256, padding=20),
WNConv1d(1024, 1024, 5, 1, padding=2),
]
)
self.conv_post = WNConv1d(1024, 1, 3, 1, padding=1, act=False)
self.sample_rate = sample_rate
self.rate = rate
def forward(self, x):
# x = AudioSignal(x, self.sample_rate)
# x.resample(self.sample_rate // self.rate)
# x = x.audio_data
fmap = []
for l in self.convs:
x = l(x)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
return fmap
BANDS = [(0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)]
class MRD(nn.Module):
def __init__(
self,
window_length: int,
hop_factor: float = 0.25,
sample_rate: int = 24000,
bands: list = BANDS,
):
"""Complex multi-band spectrogram discriminator.
Parameters
----------
window_length : int
Window length of STFT.
hop_factor : float, optional
Hop factor of the STFT (hop_length = int(window_length * hop_factor)), by default 0.25
sample_rate : int, optional
Sampling rate of audio in Hz, by default 24000
bands : list, optional
Bands to run discriminator over.
"""
super().__init__()
self.window_length = window_length
self.hop_factor = hop_factor
self.sample_rate = sample_rate
self.stft_params = STFTParams(
window_length=window_length,
hop_length=int(window_length * hop_factor),
match_stride=True,
)
n_fft = window_length // 2 + 1
bands = [(int(b[0] * n_fft), int(b[1] * n_fft)) for b in bands]
self.bands = bands
self.n_fft = window_length
ch = 32
convs = lambda: nn.ModuleList(
[
WNConv2d(2, ch, (3, 9), (1, 1), padding=(1, 4)),
WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)),
WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)),
WNConv2d(ch, ch, (3, 9), (1, 2), padding=(1, 4)),
WNConv2d(ch, ch, (3, 3), (1, 1), padding=(1, 1)),
]
)
self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))])
self.conv_post = WNConv2d(ch, 1, (3, 3), (1, 1), padding=(1, 1), act=False)
def spectrogram(self, x):
# original audiotools implementation:
# x = AudioSignal(x, self.sample_rate, stft_params=self.stft_params)
# x = torch.view_as_real(x.stft())
if x.size(0) == 1:
x = torch.view_as_real(x.squeeze(0).stft(n_fft=self.n_fft, return_complex=True).unsqueeze(0))
else:
x = torch.view_as_real(x.squeeze(1).stft(n_fft=self.n_fft, return_complex=True).unsqueeze(1))
x = rearrange(x, "b 1 f t c -> (b 1) c t f")
# Split into bands
x_bands = [x[..., b[0] : b[1]] for b in self.bands]
return x_bands
def forward(self, x):
x_bands = self.spectrogram(x)
fmap = []
x = []
for band, stack in zip(x_bands, self.band_convs):
for layer in stack:
band = layer(band)
fmap.append(band)
x.append(band)
x = torch.cat(x, dim=-1)
x = self.conv_post(x)
fmap.append(x)
return fmap
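# Hypothetical shape sketch (illustrative input): with window_length=1024 the
# STFT has window_length // 2 + 1 == 513 frequency bins, and BANDS slices
# them into five sub-bands that are convolved independently before conv_post.
def _demo_mrd_bands():
    mrd = MRD(window_length=1024)
    x = torch.zeros(1, 1, 24000)
    bands = mrd.spectrogram(x)
    return [b.shape for b in bands]  # five tensors of shape (1, 2, T, width)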
# class DACDiscriminator(ml.BaseModel):
class DACDiscriminator(nn.Module):
def __init__(
self,
rates: list = [],
periods: list = [2, 3, 5, 7, 11],
fft_sizes: list = [2048, 1024, 512],
sample_rate: int = 24000,
bands: list = BANDS,
):
"""Discriminator that combines multiple discriminators.
Parameters
----------
rates : list, optional
sampling rates (in Hz) to run MSD at, by default []
If empty, MSD is not used.
periods : list, optional
periods (of samples) to run MPD at, by default [2, 3, 5, 7, 11]
fft_sizes : list, optional
Window sizes of the FFT to run MRD at, by default [2048, 1024, 512]
sample_rate : int, optional
Sampling rate of audio in Hz, by default 24000
bands : list, optional
Bands to run MRD at, by default `BANDS`
"""
super().__init__()
discs = []
discs += [MPD(p) for p in periods]
discs += [MSD(r, sample_rate=sample_rate) for r in rates]
discs += [MRD(f, sample_rate=sample_rate, bands=bands) for f in fft_sizes]
self.discriminators = nn.ModuleList(discs)
def preprocess(self, y):
# Remove DC offset
y = y - y.mean(dim=-1, keepdims=True)
# Peak normalize the volume of input audio
y = 0.8 * y / (y.abs().max(dim=-1, keepdim=True)[0] + 1e-9)
return y
def forward(self, x):
x = self.preprocess(x)
fmaps = [d(x) for d in self.discriminators]
return fmaps
if __name__ == "__main__":
disc = DACDiscriminator()
x = torch.zeros(1, 1, 24000)
results = disc(x)
for i, result in enumerate(results):
print(f"disc{i}")
for i, r in enumerate(result):
print(r.shape, r.mean(), r.min(), r.max())
print("00")
from typing import Tuple, List
import torch
from torch import nn
from torch.nn import Conv2d
from torch.nn.utils import weight_norm
class MultiPeriodDiscriminator(nn.Module):
"""
Multi-Period Discriminator module adapted from https://github.com/jik876/hifi-gan.
Additionally, it allows incorporating conditional information with a learned embeddings table.
Args:
periods (tuple[int]): Tuple of periods for each discriminator.
num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
Defaults to None.
"""
def __init__(self, periods: Tuple[int] = (2, 3, 5, 7, 11), num_embeddings: int = None):
super().__init__()
self.discriminators = nn.ModuleList([DiscriminatorP(period=p, num_embeddings=num_embeddings) for p in periods])
def forward(
self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for d in self.discriminators:
y_d_r, fmap_r = d(x=y, cond_embedding_id=bandwidth_id)
y_d_g, fmap_g = d(x=y_hat, cond_embedding_id=bandwidth_id)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class DiscriminatorP(nn.Module):
def __init__(
self,
period: int,
in_channels: int = 1,
kernel_size: int = 5,
stride: int = 3,
lrelu_slope: float = 0.1,
num_embeddings: int = None,
):
super().__init__()
self.period = period
self.convs = nn.ModuleList(
[
weight_norm(Conv2d(in_channels, 32, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
weight_norm(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
weight_norm(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
weight_norm(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(kernel_size // 2, 0))),
weight_norm(Conv2d(1024, 1024, (kernel_size, 1), (1, 1), padding=(kernel_size // 2, 0))),
]
)
if num_embeddings is not None:
self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=1024)
torch.nn.init.zeros_(self.emb.weight)
self.conv_post = weight_norm(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
self.lrelu_slope = lrelu_slope
def forward(
self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
x = x.unsqueeze(1)
fmap = []
# 1d to 2d
b, c, t = x.shape
if t % self.period != 0: # pad first
n_pad = self.period - (t % self.period)
x = torch.nn.functional.pad(x, (0, n_pad), "reflect")
t = t + n_pad
x = x.view(b, c, t // self.period, self.period)
for i, l in enumerate(self.convs):
x = l(x)
x = torch.nn.functional.leaky_relu(x, self.lrelu_slope)
if i > 0:
fmap.append(x)
if cond_embedding_id is not None:
emb = self.emb(cond_embedding_id)
h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)
else:
h = 0
x = self.conv_post(x)
fmap.append(x)
x += h
x = torch.flatten(x, 1, -1)
return x, fmap
class MultiResolutionDiscriminator(nn.Module):
def __init__(
self,
resolutions: Tuple[Tuple[int, int, int]] = ((1024, 256, 1024), (2048, 512, 2048), (512, 128, 512)),
num_embeddings: int = None,
):
"""
Multi-Resolution Discriminator module adapted from https://github.com/mindslab-ai/univnet.
Additionally, it allows incorporating conditional information with a learned embeddings table.
Args:
resolutions (tuple[tuple[int, int, int]]): Tuple of resolutions for each discriminator.
Each resolution should be a tuple of (n_fft, hop_length, win_length).
num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
Defaults to None.
"""
super().__init__()
self.discriminators = nn.ModuleList(
[DiscriminatorR(resolution=r, num_embeddings=num_embeddings) for r in resolutions]
)
def forward(
self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for d in self.discriminators:
y_d_r, fmap_r = d(x=y, cond_embedding_id=bandwidth_id)
y_d_g, fmap_g = d(x=y_hat, cond_embedding_id=bandwidth_id)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class DiscriminatorR(nn.Module):
def __init__(
self,
resolution: Tuple[int, int, int],
channels: int = 64,
in_channels: int = 1,
num_embeddings: int = None,
lrelu_slope: float = 0.1,
):
super().__init__()
self.resolution = resolution
self.in_channels = in_channels
self.lrelu_slope = lrelu_slope
self.convs = nn.ModuleList(
[
weight_norm(nn.Conv2d(in_channels, channels, kernel_size=(7, 5), stride=(2, 2), padding=(3, 2))),
weight_norm(nn.Conv2d(channels, channels, kernel_size=(5, 3), stride=(2, 1), padding=(2, 1))),
weight_norm(nn.Conv2d(channels, channels, kernel_size=(5, 3), stride=(2, 2), padding=(2, 1))),
weight_norm(nn.Conv2d(channels, channels, kernel_size=3, stride=(2, 1), padding=1)),
weight_norm(nn.Conv2d(channels, channels, kernel_size=3, stride=(2, 2), padding=1)),
]
)
if num_embeddings is not None:
self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=channels)
torch.nn.init.zeros_(self.emb.weight)
self.conv_post = weight_norm(nn.Conv2d(channels, 1, (3, 3), padding=(1, 1)))
def forward(
self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None
) -> Tuple[torch.Tensor, List[torch.Tensor]]:
fmap = []
x = self.spectrogram(x)
x = x.unsqueeze(1)
for l in self.convs:
x = l(x)
x = torch.nn.functional.leaky_relu(x, self.lrelu_slope)
fmap.append(x)
if cond_embedding_id is not None:
emb = self.emb(cond_embedding_id)
h = (emb.view(1, -1, 1, 1) * x).sum(dim=1, keepdims=True)
else:
h = 0
x = self.conv_post(x)
fmap.append(x)
x += h
x = torch.flatten(x, 1, -1)
return x, fmap
def spectrogram(self, x: torch.Tensor) -> torch.Tensor:
n_fft, hop_length, win_length = self.resolution
magnitude_spectrogram = torch.stft(
x,
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
window=None, # interestingly rectangular window kind of works here
center=True,
return_complex=True,
).abs()
return magnitude_spectrogram
import math
import numpy as np
import pytorch_lightning as pl
import torch
import torchaudio
import transformers
import yaml
from decoder.discriminator_dac import DACDiscriminator
from decoder.discriminators import MultiPeriodDiscriminator, MultiResolutionDiscriminator
from decoder.feature_extractors import FeatureExtractor
from decoder.heads import FourierHead
from decoder.helpers import plot_spectrogram_to_numpy
from decoder.loss import DiscriminatorLoss, GeneratorLoss, FeatureMatchingLoss, MelSpecReconstructionLoss, DACGANLoss
from decoder.models import Backbone
from decoder.modules import safe_log
from decoder.pretrained_model import instantiate_class
class VocosExp(pl.LightningModule):
# noinspection PyUnusedLocal
def __init__(
self,
feature_extractor: FeatureExtractor,
backbone: Backbone,
head: FourierHead,
resume_config: str,
resume_model: str,
sample_rate: int = 24000,
initial_learning_rate: float = 2e-4,
num_warmup_steps: int = 0,
mel_loss_coeff: float = 45,
mrd_loss_coeff: float = 1.0,
pretrain_mel_steps: int = 0,
decay_mel_coeff: bool = False,
evaluate_utmos: bool = False,
evaluate_pesq: bool = False,
evaluate_periodicty: bool = False,
resume: bool = False,
):
"""
Args:
feature_extractor (FeatureExtractor): An instance of FeatureExtractor to extract features from audio signals.
backbone (Backbone): An instance of Backbone model.
head (FourierHead): An instance of Fourier head to generate spectral coefficients and reconstruct a waveform.
sample_rate (int): Sampling rate of the audio signals.
initial_learning_rate (float): Initial learning rate for the optimizer.
num_warmup_steps (int): Number of steps for the warmup phase of learning rate scheduler. Default is 0.
mel_loss_coeff (float, optional): Coefficient for Mel-spectrogram loss in the loss function. Default is 45.
mrd_loss_coeff (float, optional): Coefficient for Multi Resolution Discriminator loss. Default is 1.0.
pretrain_mel_steps (int, optional): Number of steps to pre-train the model without the GAN objective. Default is 0.
decay_mel_coeff (bool, optional): If True, the Mel-spectrogram loss coefficient is decayed during training. Default is False.
evaluate_utmos (bool, optional): If True, UTMOS scores are computed for each validation run.
evaluate_pesq (bool, optional): If True, PESQ scores are computed for each validation run.
evaluate_periodicty (bool, optional): If True, periodicity scores are computed for each validation run.
"""
super().__init__()
self.save_hyperparameters(ignore=["feature_extractor", "backbone", "head"])
self.feature_extractor = feature_extractor
self.backbone = backbone
self.head = head
self.resume_config = resume_config
self.resume_model = resume_model
self.resume = resume
self.multiperioddisc = MultiPeriodDiscriminator()
self.multiresddisc = MultiResolutionDiscriminator()
self.dac = DACDiscriminator()
self.dacdiscriminator = DACGANLoss(self.dac)
self.disc_loss = DiscriminatorLoss()
self.gen_loss = GeneratorLoss()
self.feat_matching_loss = FeatureMatchingLoss()
self.melspec_loss = MelSpecReconstructionLoss(sample_rate=sample_rate)
self.train_discriminator = False
self.base_mel_coeff = self.mel_loss_coeff = mel_loss_coeff
def configure_optimizers(self):
disc_params = [
{"params": self.multiperioddisc.parameters()},
{"params": self.multiresddisc.parameters()},
{"params": self.dac.parameters()},
]
gen_params = [
{"params": self.feature_extractor.parameters()},
{"params": self.backbone.parameters()},
{"params": self.head.parameters()},
]
opt_disc = torch.optim.AdamW(disc_params, lr=self.hparams.initial_learning_rate)
opt_gen = torch.optim.AdamW(gen_params, lr=self.hparams.initial_learning_rate)
max_steps = self.trainer.max_steps // 2 # Max steps per optimizer
scheduler_disc = transformers.get_cosine_schedule_with_warmup(
opt_disc, num_warmup_steps=self.hparams.num_warmup_steps, num_training_steps=max_steps,
)
scheduler_gen = transformers.get_cosine_schedule_with_warmup(
opt_gen, num_warmup_steps=self.hparams.num_warmup_steps, num_training_steps=max_steps,
)
return (
[opt_disc, opt_gen],
[{"scheduler": scheduler_disc, "interval": "step"}, {"scheduler": scheduler_gen, "interval": "step"}],
)
def forward(self, audio_input, **kwargs):
features, _, commit_loss = self.feature_extractor(audio_input, **kwargs)
# print('1111', self.feature_extractor.state_dict()['encodec.decoder.model.3.convtr.convtr.weight_g'])
x = self.backbone(features, **kwargs)
audio_output = self.head(x)
return audio_output, commit_loss
def training_step(self, batch, batch_idx, optimizer_idx, **kwargs):
audio_input = batch
# train discriminator
if optimizer_idx == 0 and self.train_discriminator:
with torch.no_grad():
audio_hat, _ = self(audio_input, **kwargs)
loss_dac = self.dacdiscriminator.discriminator_loss(audio_hat.unsqueeze(1), audio_input.unsqueeze(1))
real_score_mp, gen_score_mp, _, _ = self.multiperioddisc(y=audio_input, y_hat=audio_hat, **kwargs,)
real_score_mrd, gen_score_mrd, _, _ = self.multiresddisc(y=audio_input, y_hat=audio_hat, **kwargs,)
loss_mp, loss_mp_real, _ = self.disc_loss(
disc_real_outputs=real_score_mp, disc_generated_outputs=gen_score_mp
)
loss_mrd, loss_mrd_real, _ = self.disc_loss(
disc_real_outputs=real_score_mrd, disc_generated_outputs=gen_score_mrd
)
loss_mp /= len(loss_mp_real)
loss_mrd /= len(loss_mrd_real)
loss = loss_mp + self.hparams.mrd_loss_coeff * loss_mrd + loss_dac
self.log("discriminator/total", loss, prog_bar=True)
self.log("discriminator/multi_period_loss", loss_mp)
self.log("discriminator/multi_res_loss", loss_mrd)
self.log("discriminator/dac", loss_dac)
return loss
# train generator
if optimizer_idx == 1:
audio_hat, commit_loss = self(audio_input, **kwargs)
if self.train_discriminator:
loss_dac_1, loss_dac_2 = self.dacdiscriminator.generator_loss(audio_hat.unsqueeze(1), audio_input.unsqueeze(1))
_, gen_score_mp, fmap_rs_mp, fmap_gs_mp = self.multiperioddisc(
y=audio_input, y_hat=audio_hat, **kwargs,
)
_, gen_score_mrd, fmap_rs_mrd, fmap_gs_mrd = self.multiresddisc(
y=audio_input, y_hat=audio_hat, **kwargs,
)
loss_gen_mp, list_loss_gen_mp = self.gen_loss(disc_outputs=gen_score_mp)
loss_gen_mrd, list_loss_gen_mrd = self.gen_loss(disc_outputs=gen_score_mrd)
loss_gen_mp = loss_gen_mp / len(list_loss_gen_mp)
loss_gen_mrd = loss_gen_mrd / len(list_loss_gen_mrd)
loss_fm_mp = self.feat_matching_loss(fmap_r=fmap_rs_mp, fmap_g=fmap_gs_mp) / len(fmap_rs_mp)
loss_fm_mrd = self.feat_matching_loss(fmap_r=fmap_rs_mrd, fmap_g=fmap_gs_mrd) / len(fmap_rs_mrd)
self.log("generator/multi_period_loss", loss_gen_mp)
self.log("generator/multi_res_loss", loss_gen_mrd)
self.log("generator/feature_matching_mp", loss_fm_mp)
self.log("generator/feature_matching_mrd", loss_fm_mrd)
self.log("generator/loss_dac_1", loss_dac_1)
self.log("generator/loss_dac_2", loss_dac_2)
else:
loss_gen_mp = loss_gen_mrd = loss_fm_mp = loss_fm_mrd = loss_dac_1 = loss_dac_2 = 0  # zero the DAC terms too, so the loss sum below never raises a NameError
mel_loss = self.melspec_loss(audio_hat, audio_input)
loss = (
loss_gen_mp
+ self.hparams.mrd_loss_coeff * loss_gen_mrd
+ loss_fm_mp
+ self.hparams.mrd_loss_coeff * loss_fm_mrd
+ self.mel_loss_coeff * mel_loss
+ 1000 * commit_loss
+ loss_dac_1
+ loss_dac_2
)
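# Loss weighting: the MRD terms share mrd_loss_coeff, the mel term uses the (possibly decayed)
# mel_loss_coeff, and the RVQ commitment penalty is up-weighted by 1000, presumably to keep the
# codebooks tracking the encoder output.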
self.log("generator/total_loss", loss, prog_bar=True)
self.log("mel_loss_coeff", self.mel_loss_coeff)
self.log("generator/mel_loss", mel_loss)
self.log("commit_loss", commit_loss)
if self.global_step % 1000 == 0 and self.global_rank == 0:
self.logger.experiment.add_audio(
"train/audio_in", audio_input[0].data.cpu(), self.global_step, self.hparams.sample_rate
)
self.logger.experiment.add_audio(
"train/audio_pred", audio_hat[0].data.cpu(), self.global_step, self.hparams.sample_rate
)
with torch.no_grad():
mel = safe_log(self.melspec_loss.mel_spec(audio_input[0]))
mel_hat = safe_log(self.melspec_loss.mel_spec(audio_hat[0]))
self.logger.experiment.add_image(
"train/mel_target",
plot_spectrogram_to_numpy(mel.data.cpu().numpy()),
self.global_step,
dataformats="HWC",
)
self.logger.experiment.add_image(
"train/mel_pred",
plot_spectrogram_to_numpy(mel_hat.data.cpu().numpy()),
self.global_step,
dataformats="HWC",
)
return loss
def on_validation_epoch_start(self):
if self.hparams.evaluate_utmos:
from metrics.UTMOS import UTMOSScore
if not hasattr(self, "utmos_model"):
self.utmos_model = UTMOSScore(device=self.device)
def validation_step(self, batch, batch_idx, **kwargs):
audio_input = batch
audio_hat, commit_loss = self(audio_input, **kwargs)
audio_16_khz = torchaudio.functional.resample(audio_input, orig_freq=self.hparams.sample_rate, new_freq=16000)
audio_hat_16khz = torchaudio.functional.resample(audio_hat, orig_freq=self.hparams.sample_rate, new_freq=16000)
if self.hparams.evaluate_periodicty:
from metrics.periodicity import calculate_periodicity_metrics
periodicity_loss, pitch_loss, f1_score = calculate_periodicity_metrics(audio_16_khz, audio_hat_16khz)
else:
periodicity_loss = pitch_loss = f1_score = 0
if self.hparams.evaluate_utmos:
utmos_score = self.utmos_model.score(audio_hat_16khz.unsqueeze(1)).mean()
else:
utmos_score = torch.zeros(1, device=self.device)
if self.hparams.evaluate_pesq:
from pesq import pesq
pesq_score = 0
for ref, deg in zip(audio_16_khz.cpu().numpy(), audio_hat_16khz.cpu().numpy()):
pesq_score += pesq(16000, ref, deg, "wb", on_error=1)
pesq_score /= len(audio_16_khz)
pesq_score = torch.tensor(pesq_score)
else:
pesq_score = torch.zeros(1, device=self.device)
mel_loss = self.melspec_loss(audio_hat.unsqueeze(1), audio_input.unsqueeze(1))
total_loss = mel_loss + (5 - utmos_score) + (5 - pesq_score) + 1000 * commit_loss
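# UTMOS and PESQ are quality scores (higher is better) on roughly 1-5 scales,
# so (5 - score) folds them into the minimised loss.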
return {
"val_loss": total_loss,
"mel_loss": mel_loss,
"utmos_score": utmos_score,
"pesq_score": pesq_score,
"periodicity_loss": periodicity_loss,
"pitch_loss": pitch_loss,
"f1_score": f1_score,
"audio_input": audio_input[0],
"audio_pred": audio_hat[0],
}
def validation_epoch_end(self, outputs):
if self.global_rank == 0:
*_, audio_in, audio_pred = outputs[0].values()
self.logger.experiment.add_audio(
"val_in", audio_in.data.cpu().numpy(), self.global_step, self.hparams.sample_rate
)
self.logger.experiment.add_audio(
"val_pred", audio_pred.data.cpu().numpy(), self.global_step, self.hparams.sample_rate
)
mel_target = safe_log(self.melspec_loss.mel_spec(audio_in))
mel_hat = safe_log(self.melspec_loss.mel_spec(audio_pred))
self.logger.experiment.add_image(
"val_mel_target",
plot_spectrogram_to_numpy(mel_target.data.cpu().numpy()),
self.global_step,
dataformats="HWC",
)
self.logger.experiment.add_image(
"val_mel_hat",
plot_spectrogram_to_numpy(mel_hat.data.cpu().numpy()),
self.global_step,
dataformats="HWC",
)
avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
mel_loss = torch.stack([x["mel_loss"] for x in outputs]).mean()
utmos_score = torch.stack([x["utmos_score"] for x in outputs]).mean()
pesq_score = torch.stack([x["pesq_score"] for x in outputs]).mean()
periodicity_loss = np.array([x["periodicity_loss"] for x in outputs]).mean()
pitch_loss = np.array([x["pitch_loss"] for x in outputs]).mean()
f1_score = np.array([x["f1_score"] for x in outputs]).mean()
self.log("val_loss", avg_loss, sync_dist=True)
self.log("val/mel_loss", mel_loss, sync_dist=True)
self.log("val/utmos_score", utmos_score, sync_dist=True)
self.log("val/pesq_score", pesq_score, sync_dist=True)
self.log("val/periodicity_loss", periodicity_loss, sync_dist=True)
self.log("val/pitch_loss", pitch_loss, sync_dist=True)
self.log("val/f1_score", f1_score, sync_dist=True)
@property
def global_step(self):
"""
Override global_step so that it returns the total number of batches processed
"""
return self.trainer.fit_loop.epoch_loop.total_batch_idx
def on_train_batch_start(self, *args):
if self.global_step >= self.hparams.pretrain_mel_steps:
self.train_discriminator = True
else:
self.train_discriminator = False
def on_train_batch_end(self, *args):
def mel_loss_coeff_decay(current_step, num_cycles=0.5):
max_steps = self.trainer.max_steps // 2
if current_step < self.hparams.num_warmup_steps:
return 1.0
progress = float(current_step - self.hparams.num_warmup_steps) / float(
max(1, max_steps - self.hparams.num_warmup_steps)
)
return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
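# Same half-cosine shape as transformers.get_cosine_schedule_with_warmup, so the mel
# coefficient decays in step with the LR schedulers configured above.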
if self.hparams.decay_mel_coeff:
self.mel_loss_coeff = self.base_mel_coeff * mel_loss_coeff_decay(self.global_step + 1)
class WavTokenizer(VocosExp):
"""
WavTokenizer is a subclass of VocosExp that overrides the parent experiment to function as a conditional GAN.
It manages an additional `bandwidth_id` attribute, which denotes a learnable embedding corresponding to
a specific bandwidth value of EnCodec. During training, a random bandwidth_id is generated for each step,
while during validation, a fixed bandwidth_id is used.
"""
def __init__(
self,
feature_extractor: FeatureExtractor,
backbone: Backbone,
head: FourierHead,
resume_config: str,
resume_model: str,
sample_rate: int = 24000,
initial_learning_rate: float = 2e-4,
num_warmup_steps: int = 0,
mel_loss_coeff: float = 45,
mrd_loss_coeff: float = 1.0,
pretrain_mel_steps: int = 0,
decay_mel_coeff: bool = False,
evaluate_utmos: bool = False,
evaluate_pesq: bool = False,
evaluate_periodicty: bool = False,
resume: bool = False,
):
super().__init__(
feature_extractor,
backbone,
head,
resume_config,
resume_model,
sample_rate,
initial_learning_rate,
num_warmup_steps,
mel_loss_coeff,
mrd_loss_coeff,
pretrain_mel_steps,
decay_mel_coeff,
evaluate_utmos,
evaluate_pesq,
evaluate_periodicty,
resume
)
# Override with conditional discriminators
self.multiperioddisc = MultiPeriodDiscriminator(num_embeddings=len(self.feature_extractor.bandwidths))
self.multiresddisc = MultiResolutionDiscriminator(num_embeddings=len(self.feature_extractor.bandwidths))
self.dac = DACDiscriminator()
if self.resume:
print('Loading pretrained model:', self.resume_model)
# with open(self.resume_config, "r") as f:
# config = yaml.safe_load(f)
# feature_extractor = instantiate_class(args=(), init=config['model']['init_args']["feature_extractor"])
# backbone = instantiate_class(args=(), init=config['model']['init_args']["backbone"])
# head = instantiate_class(args=(), init=config['model']['init_args']["head"])
# Selectively load checkpoint weights; part of the quantizer is skipped (only RVQ layers 0-7 are kept below)
state_dict_raw = torch.load(self.resume_model, map_location=self.device)['state_dict']
state_dict_fa_qa = dict()
state_dict_fa_en = dict()
state_dict_fa_de = dict()
state_dict_bb = dict()
state_dict_hd = dict()
state_dict_mp = dict()
state_dict_mr = dict()
state_dict_dac = dict()
for k, v in state_dict_raw.items():
# Quantizer keys look like 'feature_extractor.encodec.quantizer.vq.layers.<i>...';
# keep only RVQ layers 0-7 and strip the 36-char module prefix.
if k.startswith('feature_extractor.encodec.quantizer'):
ss = k[46:48]  # layer-index slice: '<d>.' for single digits, '<dd>' otherwise
if ss[-1] == '.':  # single-digit layer index
num = int(ss[0])
if num <= 7:
state_dict_fa_qa[k[36:]] = v
if k.startswith('feature_extractor.encodec.encoder'):
state_dict_fa_en[k[34:]] = v  # strip 'feature_extractor.encodec.encoder.'
if k.startswith('feature_extractor.encodec.decoder'):
state_dict_fa_de[k[34:]] = v  # strip 'feature_extractor.encodec.decoder.'
if k.startswith('backbone.'):
state_dict_bb[k[9:]] = v
if k.startswith('head.'):
state_dict_hd[k[5:]] = v
if k.startswith('multiperioddisc.'):
state_dict_mp[k[16:]] = v
if k.startswith('multiresddisc.'):
state_dict_mr[k[14:]] = v
if k.startswith('dac.'):
state_dict_dac[k[4:]] = v
feature_extractor.encodec.encoder.load_state_dict(state_dict_fa_en, strict=True)
feature_extractor.encodec.decoder.load_state_dict(state_dict_fa_de, strict=True)
feature_extractor.encodec.quantizer.load_state_dict(state_dict_fa_qa, strict=True)
backbone.load_state_dict(state_dict_bb, strict=True)
head.load_state_dict(state_dict_hd, strict=True)
self.feature_extractor = feature_extractor.to(self.device)
self.backbone = backbone.to(self.device)
self.head = head.to(self.device)
self.multiperioddisc.load_state_dict(state_dict_mp, strict=True)
self.multiresddisc.load_state_dict(state_dict_mr, strict=True)
self.dac.load_state_dict(state_dict_dac, strict=True)
def training_step(self, *args):
bandwidth_id = torch.randint(low=0, high=len(self.feature_extractor.bandwidths), size=(1,), device=self.device,)
output = super().training_step(*args, bandwidth_id=bandwidth_id)
return output
def validation_step(self, *args):
bandwidth_id = torch.tensor([0], device=self.device)
output = super().validation_step(*args, bandwidth_id=bandwidth_id)
return output
def validation_epoch_end(self, outputs):
if self.global_rank == 0:
*_, audio_in, _ = outputs[0].values()
# Resynthesis with encodec for reference
self.feature_extractor.encodec.set_target_bandwidth(self.feature_extractor.bandwidths[0])
encodec_audio = self.feature_extractor.encodec(audio_in[None, None, :])
self.logger.experiment.add_audio(
"encodec", encodec_audio[0, 0].data.cpu().numpy(), self.global_step, self.hparams.sample_rate,
)
super().validation_epoch_end(outputs)
from typing import List
import torch
import torchaudio
from torch import nn
import math
# from inspiremusic.wavtokenizer.decoder.modules import safe_log
from inspiremusic.wavtokenizer.encoder.modules import SEANetEncoder, SEANetDecoder
from inspiremusic.wavtokenizer.encoder import EncodecModel
from inspiremusic.wavtokenizer.encoder.quantization import ResidualVectorQuantizer
def safe_log(x: torch.Tensor, clip_val: float = 1e-7) -> torch.Tensor:
"""
Computes the element-wise logarithm of the input tensor with clipping to avoid near-zero values.
Args:
x (Tensor): Input tensor.
clip_val (float, optional): Minimum value to clip the input tensor. Defaults to 1e-7.
Returns:
Tensor: Element-wise logarithm of the input tensor with clipping applied.
"""
return torch.log(torch.clip(x, min=clip_val))
def symlog(x: torch.Tensor) -> torch.Tensor:
return torch.sign(x) * torch.log1p(x.abs())
def symexp(x: torch.Tensor) -> torch.Tensor:
return torch.sign(x) * (torch.exp(x.abs()) - 1)
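# symlog/symexp are a sign-preserving log-compression pair (symexp(symlog(x)) == x),
# useful for squashing targets with a large dynamic range.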
class FeatureExtractor(nn.Module):
"""Base class for feature extractors."""
def forward(self, audio: torch.Tensor, **kwargs) -> torch.Tensor:
"""
Extract features from the given audio.
Args:
audio (Tensor): Input audio waveform.
Returns:
Tensor: Extracted features of shape (B, C, L), where B is the batch size,
C denotes output features, and L is the sequence length.
"""
raise NotImplementedError("Subclasses must implement the forward method.")
class MelSpectrogramFeatures(FeatureExtractor):
def __init__(self, sample_rate=24000, n_fft=1024, hop_length=256, n_mels=100, padding="center"):
super().__init__()
if padding not in ["center", "same"]:
raise ValueError("Padding must be 'center' or 'same'.")
self.padding = padding
self.mel_spec = torchaudio.transforms.MelSpectrogram(
sample_rate=sample_rate,
n_fft=n_fft,
hop_length=hop_length,
n_mels=n_mels,
center=padding == "center",
power=1,
)
def forward(self, audio, **kwargs):
if self.padding == "same":
pad = self.mel_spec.win_length - self.mel_spec.hop_length
audio = torch.nn.functional.pad(audio, (pad // 2, pad // 2), mode="reflect")
mel = self.mel_spec(audio)
features = safe_log(mel)
return features
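# Shape sketch: with the defaults above, a (B, T) waveform maps to (B, 100, T // 256 + 1)
# log-magnitude mel features, e.g. MelSpectrogramFeatures()(torch.randn(2, 24000)) -> (2, 100, 94).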
class EncodecFeatures(FeatureExtractor):
def __init__(
self,
encodec_model: str = "encodec_24khz",
bandwidths: List[float] = [1.5, 3.0, 6.0, 12.0],
train_codebooks: bool = False,
num_quantizers: int = 1,
dowmsamples: List[int] = [6, 5, 5, 4],  # encoder downsampling ratios ('dowmsamples' spelling kept: configs reference this kwarg by name)
vq_bins: int = 16384,
vq_kmeans: int = 800,
):
super().__init__()
self.frame_rate = 25  # passed to the quantizer in forward()/infer() below
# n_q = int(bandwidths[-1]*1000/(math.log2(2048) * self.frame_rate))
n_q = num_quantizers  # number of residual VQ layers (overrides the bandwidth-derived formula above)
encoder = SEANetEncoder(causal=False, n_residual_layers=1, norm='weight_norm', pad_mode='reflect', lstm=2,
dimension=512, channels=1, n_filters=32, ratios=dowmsamples, activation='ELU',
kernel_size=7, residual_kernel_size=3, last_kernel_size=7, dilation_base=2,
true_skip=False, compress=2)
decoder = SEANetDecoder(causal=False, n_residual_layers=1, norm='weight_norm', pad_mode='reflect', lstm=2,
dimension=512, channels=1, n_filters=32, ratios=[8, 5, 4, 2], activation='ELU',
kernel_size=7, residual_kernel_size=3, last_kernel_size=7, dilation_base=2,
true_skip=False, compress=2)
quantizer = ResidualVectorQuantizer(dimension=512, n_q=n_q, bins=vq_bins, kmeans_iters=vq_kmeans,
decay=0.99, kmeans_init=True)
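# The residual VQ above cascades n_q codebooks of vq_bins entries each, with k-means
# initialisation (vq_kmeans iterations) and EMA codebook decay 0.99.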
if encodec_model == "encodec_24khz":
self.encodec = EncodecModel(encoder=encoder, decoder=decoder, quantizer=quantizer,
target_bandwidths=bandwidths, sample_rate=24000, channels=1)
else:
raise ValueError(
f"Unsupported encodec_model: {encodec_model}. Supported options are 'encodec_24khz'."
)
for param in self.encodec.parameters():
param.requires_grad = True
# self.num_q = n_q
# codebook_weights = torch.cat([vq.codebook for vq in self.encodec.quantizer.vq.layers[: self.num_q]], dim=0)
# self.codebook_weights = torch.nn.Parameter(codebook_weights, requires_grad=train_codebooks)
self.bandwidths = bandwidths
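# bandwidth_id sampled in WavTokenizer.training_step indexes into this list; the conditional
# discriminators are built with num_embeddings=len(bandwidths) to match.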
# @torch.no_grad()
# def get_encodec_codes(self, audio):
# audio = audio.unsqueeze(1)
# emb = self.encodec.encoder(audio)
# codes = self.encodec.quantizer.encode(emb, self.encodec.frame_rate, self.encodec.bandwidth)
# return codes
def forward(self, audio: torch.Tensor, bandwidth_id: torch.Tensor = torch.tensor(0)):
if self.training:
self.encodec.train()
audio = audio.unsqueeze(1)  # (B, T) -> (B, 1, T)
emb = self.encodec.encoder(audio)
q_res = self.encodec.quantizer(emb, self.frame_rate, bandwidth=self.bandwidths[bandwidth_id])
quantized = q_res.quantized
codes = q_res.codes
commit_loss = q_res.penalty  # quantized: (B, D, T'), codes: (n_q, B, T')
return quantized, codes, commit_loss
# codes = self.get_encodec_codes(audio)
# # Instead of summing in the loop, it stores subsequent VQ dictionaries in a single `self.codebook_weights`
# # with offsets given by the number of bins, and finally summed in a vectorized operation.
# offsets = torch.arange(
# 0, self.encodec.quantizer.bins * len(codes), self.encodec.quantizer.bins, device=audio.device
# )
# embeddings_idxs = codes + offsets.view(-1, 1, 1)
# features = torch.nn.functional.embedding(embeddings_idxs, self.codebook_weights).sum(dim=0)
# return features.transpose(1, 2)
def infer(self, audio: torch.Tensor, bandwidth_id: torch.Tensor):
if self.training:
self.encodec.train()
audio = audio.unsqueeze(1)  # (B, T) -> (B, 1, T)
emb = self.encodec.encoder(audio)
q_res = self.encodec.quantizer.infer(emb, self.frame_rate, bandwidth=self.bandwidths[bandwidth_id])
quantized = q_res.quantized
codes = q_res.codes
commit_loss = q_res.penalty  # quantized: (B, D, T'), codes: (n_q, B, T')
return quantized, codes, commit_loss
def _infer(self, audio: torch.Tensor, bandwidth_id: torch.Tensor = torch.tensor(0)):
if self.training:
self.encodec.train()
audio = audio.unsqueeze(1)  # (B, T) -> (B, 1, T)
emb = self.encodec.encoder(audio)
q_res = self.encodec.quantizer.infer(emb, self.frame_rate, bandwidth=self.bandwidths[bandwidth_id])
quantized = q_res.quantized
codes = q_res.codes
commit_loss = q_res.penalty  # quantized: (B, D, T'), codes: (n_q, B, T')
return quantized, codes, commit_loss
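# Usage sketch (hypothetical shapes; T' depends on the encoder downsampling ratios):
#     ef = EncodecFeatures(num_quantizers=8)
#     quantized, codes, commit_loss = ef(torch.randn(2, 24000), bandwidth_id=torch.tensor(0))
#     # quantized: (2, 512, T'), codes: (8, 2, T'), commit_loss: scalar commitment penalty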