v1.0

0112b0f0 · chenzk · 0112b0f0 · 0112b0f0 · 0112b0f0 · 0112b0f0
Commit 0112b0f0 authored Feb 14, 2025 by chenzk
20 changed files
--- a/examples/music_generation/inspiremusic/music_tokenizer/meldataset.py
+++ b/examples/music_generation/inspiremusic/music_tokenizer/meldataset.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# code based on https://github.com/b04901014/MQTTS
+import math
+import os
+import random
+
+import librosa
+import numpy as np
+import torch.utils.data
+from librosa.filters import mel as librosa_mel_fn
+
+def load_wav(full_path, sr):
+    """Read an audio file through ``librosa.load``.
+
+    Args:
+        full_path: Path of the audio file to read.
+        sr: Sample rate forwarded to ``librosa.load``.
+
+    Returns:
+        The ``(waveform, sample_rate)`` pair produced by ``librosa.load``.
+    """
+    waveform, sample_rate = librosa.load(full_path, sr=sr)
+    return waveform, sample_rate
+
+def dynamic_range_compression(x, C=1, clip_val=1e-5):
+    """Return ``log(max(x, clip_val) * C)`` computed with NumPy."""
+    floored = np.clip(x, a_min=clip_val, a_max=None)
+    scaled = floored * C
+    return np.log(scaled)
+
+def dynamic_range_decompression(x, C=1):
+    """Return ``exp(x) / C`` computed with NumPy."""
+    expanded = np.exp(x)
+    return expanded / C
+
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+    """Return ``log(clamp(x, min=clip_val) * C)`` for a torch tensor."""
+    floored = torch.clamp(x, min=clip_val)
+    scaled = floored * C
+    return torch.log(scaled)
+
+def dynamic_range_decompression_torch(x, C=1):
+    """Return ``exp(x) / C`` for a torch tensor."""
+    expanded = torch.exp(x)
+    return expanded / C
+
+def spectral_normalize_torch(magnitudes):
+    """Apply ``dynamic_range_compression_torch`` with its default arguments."""
+    return dynamic_range_compression_torch(magnitudes)
+
+def spectral_de_normalize_torch(magnitudes):
+    """Apply ``dynamic_range_decompression_torch`` with its default arguments."""
+    return dynamic_range_decompression_torch(magnitudes)
+
+# Module-level caches reused across mel_spectrogram() calls.
+# mel_basis: (sampling_rate, n_fft, num_mels, fmin, fmax, device) -> filterbank
+# hann_window: (win_size, device) -> analysis window
+mel_basis = {}
+hann_window = {}
+
+
+# torch.stft is called with return_complex=True (required by PyTorch 2.x) and
+# converted back to a real (..., 2) view before taking the magnitude.
+def mel_spectrogram(y,
+                    n_fft,
+                    num_mels,
+                    sampling_rate,
+                    hop_size,
+                    win_size,
+                    fmin,
+                    fmax,
+                    center=False):
+    """Compute a log-compressed mel spectrogram of ``y``.
+
+    The mel filterbank and the Hann window are cached per parameter set and
+    device, so repeated calls with the same configuration do not rebuild them.
+
+    Args:
+        y: Waveform tensor; presumably shaped (batch, samples) -- it is
+            unsqueezed on dim 1 for reflect padding and squeezed back.
+        n_fft: FFT size.
+        num_mels: Number of mel bands.
+        sampling_rate: Sample rate used to build the mel filterbank.
+        hop_size: STFT hop length.
+        win_size: STFT window length.
+        fmin: Lowest filterbank frequency.
+        fmax: Highest filterbank frequency (may be ``None``).
+        center: Forwarded to ``torch.stft``.
+
+    Returns:
+        Tensor holding ``log(clamp(mel_magnitude, min=1e-5))``.
+    """
+    device_key = str(y.device)
+    basis_key = (sampling_rate, n_fft, num_mels, fmin, fmax, device_key)
+    window_key = (win_size, device_key)
+
+    if basis_key not in mel_basis:
+        # Keyword arguments work on old librosa releases and are mandatory
+        # from librosa 0.10 onwards.
+        mel = librosa_mel_fn(sr=sampling_rate,
+                             n_fft=n_fft,
+                             n_mels=num_mels,
+                             fmin=fmin,
+                             fmax=fmax)
+        mel_basis[basis_key] = torch.from_numpy(mel).float().to(y.device)
+    if window_key not in hann_window:
+        hann_window[window_key] = torch.hann_window(win_size).to(y.device)
+
+    # Pad manually so that, with center=False, frames stay aligned with hops.
+    pad = int((n_fft - hop_size) / 2)
+    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')
+    y = y.squeeze(1)
+
+    spec = torch.view_as_real(torch.stft(
+        y,
+        n_fft,
+        hop_length=hop_size,
+        win_length=win_size,
+        window=hann_window[window_key],
+        center=center,
+        pad_mode='reflect',
+        normalized=False,
+        onesided=True,
+        return_complex=True
+    ))
+
+    # Magnitude with a small epsilon to keep the sqrt gradient finite at zero.
+    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
+
+    spec = torch.matmul(mel_basis[basis_key], spec)
+    spec = spectral_normalize_torch(spec)
+
+    return spec
+
+
+def get_dataset_filelist(a):
+    """Read the training and validation file lists named on ``a``.
+
+    Args:
+        a: Object exposing ``input_training_file`` and
+            ``input_validation_file`` paths; each file holds one entry per line.
+
+    Returns:
+        ``(training_files, validation_files)`` as lists of stripped,
+        non-empty lines.
+    """
+
+    def _read_list(path):
+        # Skip blank lines so a trailing newline does not become an empty path.
+        with open(path, 'r', encoding='utf-8') as f:
+            return [entry for entry in (line.strip() for line in f) if entry]
+
+    training_files = _read_list(a.input_training_file)
+    validation_files = _read_list(a.input_validation_file)
+    return training_files, validation_files
+
+
+class MelDataset(torch.utils.data.Dataset):
+    """Dataset yielding ``(mel, audio, filename, mel_loss)`` per audio file.
+
+    By default the input mel is computed from the (optionally cropped or
+    zero-padded) waveform. With ``fine_tuning=True`` it is loaded from
+    ``<base_mels_path>/<file stem>.npy`` instead.
+    """
+
+    def __init__(self,
+                 training_files,
+                 segment_size,
+                 n_fft,
+                 num_mels,
+                 hop_size,
+                 win_size,
+                 sampling_rate,
+                 fmin,
+                 fmax,
+                 split=True,
+                 shuffle=True,
+                 n_cache_reuse=1,
+                 device=None,
+                 fmax_loss=None,
+                 fine_tuning=False,
+                 base_mels_path=None):
+        """Store the configuration and optionally shuffle the file list.
+
+        Note that this reseeds the global ``random`` module with 1234 and,
+        when ``shuffle`` is true, shuffles ``training_files`` in place.
+        """
+        self.audio_files = training_files
+        random.seed(1234)
+        if shuffle:
+            random.shuffle(self.audio_files)
+        self.segment_size = segment_size
+        self.sampling_rate = sampling_rate
+        self.split = split
+        self.n_fft = n_fft
+        self.num_mels = num_mels
+        self.hop_size = hop_size
+        self.win_size = win_size
+        self.fmin = fmin
+        self.fmax = fmax
+        self.fmax_loss = fmax_loss
+        self.cached_wav = None
+        # File the cached waveform was loaded from; guards against serving
+        # one file's audio under another file's name.
+        self._cached_filename = None
+        self.n_cache_reuse = n_cache_reuse
+        self._cache_ref_count = 0
+        self.device = device
+        self.fine_tuning = fine_tuning
+        self.base_mels_path = base_mels_path
+
+    def __getitem__(self, index):
+        """Return ``(mel, audio, filename, mel_loss)`` for ``index``.
+
+        ``mel`` uses ``fmax`` while ``mel_loss`` is recomputed from the same
+        audio with ``fmax_loss``. A file that fails to load is replaced by
+        low-amplitude Gaussian noise after printing the error.
+
+        Raises:
+            ValueError: If the loaded sample rate differs from the configured
+                one.
+        """
+        filename = self.audio_files[index]
+        # Reuse the cached waveform only when it belongs to this very file;
+        # otherwise the audio would not match `filename` (or the precomputed
+        # mel in fine-tuning mode).
+        reuse_cache = (self._cache_ref_count > 0
+                       and self._cached_filename == filename)
+        if reuse_cache:
+            audio = self.cached_wav
+            self._cache_ref_count -= 1
+        else:
+            try:
+                # Note by yuantian: load with the sample_rate of config
+                audio, sampling_rate = load_wav(filename, sr=self.sampling_rate)
+            except Exception as e:
+                # Best effort: keep iterating with a noise placeholder.
+                print(f"Error on audio: {filename} ({e})")
+                audio = np.random.normal(size=(160000, )) * 0.05
+                sampling_rate = self.sampling_rate
+            self.cached_wav = audio
+            self._cached_filename = filename
+            if sampling_rate != self.sampling_rate:
+                raise ValueError("{} SR doesn't match target {} SR".format(
+                    sampling_rate, self.sampling_rate))
+            self._cache_ref_count = self.n_cache_reuse
+
+        audio = torch.FloatTensor(audio)
+        audio = audio.unsqueeze(0)
+
+        if not self.fine_tuning:
+            if self.split:
+                if audio.size(1) >= self.segment_size:
+                    # Random crop of exactly segment_size samples.
+                    max_audio_start = audio.size(1) - self.segment_size
+                    audio_start = random.randint(0, max_audio_start)
+                    audio = audio[:, audio_start:audio_start +
+                                  self.segment_size]
+                else:
+                    # Zero-pad short clips up to segment_size.
+                    audio = torch.nn.functional.pad(audio, (
+                        0, self.segment_size - audio.size(1)), 'constant')
+
+            mel = mel_spectrogram(
+                audio,
+                self.n_fft,
+                self.num_mels,
+                self.sampling_rate,
+                self.hop_size,
+                self.win_size,
+                self.fmin,
+                self.fmax,
+                center=False)
+        else:
+            mel = np.load(
+                os.path.join(self.base_mels_path,
+                             os.path.splitext(os.path.split(filename)[-1])[0] +
+                             '.npy'))
+            mel = torch.from_numpy(mel)
+
+            if len(mel.shape) < 3:
+                mel = mel.unsqueeze(0)
+
+            if self.split:
+                frames_per_seg = math.ceil(self.segment_size / self.hop_size)
+
+                if audio.size(1) >= self.segment_size:
+                    # Clamp at 0: a mel that is only one segment long would
+                    # otherwise give randint an empty range and raise.
+                    max_mel_start = max(0, mel.size(2) - frames_per_seg - 1)
+                    mel_start = random.randint(0, max_mel_start)
+                    mel = mel[:, :, mel_start:mel_start + frames_per_seg]
+                    audio = audio[:, mel_start * self.hop_size:(
+                        mel_start + frames_per_seg) * self.hop_size]
+                else:
+                    mel = torch.nn.functional.pad(mel, (
+                        0, frames_per_seg - mel.size(2)), 'constant')
+                    audio = torch.nn.functional.pad(audio, (
+                        0, self.segment_size - audio.size(1)), 'constant')
+
+        mel_loss = mel_spectrogram(
+            audio,
+            self.n_fft,
+            self.num_mels,
+            self.sampling_rate,
+            self.hop_size,
+            self.win_size,
+            self.fmin,
+            self.fmax_loss,
+            center=False)
+
+        return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
+
+    def __len__(self):
+        """Return the number of audio files."""
+        return len(self.audio_files)
--- a/examples/music_generation/inspiremusic/music_tokenizer/models.py
+++ b/examples/music_generation/inspiremusic/music_tokenizer/models.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import AvgPool1d
+from torch.nn import Conv1d
+from torch.nn import Conv2d
+from torch.nn import ConvTranspose1d
+from torch.nn.utils import remove_weight_norm
+from torch.nn.utils import spectral_norm
+from torch.nn.utils import weight_norm
+
+from inspiremusic.utils.tokenizer_utils import get_padding
+from inspiremusic.utils.tokenizer_utils import init_weights
+
+# Negative slope passed to the F.leaky_relu calls that reference it below.
+LRELU_SLOPE = 0.1
+
+class ResBlock1(torch.nn.Module):
+    """Residual block of three (dilated conv, dilation-1 conv) pairs."""
+
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super(ResBlock1, self).__init__()
+        self.h = h
+
+        def _make_conv(rate):
+            # Weight-normalised stride-1 conv; padding comes from get_padding.
+            return weight_norm(
+                Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    1,
+                    dilation=rate,
+                    padding=get_padding(kernel_size, rate)))
+
+        # First conv of each pair uses the configured dilation rates.
+        self.convs1 = nn.ModuleList([
+            _make_conv(dilation[0]),
+            _make_conv(dilation[1]),
+            _make_conv(dilation[2]),
+        ])
+        self.convs1.apply(init_weights)
+
+        # Second conv of each pair always uses dilation 1.
+        self.convs2 = nn.ModuleList([_make_conv(1) for _ in range(3)])
+        self.convs2.apply(init_weights)
+
+    def forward(self, x):
+        """Apply each conv pair with leaky-ReLU pre-activations and a skip."""
+        for dilated, plain in zip(self.convs1, self.convs2):
+            skip = x
+            out = dilated(F.leaky_relu(x, LRELU_SLOPE))
+            out = plain(F.leaky_relu(out, LRELU_SLOPE))
+            x = out + skip
+        return x
+
+    def remove_weight_norm(self):
+        """Strip weight normalisation from every conv in the block."""
+        for layer in list(self.convs1) + list(self.convs2):
+            remove_weight_norm(layer)
+
+
+class ResBlock2(torch.nn.Module):
+    """Residual block of two dilated convs, each with its own skip."""
+
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
+        super(ResBlock2, self).__init__()
+        self.h = h
+
+        def _make_conv(rate):
+            # Weight-normalised stride-1 conv; padding comes from get_padding.
+            return weight_norm(
+                Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    1,
+                    dilation=rate,
+                    padding=get_padding(kernel_size, rate)))
+
+        self.convs = nn.ModuleList(
+            [_make_conv(dilation[0]), _make_conv(dilation[1])])
+        self.convs.apply(init_weights)
+
+    def forward(self, x):
+        """Apply each conv after a leaky ReLU and add the result to its input."""
+        for conv in self.convs:
+            x = conv(F.leaky_relu(x, LRELU_SLOPE)) + x
+        return x
+
+    def remove_weight_norm(self):
+        """Strip weight normalisation from every conv in the block."""
+        for layer in self.convs:
+            remove_weight_norm(layer)
+
+
+class Generator(torch.nn.Module):
+    """Decoder: 512-channel input -> transposed-conv stages -> 1-channel tanh output."""
+
+    def __init__(self, h):
+        super(Generator, self).__init__()
+        self.h = h
+        self.num_kernels = len(h.resblock_kernel_sizes)
+        self.num_upsamples = len(h.upsample_rates)
+        base_channels = h.upsample_initial_channel
+        self.conv_pre = weight_norm(
+            Conv1d(512, base_channels, 7, 1, padding=3))
+        block_cls = ResBlock1 if h.resblock == '1' else ResBlock2
+
+        # One transposed conv per (rate, kernel) pair, halving channels each time.
+        self.ups = nn.ModuleList()
+        stages = zip(h.upsample_rates, h.upsample_kernel_sizes)
+        for i, (u, k) in enumerate(stages):
+            up = ConvTranspose1d(
+                base_channels // (2**i),
+                base_channels // (2**(i + 1)),
+                k,
+                u,
+                padding=(k - u) // 2)
+            self.ups.append(weight_norm(up))
+
+        # num_kernels residual blocks follow every upsampling stage.
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = base_channels // (2**(i + 1))
+            for k, d in zip(h.resblock_kernel_sizes,
+                            h.resblock_dilation_sizes):
+                self.resblocks.append(block_cls(h, ch, k, d))
+
+        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
+        self.ups.apply(init_weights)
+        self.conv_post.apply(init_weights)
+
+    def forward(self, x):
+        """Upsample ``x`` stage by stage, averaging the residual blocks of each stage."""
+        x = self.conv_pre(x)
+        for i in range(self.num_upsamples):
+            x = self.ups[i](F.leaky_relu(x, LRELU_SLOPE))
+            first = i * self.num_kernels
+            xs = None
+            for j in range(self.num_kernels):
+                out = self.resblocks[first + j](x)
+                if xs is None:
+                    xs = out
+                else:
+                    xs += out
+            x = xs / self.num_kernels
+        x = self.conv_post(F.leaky_relu(x, LRELU_SLOPE))
+        return torch.tanh(x)
+
+    def remove_weight_norm(self):
+        """Strip weight normalisation from all convs of the generator."""
+        print('Removing weight norm...')
+        for up in self.ups:
+            remove_weight_norm(up)
+        for block in self.resblocks:
+            block.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+        remove_weight_norm(self.conv_post)
+
+
+class DiscriminatorP(torch.nn.Module):
+    """Discriminator that folds the signal into (frames, period) and applies 2-D convs."""
+
+    def __init__(self, period, kernel_size=5, stride=3,
+                 use_spectral_norm=False):
+        super(DiscriminatorP, self).__init__()
+        self.period = period
+        norm_f = spectral_norm if use_spectral_norm is not False else weight_norm
+
+        # Four strided convs over the frame axis, then one stride-1 conv.
+        widths = [1, 32, 128, 512, 1024]
+        layers = [
+            norm_f(
+                Conv2d(
+                    c_in,
+                    c_out, (kernel_size, 1), (stride, 1),
+                    padding=(get_padding(5, 1), 0)))
+            for c_in, c_out in zip(widths[:-1], widths[1:])
+        ]
+        layers.append(
+            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))))
+        self.convs = nn.ModuleList(layers)
+        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+    def forward(self, x):
+        """Return ``(flattened_score, feature_maps)`` for a (b, c, t) input."""
+        b, c, t = x.shape
+        remainder = t % self.period
+        if remainder != 0:
+            # Reflect-pad on the right so t becomes a multiple of the period.
+            n_pad = self.period - remainder
+            x = F.pad(x, (0, n_pad), "reflect")
+            t = t + n_pad
+        # 1d to 2d
+        x = x.view(b, c, t // self.period, self.period)
+
+        fmap = []
+        for layer in self.convs:
+            x = F.leaky_relu(layer(x), LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+
+        return torch.flatten(x, 1, -1), fmap
+
+
+class MultiPeriodDiscriminator(torch.nn.Module):
+    """Bank of ``DiscriminatorP`` modules with periods 2, 3, 5, 7 and 11."""
+
+    def __init__(self):
+        super(MultiPeriodDiscriminator, self).__init__()
+        self.discriminators = nn.ModuleList(
+            [DiscriminatorP(period) for period in (2, 3, 5, 7, 11)])
+
+    def forward(self, y, y_hat):
+        """Run every sub-discriminator on ``y`` and ``y_hat``.
+
+        Returns:
+            ``(y_d_rs, y_d_gs, fmap_rs, fmap_gs)``: per-discriminator scores
+            and feature maps for ``y`` (r) and ``y_hat`` (g).
+        """
+        y_d_rs, y_d_gs = [], []
+        fmap_rs, fmap_gs = [], []
+        for disc in self.discriminators:
+            score_r, maps_r = disc(y)
+            score_g, maps_g = disc(y_hat)
+            y_d_rs.append(score_r)
+            fmap_rs.append(maps_r)
+            y_d_gs.append(score_g)
+            fmap_gs.append(maps_g)
+
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+class DiscriminatorS(torch.nn.Module):
+    """1-D convolutional discriminator made of seven convs plus a 1-channel head."""
+
+    def __init__(self, use_spectral_norm=False):
+        super(DiscriminatorS, self).__init__()
+        norm_f = spectral_norm if use_spectral_norm is not False else weight_norm
+        # (in_channels, out_channels, kernel, stride, groups, padding)
+        layer_specs = [
+            (1, 128, 15, 1, 1, 7),
+            (128, 128, 41, 2, 4, 20),
+            (128, 256, 41, 2, 16, 20),
+            (256, 512, 41, 4, 16, 20),
+            (512, 1024, 41, 4, 16, 20),
+            (1024, 1024, 41, 1, 16, 20),
+            (1024, 1024, 5, 1, 1, 2),
+        ]
+        self.convs = nn.ModuleList([
+            norm_f(Conv1d(c_in, c_out, k, s, groups=g, padding=p))
+            for c_in, c_out, k, s, g, p in layer_specs
+        ])
+        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+    def forward(self, x):
+        """Return ``(flattened_score, feature_maps)`` for input ``x``."""
+        fmap = []
+        for layer in self.convs:
+            x = F.leaky_relu(layer(x), LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+
+        return torch.flatten(x, 1, -1), fmap
+
+
+class MultiScaleDiscriminator(torch.nn.Module):
+    """Three ``DiscriminatorS`` modules fed progressively average-pooled signals."""
+
+    def __init__(self):
+        super(MultiScaleDiscriminator, self).__init__()
+        # Only the first discriminator uses spectral normalisation.
+        self.discriminators = nn.ModuleList([
+            DiscriminatorS(use_spectral_norm=True),
+            DiscriminatorS(),
+            DiscriminatorS(),
+        ])
+        self.meanpools = nn.ModuleList(
+            [AvgPool1d(4, 2, padding=2) for _ in range(2)])
+
+    def forward(self, y, y_hat):
+        """Run every sub-discriminator on ``y`` and ``y_hat``.
+
+        Both signals are pooled once more before each discriminator after
+        the first.
+
+        Returns:
+            ``(y_d_rs, y_d_gs, fmap_rs, fmap_gs)``: per-discriminator scores
+            and feature maps for ``y`` (r) and ``y_hat`` (g).
+        """
+        y_d_rs, y_d_gs = [], []
+        fmap_rs, fmap_gs = [], []
+        for i, disc in enumerate(self.discriminators):
+            if i > 0:
+                pool = self.meanpools[i - 1]
+                y = pool(y)
+                y_hat = pool(y_hat)
+            score_r, maps_r = disc(y)
+            score_g, maps_g = disc(y_hat)
+            y_d_rs.append(score_r)
+            fmap_rs.append(maps_r)
+            y_d_gs.append(score_g)
+            fmap_gs.append(maps_g)
+
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+def feature_loss(fmap_r, fmap_g):
+    """Return twice the summed mean absolute difference of paired feature maps.
+
+    Args:
+        fmap_r: Iterable of per-discriminator feature-map lists.
+        fmap_g: Matching iterable for the second signal.
+    """
+    total = sum(
+        torch.mean(torch.abs(real_map - gen_map))
+        for real_maps, gen_maps in zip(fmap_r, fmap_g)
+        for real_map, gen_map in zip(real_maps, gen_maps))
+
+    return total * 2
+
+
+def discriminator_loss(disc_real_outputs, disc_generated_outputs):
+    """Least-squares discriminator loss over paired outputs.
+
+    Returns:
+        ``(loss, r_losses, g_losses)`` where ``loss`` sums
+        ``mean((1 - real)**2) + mean(generated**2)`` over all pairs and the
+        two lists hold the individual terms as Python floats.
+    """
+    total = 0
+    r_losses, g_losses = [], []
+    for real_out, gen_out in zip(disc_real_outputs, disc_generated_outputs):
+        real_term = torch.mean((1 - real_out)**2)
+        gen_term = torch.mean(gen_out**2)
+        total = total + (real_term + gen_term)
+        r_losses.append(real_term.item())
+        g_losses.append(gen_term.item())
+
+    return total, r_losses, g_losses
+
+
+def generator_loss(disc_outputs):
+    """Least-squares generator loss.
+
+    Returns:
+        ``(loss, gen_losses)`` where ``gen_losses`` holds
+        ``mean((1 - output)**2)`` per discriminator output and ``loss`` is
+        their sum.
+    """
+    gen_losses = [torch.mean((1 - dg)**2) for dg in disc_outputs]
+    loss = sum(gen_losses)
+
+    return loss, gen_losses
+
+
+class Encoder(torch.nn.Module):
+    """Encoder: 1-channel input -> strided conv stages -> 512-channel output."""
+
+    def __init__(self, h):
+        super(Encoder, self).__init__()
+        self.h = h
+        self.num_kernels = len(h.resblock_kernel_sizes)
+        self.num_upsamples = len(h.upsample_rates)
+        self.conv_pre = weight_norm(Conv1d(1, 32, 7, 1, padding=3))
+        self.normalize = nn.ModuleList()
+        block_cls = ResBlock1 if h.resblock == '1' else ResBlock2
+
+        # The generator's (rate, kernel) stages are used in reverse order,
+        # as strided convs that double the channel count each time.
+        self.ups = nn.ModuleList()
+        stages = list(
+            reversed(list(zip(h.upsample_rates, h.upsample_kernel_sizes))))
+        for i, (u, k) in enumerate(stages):
+            down = Conv1d(
+                32 * (2**i),
+                32 * (2**(i + 1)),
+                k,
+                u,
+                padding=((k - u) // 2))
+            self.ups.append(weight_norm(down))
+
+        # Each residual block is paired with its own GroupNorm.
+        self.resblocks = nn.ModuleList()
+        kernel_sizes = list(reversed(h.resblock_kernel_sizes))
+        dilation_sizes = list(reversed(h.resblock_dilation_sizes))
+        for i in range(len(self.ups)):
+            ch = 32 * (2**(i + 1))
+            for k, d in zip(kernel_sizes, dilation_sizes):
+                self.resblocks.append(block_cls(h, ch, k, d))
+                self.normalize.append(
+                    torch.nn.GroupNorm(ch // 16, ch, eps=1e-6, affine=True))
+        self.conv_post = Conv1d(512, 512, 3, 1, padding=1)
+        self.ups.apply(init_weights)
+        self.conv_post.apply(init_weights)
+
+    def forward(self, x):
+        """Encode ``x``; the running block sum is re-normalised after every block."""
+        x = self.conv_pre(x)
+        for i in range(self.num_upsamples):
+            x = self.ups[i](F.leaky_relu(x, LRELU_SLOPE))
+            xs = None
+            for j in range(self.num_kernels):
+                idx = i * self.num_kernels + j
+                out = self.resblocks[idx](x)
+                if xs is None:
+                    xs = out
+                else:
+                    xs += out
+                xs = self.normalize[idx](xs)
+            x = xs / self.num_kernels
+        # The final activation uses F.leaky_relu's default slope.
+        x = F.leaky_relu(x)
+        return self.conv_post(x)
+
+    def remove_weight_norm(self):
+        """Strip weight normalisation from the stage convs, blocks and conv_pre."""
+        print('Removing weight norm...')
+        for down in self.ups:
+            remove_weight_norm(down)
+        for block in self.resblocks:
+            block.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+
+
+class Quantizer_module(torch.nn.Module):
+    """Single codebook: maps each input row to its nearest embedding."""
+
+    def __init__(self, n_e, e_dim):
+        super(Quantizer_module, self).__init__()
+        self.embedding = nn.Embedding(n_e, e_dim)
+        bound = 1.0 / n_e
+        self.embedding.weight.data.uniform_(-bound, bound)
+
+    def forward(self, x):
+        """Return ``(z_q, min_indicies)`` for rows of ``x``.
+
+        ``min_indicies`` holds, per row, the index of the codebook entry with
+        the smallest squared Euclidean distance; ``z_q`` is that entry.
+        """
+        codebook = self.embedding.weight
+        # ||x - e||^2 expanded as ||x||^2 + ||e||^2 - 2 x.e
+        x_sq = torch.sum(x ** 2, 1, keepdim=True)
+        e_sq = torch.sum(codebook ** 2, 1)
+        cross = torch.matmul(x, codebook.T)
+        d = x_sq + e_sq - 2 * cross
+        min_indicies = torch.argmin(d, 1)
+        return self.embedding(min_indicies), min_indicies
+
+
+class Quantizer(torch.nn.Module):
+    """Two-layer residual quantizer with grouped codebooks over 512 channels."""
+
+    def __init__(self, h):
+        super(Quantizer, self).__init__()
+        assert 512 % h.n_code_groups == 0
+        group_dim = 512 // h.n_code_groups
+        # One codebook per channel group for the first quantization layer...
+        self.quantizer_modules = nn.ModuleList([
+            Quantizer_module(h.n_codes, group_dim)
+            for _ in range(h.n_code_groups)
+        ])
+        # ...and a second set for the residual layer.
+        self.quantizer_modules2 = nn.ModuleList([
+            Quantizer_module(h.n_codes, group_dim)
+            for _ in range(h.n_code_groups)
+        ])
+        self.h = h
+        self.codebook_loss_lambda = self.h.codebook_loss_lambda  # e.g., 1
+        self.commitment_loss_lambda = self.h.commitment_loss_lambda  # e.g., 0.25
+        self.residul_layer = 2
+        self.n_code_groups = h.n_code_groups
+
+    def for_one_step(self, xin, idx):
+        """Quantize ``xin`` with layer ``idx`` (0 selects the first codebook set).
+
+        Returns:
+            ``(z_q, loss, min_indicies)``: the straight-through quantized
+            tensor in the input layout, the weighted codebook + commitment
+            loss, and one index tensor per code group.
+        """
+        codebooks = (self.quantizer_modules
+                     if idx == 0 else self.quantizer_modules2)
+        xin = xin.transpose(1, 2)
+        flat = xin.reshape(-1, 512)
+        chunks = torch.split(flat, 512 // self.h.n_code_groups, dim=-1)
+
+        z_q = []
+        min_indicies = []
+        for chunk, codebook in zip(chunks, codebooks):
+            chunk_q, chunk_ids = codebook(chunk)
+            z_q.append(chunk_q)
+            min_indicies.append(chunk_ids)  # B * T,
+        z_q = torch.cat(z_q, -1).reshape(xin.shape)
+
+        codebook_term = torch.mean((z_q - xin.detach()) ** 2)
+        commitment_term = torch.mean((z_q.detach() - xin) ** 2)
+        loss = self.codebook_loss_lambda * codebook_term \
+            + self.commitment_loss_lambda * commitment_term
+
+        # Straight-through: values of z_q, gradients routed to xin.
+        z_q = xin + (z_q - xin).detach()
+        return z_q.transpose(1, 2), loss, min_indicies
+
+    def forward(self, xin):
+        """Residually quantize ``xin`` (B, C, T).
+
+        Returns:
+            ``(quantized_out, loss, all_indices)``: the sum of the per-layer
+            quantized tensors, the mean of the per-layer losses, and the
+            index tensors of all layers in order.
+        """
+        quantized_out = 0.0
+        residual = xin
+        layer_losses = []
+        all_indices = []
+        for layer in range(self.residul_layer):
+            quantized, layer_loss, indices = self.for_one_step(residual, layer)
+            residual = residual - quantized
+            quantized_out = quantized_out + quantized
+            all_indices.extend(indices)
+            layer_losses.append(layer_loss)
+        loss = torch.mean(torch.stack(layer_losses))
+        return quantized_out, loss, all_indices
+
+    def embed(self, x):
+        """Look up and sum the embeddings for code indices ``x``.
+
+        ``x`` is split along its last dimension into one slice per index
+        group: the first ``n_code_groups`` slices address the first codebook
+        set, the next ``n_code_groups`` the second.
+        """
+        # idx: N, T, 4
+        quantized_out = torch.tensor(0.0, device=x.device)
+        # Split the last dimension; each slice belongs to one index group.
+        x = torch.split(x, 1, 2)
+        for i in range(self.residul_layer):
+            if i == 0:
+                codebooks, offset = self.quantizer_modules, 0
+            else:
+                codebooks, offset = self.quantizer_modules2, self.n_code_groups
+            ret = [
+                codebooks[j].embedding(x[j + offset].squeeze(-1).long())
+                for j in range(self.n_code_groups)
+            ]
+            quantized_out = quantized_out + torch.cat(ret, -1)
+        return quantized_out.transpose(1, 2)  # N, C, T
--- a/examples/music_generation/inspiremusic/music_tokenizer/vqvae.py
+++ b/examples/music_generation/inspiremusic/music_tokenizer/vqvae.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+
+import torch
+import torch.nn as nn
+from inspiremusic.music_tokenizer.env import AttrDict
+from inspiremusic.music_tokenizer.models import Encoder
+from inspiremusic.music_tokenizer.models import Generator
+from inspiremusic.music_tokenizer.models import Quantizer
+
+
+class VQVAE(nn.Module):
+    """Wrapper that restores a quantizer, generator and optional encoder from a checkpoint."""
+
+    def __init__(self,
+                 config_path,
+                 ckpt_path,
+                 with_encoder=False):
+        """Build the sub-modules from a JSON config and load their weights.
+
+        Args:
+            config_path: Path of the JSON hyper-parameter file.
+            ckpt_path: Path of a checkpoint with ``'generator'`` and
+                ``'quantizer'`` state dicts (plus ``'encoder'`` when
+                ``with_encoder`` is true).
+            with_encoder: Also build and load the encoder; required before
+                calling :meth:`encode`.
+        """
+        super(VQVAE, self).__init__()
+        # map_location='cpu' lets GPU-saved checkpoints load on CPU-only
+        # hosts; load_state_dict copies the values into the modules anyway.
+        # NOTE: torch.load unpickles the file -- only load trusted checkpoints.
+        ckpt = torch.load(ckpt_path, map_location='cpu')
+        with open(config_path, encoding='utf-8') as f:
+            json_config = json.load(f)
+        self.h = AttrDict(json_config)
+        self.quantizer = Quantizer(self.h)
+        self.generator = Generator(self.h)
+        self.generator.load_state_dict(ckpt['generator'])
+        self.quantizer.load_state_dict(ckpt['quantizer'])
+        if with_encoder:
+            self.encoder = Encoder(self.h)
+            self.encoder.load_state_dict(ckpt['encoder'])
+
+    def forward(self, x):
+        """Decode code indices ``x`` (B, T, Nq) through the quantizer embeddings and generator."""
+        # x is the codebook
+        # x.shape (B, T, Nq)
+        quant_emb = self.quantizer.embed(x)
+        return self.generator(quant_emb)
+
+    def encode(self, x):
+        """Encode ``x`` into code indices.
+
+        A trailing singleton dimension on a 3-D input is dropped before a
+        channel dimension is inserted for the encoder. Requires the model to
+        have been built with ``with_encoder=True``.
+
+        Returns:
+            Tensor of indices stacked on the last dimension, one column per
+            codebook (shape: [N, T, 4] per the original note).
+        """
+        batch_size = x.size(0)
+        if len(x.shape) == 3 and x.shape[-1] == 1:
+            x = x.squeeze(-1)
+        c = self.encoder(x.unsqueeze(1))
+        q, loss_q, c = self.quantizer(c)
+        c = [code.reshape(batch_size, -1) for code in c]
+        # shape: [N, T, 4]
+        return torch.stack(c, -1)
--- a/examples/music_generation/inspiremusic/text/__pycache__/abs_tokenizer.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/text/__pycache__/abs_tokenizer.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/text/__pycache__/tokenizer.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/text/__pycache__/tokenizer.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/text/abs_tokenizer.py
+++ b/examples/music_generation/inspiremusic/text/abs_tokenizer.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC
+from abc import abstractmethod
+from typing import Iterable
+from typing import List
+
+
+class AbsTokenizer(ABC):
+    """Abstract tokenizer interface: subclasses convert between text and tokens."""
+
+    @abstractmethod
+    def text2tokens(self, line: str) -> List[str]:
+        """Split ``line`` into tokens; must be implemented by subclasses."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def tokens2text(self, tokens: Iterable[str]) -> str:
+        """Join ``tokens`` back into text; must be implemented by subclasses."""
+        raise NotImplementedError
+
+    def encode(self, line: str, **kwargs) -> List[str]:
+        """Tokenize ``line`` via :meth:`text2tokens`; extra keyword arguments are ignored."""
+        tokens = self.text2tokens(line)
+        return tokens
\ No newline at end of file
--- a/examples/music_generation/inspiremusic/text/tokenizer.py
+++ b/examples/music_generation/inspiremusic/text/tokenizer.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import os
+import re
+from typing import Iterable, List, Union
+import numpy as np
+import torch
+
+from inspiremusic.text.abs_tokenizer import AbsTokenizer
+from transformers import AutoTokenizer
+
+def get_tokenizer(tokenizer_name, tokenizer_path):
+    """Build the text tokenizer selected by ``tokenizer_name``.
+
+    The name is matched case-insensitively, in line with
+    ``get_qwen_vocab_size`` in this module, so that e.g. ``"Qwen2.5"`` and
+    ``"qwen2.5"`` select the same tokenizer.
+
+    Args:
+        tokenizer_name: Identifier of the tokenizer family; any name that
+            contains ``"qwen"`` (in any letter case) selects the Qwen
+            tokenizer.
+        tokenizer_path: Path handed to ``QwenTokenizer`` for loading.
+
+    Returns:
+        A ``QwenTokenizer`` with ``skip_special_tokens=True``, or ``None``
+        when the name does not refer to a supported tokenizer.
+    """
+    if "qwen" in tokenizer_name.lower():
+        return QwenTokenizer(tokenizer_path, skip_special_tokens=True)
+    # Unknown families keep the original contract of returning None.
+    return None
+
+class QwenTokenizer(AbsTokenizer):
+    """Tokenizer backed by a pretrained ``AutoTokenizer`` plus extra special tokens."""
+
+    def __init__(
+            self,
+            token_path: str,
+            skip_special_tokens: bool = True,
+    ):
+        """Load the pretrained tokenizer and register the special tokens.
+
+        Args:
+            token_path: Location passed to ``AutoTokenizer.from_pretrained``.
+            skip_special_tokens: Whether :meth:`tokens2text` drops special
+                tokens when decoding.
+        """
+        super().__init__()
+        self.skip_special_tokens = skip_special_tokens
+
+        # NOTE: non-chat model, so all of these special tokens stay randomly initialized.
+        extra_tokens = [
+            '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
+            '[breath]', '<strong>', '</strong>', '[noise]',
+            '[laughter]', '[cough]', '[clucking]', '[accent]',
+            '[quick_breath]',
+        ]
+        self.tokenizer = AutoTokenizer.from_pretrained(token_path)
+        self.tokenizer.add_special_tokens({
+            'eos_token': '<|endoftext|>',
+            'pad_token': '<|endoftext|>',
+            'additional_special_tokens': extra_tokens,
+        })
+
+    def get_vocab_size(self):
+        """Return the wrapped tokenizer's ``vocab_size`` attribute.
+
+        NOTE(review): for Hugging Face tokenizers this attribute presumably
+        does not count the special tokens added in ``__init__`` -- confirm
+        before using it to size an embedding table.
+        """
+        return self.tokenizer.vocab_size
+
+    def text2tokens(self, line: str) -> List:
+        """Tokenize ``line`` and return its input ids as a plain Python list."""
+        # Encode as a batch of one, then take the single row of ids.
+        encoded = self.tokenizer([line], return_tensors="pt")
+        return encoded["input_ids"][0].cpu().tolist()
+
+    def tokens2text(self, tokens) -> str:
+        """Decode a sequence of token ids back into text.
+
+        Special tokens are dropped or kept according to
+        ``self.skip_special_tokens``.
+        """
+        ids = torch.tensor(tokens, dtype=torch.int64)
+        decoded = self.tokenizer.batch_decode([ids], skip_special_tokens=self.skip_special_tokens)
+        return decoded[0]
+
+
+
+def get_qwen_vocab_size(token_type: str):
+    """Return the model vocabulary size for a supported Qwen tokenizer type.
+
+    Args:
+        token_type: Tokenizer identifier; matched case-insensitively against
+            ``qwen1.5``, ``qwen2.0`` and ``qwen2.5``.
+
+    Returns:
+        ``151643 + 293``: the tokenizer vocabulary plus the slots reserved
+        for special and extra tokens.
+
+    Raises:
+        ValueError: If ``token_type`` names none of the supported families.
+    """
+    normalized = token_type.lower()
+    supported = ("qwen1.5", "qwen2.0", "qwen2.5")
+    if not any(family in normalized for family in supported):
+        raise ValueError(f"Unknown tokenizer {token_type}")
+
+    # model.vocab_size = 151936, tokenizer.vocab_size = 151643
+    tokenizer_vocab = 151643
+    # 293 slots for special and extra tokens, including endoftext, im_start,
+    # im_end, endofprompt and others added in the future.
+    # NOTE: the first three special tokens (endoftext, im_start, im_end) are
+    # trained in the Chat series models; the others stay randomly initialized.
+    reserved_slots = 293
+    return tokenizer_vocab + reserved_slots
\ No newline at end of file
--- a/examples/music_generation/inspiremusic/transformer/__init__.py
+++ b/examples/music_generation/inspiremusic/transformer/__init__.py
--- a/examples/music_generation/inspiremusic/transformer/__pycache__/__init__.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/transformer/__pycache__/__init__.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/transformer/__pycache__/activation.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/transformer/__pycache__/activation.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/transformer/__pycache__/attention.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/transformer/__pycache__/attention.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/transformer/__pycache__/convolution.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/transformer/__pycache__/convolution.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/transformer/__pycache__/embedding.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/transformer/__pycache__/embedding.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/transformer/__pycache__/encoder.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/transformer/__pycache__/encoder.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/transformer/__pycache__/encoder_layer.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/transformer/__pycache__/encoder_layer.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/transformer/__pycache__/label_smoothing_loss.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/transformer/__pycache__/label_smoothing_loss.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/transformer/__pycache__/positionwise_feed_forward.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/transformer/__pycache__/positionwise_feed_forward.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/transformer/__pycache__/qwen_encoder.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/transformer/__pycache__/qwen_encoder.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/transformer/__pycache__/subsampling.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/transformer/__pycache__/subsampling.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/transformer/activation.py
+++ b/examples/music_generation/inspiremusic/transformer/activation.py
+# Copyright (c) 2020 Johns Hopkins University (Shinji Watanabe)
+#               2020 Northwestern Polytechnical University (Pengcheng Guo)
+#               2020 Mobvoi Inc (Binbin Zhang)
+#               2024 Alibaba Inc (Xiang Lyu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Swish() activation function for Conformer."""
+
+import torch
+from torch import nn, sin, pow
+from torch.nn import Parameter
+
+
+class Swish(torch.nn.Module):
+    """Swish activation module: ``x * sigmoid(x)``, applied elementwise."""
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply the Swish activation to ``x`` and return the result."""
+        gate = torch.sigmoid(x)
+        return x * gate
+
+
+# Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
+#   LICENSE is in incl_licenses directory.
+class Snake(nn.Module):
+    '''
+    Sine-based periodic activation with one frequency parameter per channel.
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    Parameters:
+        - alpha - per-channel frequency parameter (trainable by default)
+    References:
+        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+        https://arxiv.org/abs/2006.08195
+    Examples:
+        >>> a1 = Snake(256)
+        >>> x = torch.randn(4, 256, 100)
+        >>> x = a1(x)
+    '''
+    def __init__(self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False):
+        '''
+        Set up the activation.
+        INPUT:
+            - in_features: size of the alpha parameter (one value per channel)
+            - alpha: initial scale; in linear mode alpha starts at this value,
+            higher values = higher-frequency.
+            - alpha_trainable: whether alpha receives gradients
+            - alpha_logscale: if True, alpha is stored in log space (starting
+            at zeros) and exponentiated in forward()
+        '''
+        super().__init__()
+        self.in_features = in_features
+        self.alpha_logscale = alpha_logscale
+
+        # Log-scale alphas start from zeros, linear-scale alphas from ones;
+        # both are then multiplied by the requested initial value.
+        base = torch.zeros(in_features) if alpha_logscale else torch.ones(in_features)
+        self.alpha = Parameter(base * alpha)
+        self.alpha.requires_grad = alpha_trainable
+
+        # Small constant keeping the 1/alpha term finite when alpha is zero.
+        self.no_div_by_zero = 1e-9
+
+    def forward(self, x):
+        '''
+        Apply the activation elementwise.
+        Snake := x + 1/a * sin^2 (xa)
+        '''
+        scale = self.alpha.unsqueeze(0)
+        scale = scale.unsqueeze(-1)  # line up with x laid out as [B, C, T]
+        if self.alpha_logscale:
+            scale = torch.exp(scale)
+        inv_scale = 1.0 / (scale + self.no_div_by_zero)
+        periodic = pow(sin(x * scale), 2)
+        return x + inv_scale * periodic
\ No newline at end of file