v1.0

0112b0f0 · chenzk · 0112b0f0 · 0112b0f0 · 0112b0f0 · 0112b0f0
Commit 0112b0f0 authored Feb 14, 2025 by chenzk
20 changed files
--- a/inspiremusic/metrics/openl3_fd.py
+++ b/inspiremusic/metrics/openl3_fd.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import openl3
+import librosa
+import numpy as np
+from scipy import linalg
+import glob
+from tqdm import tqdm
+import os
+import soxr
+import pyloudnorm as pyln
+
+
+def calculate_embd_statistics(embd_lst):
+    """
+    Compute the mean vector and the covariance matrix of a set of embeddings.
+
+    Params:
+    -- embd_lst: list of embeddings (or an array); each row is one observation.
+    Returns:
+    -- (mu, sigma): mean along axis 0 and covariance between columns.
+    """
+    samples = np.array(embd_lst) if isinstance(embd_lst, list) else embd_lst
+    return np.mean(samples, axis=0), np.cov(samples, rowvar=False)
+
+
+def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
+    """
+    Adapted from: https://github.com/mseitzer/pytorch-fid/blob/master/src/pytorch_fid/fid_score.py
+    Adapted from: https://github.com/gudgud96/frechet-audio-distance/blob/main/frechet_audio_distance/fad.py
+
+    Numpy implementation of the Frechet Distance between two multivariate
+    Gaussians X_1 ~ N(mu_1, C_1) and X_2 ~ N(mu_2, C_2):
+            d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
+
+    Params:
+    -- mu1: Embedding's mean statistics for generated samples.
+    -- sigma1: Covariance matrix over embeddings for generated samples.
+    -- mu2: Embedding's mean statistics for reference samples.
+    -- sigma2: Covariance matrix over embeddings for reference samples.
+    -- eps: value added to the diagonal of both covariances when the matrix
+            square root of their product is not finite.
+    Returns:
+    --  Fréchet Distance.
+    Raises:
+    -- AssertionError if the means or the covariances have different shapes.
+    -- ValueError if the matrix square root has a non-negligible imaginary diagonal.
+    """
+    mu1, mu2 = np.atleast_1d(mu1), np.atleast_1d(mu2)
+    sigma1, sigma2 = np.atleast_2d(sigma1), np.atleast_2d(sigma2)
+
+    assert mu1.shape == mu2.shape, \
+        'Training and test mean vectors have different lengths'
+    assert sigma1.shape == sigma2.shape, \
+        'Training and test covariances have different dimensions'
+
+    mean_delta = mu1 - mu2
+
+    # the product of the covariances might be almost singular
+    covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
+    if not np.isfinite(covmean).all():
+        print(('fid calculation produces singular product; '
+            'adding %s to diagonal of cov estimates') % eps)
+        jitter = np.eye(sigma1.shape[0]) * eps
+        covmean = linalg.sqrtm((sigma1 + jitter).dot(sigma2 + jitter))
+
+    # numerical error might leave a slight imaginary component: drop it if negligible
+    if np.iscomplexobj(covmean):
+        if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
+            raise ValueError('Imaginary component {}'.format(np.max(np.abs(covmean.imag))))
+        covmean = covmean.real
+
+    return (mean_delta.dot(mean_delta) + np.trace(sigma1)
+            + np.trace(sigma2) - 2 * np.trace(covmean))
+
+
+def extract_embeddings(directory_path, channels, samplingrate, content_type, openl3_hop_size, batch_size=16):
+    """
+    Given a list of files, compute their embeddings in batches.
+
+    If channels == 1: audio is handed to openl3 as loaded, embeddings are of dim=512.
+    (NOTE(review): stereo files are expected to be downmixed by openl3 itself — confirm.)
+
+    If channels == 2: mono audio is "faked" to stereo by copying the mono channel.
+    Stereo embeddings are of dim=1024, since we concatenate L (dim=512) and R (dim=512) embeddings.
+
+    Params:
+    -- directory_path: glob pattern of the audio files, or path to a '.scp' file
+                       with one "<id> <audio path>" entry per line.
+    -- channels: 1 (mono), or 2 (stereo) to get mono or stereo embeddings.
+    -- samplingrate: max bandwidth at which we evaluate the given signals. Up to 48kHz.
+    -- content_type: 'music' or 'env' to select a content type specific openl3 model.
+    -- openl3_hop_size: analysis resolution of openl3 in seconds. Openl3's input window is 1 sec.
+    -- batch_size: number of audio files to process in each batch.
+    Returns:
+    -- list of embeddings: [np.array[], ...], as expected by calculate_embd_statistics()
+    Raises:
+    -- ValueError if channels is not 1 or 2, if an .scp line has no path field,
+       or if no audio file is found.
+    """
+    # fail early: any other value would leave the batches empty further down
+    if channels not in (1, 2):
+        raise ValueError('channels must be 1 (mono) or 2 (stereo)')
+
+    _, extension = os.path.splitext(directory_path)
+    if extension.lower() == ".scp":
+        wav_files = []
+        with open(directory_path, "r") as f:
+            for line in f:
+                fields = line.split()
+                if not fields:
+                    # tolerate blank lines (e.g. a trailing newline)
+                    continue
+                if len(fields) < 2:
+                    raise ValueError('Malformed .scp line, expected "<id> <path>": ' + line.strip())
+                wav_files.append(fields[1])
+    else:
+        wav_files = glob.glob(directory_path)
+    if len(wav_files) == 0:
+        raise ValueError('No files with this extension in this path!')
+    model = openl3.models.load_audio_embedding_model(input_repr="mel256", content_type=content_type, embedding_size=512)
+
+    # one array per batch; joined once at the end instead of re-copying on every batch
+    batch_embeddings = []
+    for i in tqdm(range(0, len(wav_files), batch_size)):
+        batch_files = wav_files[i:i+batch_size]
+        batch_audio_l = []
+        batch_audio_r = []
+        batch_sr = []
+
+        for file in batch_files:
+            audio, sr = librosa.load(file, sr=None, mono=False)
+            # librosa returns (channels, samples); the rest of the code indexes (samples, channels)
+            audio = audio.T
+            audio = pyln.normalize.peak(audio, -1.0)
+            if audio.shape[0] < sr:
+                print('Audio shorter than 1 sec, openl3 will zero-pad it:', file, audio.shape, sr)
+
+            # resample to the desired evaluation bandwidth
+            audio = soxr.resample(audio, sr, samplingrate) # mono/stereo <- mono/stereo, input sr, output sr
+
+            # mono embeddings are stored in batch_audio_l (R channel not used)
+            if channels == 1:
+                batch_audio_l.append(audio)
+
+            elif channels == 2:
+                if audio.ndim == 1:
+                    # if mono, "fake" stereo by copying mono channel to L and R
+                    batch_audio_l.append(audio)
+                    batch_audio_r.append(audio)
+                elif audio.ndim == 2:
+                    # if it's stereo separate channels for openl3
+                    batch_audio_l.append(audio[:,0])
+                    batch_audio_r.append(audio[:,1])
+
+            batch_sr.append(samplingrate)
+
+        # extracting mono embeddings (dim=512) or the L channel for stereo embeddings
+        emb, _ = openl3.get_audio_embedding(batch_audio_l, batch_sr, model=model, verbose=False, hop_size=openl3_hop_size, batch_size=batch_size)
+
+        # format mono embedding
+        if channels == 1:
+            emb = np.concatenate(emb,axis=0)
+
+        # extracting stereo embeddings (dim=1024), since we concatenate L (dim=512) and R (dim=512) embeddings
+        else:
+            # extract the missing R channel
+            emb_r, _ = openl3.get_audio_embedding(batch_audio_r, batch_sr, model=model, verbose=False, hop_size=openl3_hop_size, batch_size=batch_size)
+            emb = [np.concatenate([l, r], axis=1) for l, r in zip(emb, emb_r)]
+            emb = np.concatenate(emb, axis=0)
+
+        batch_embeddings.append(emb)
+
+    embeddings = np.concatenate(batch_embeddings, axis=0)
+
+    # return as a list of embeddings: [np.array[], ...]
+    return list(embeddings)
+
+
+def extract_embeddings_nobatching(directory_path, channels, samplingrate, content_type, openl3_hop_size):
+    """
+    Given a list of files, compute their embeddings one by one.
+
+    If channels == 1: stereo audio is downmixed to mono when loading. Mono embeddings are of dim=512.
+
+    If channels == 2: mono audio is "faked" to stereo by copying the mono embedding.
+    Stereo embeddings are of dim=1024, since we concatenate L (dim=512) and R (dim=512) embeddings.
+
+    Params:
+    -- directory_path: glob pattern of the audio files, or path to a '.scp' file
+                       with one "<id> <audio path>" entry per line.
+    -- channels: 1 (mono), or 2 (stereo) to get mono or stereo embeddings.
+    -- samplingrate: max bandwidth at which we evaluate the given signals. Up to 48kHz.
+    -- content_type: 'music' or 'env' to select a content type specific openl3 model.
+    -- openl3_hop_size: analysis resolution of openl3 in seconds. Openl3's input window is 1 sec.
+    Returns:
+    -- list of embeddings: [np.array[], ...], as expected by calculate_embd_statistics()
+    Raises:
+    -- ValueError if channels is not 1 or 2, if an .scp line has no path field,
+       or if no audio file is found.
+    """
+    # fail early: any other value would leave `emb` undefined further down
+    if channels not in (1, 2):
+        raise ValueError('channels must be 1 (mono) or 2 (stereo)')
+
+    _, extension = os.path.splitext(directory_path)
+    if extension.lower() == ".scp":
+        wav_files = []
+        with open(directory_path, "r") as f:
+            for line in f:
+                fields = line.split()
+                if not fields:
+                    # tolerate blank lines (e.g. a trailing newline)
+                    continue
+                if len(fields) < 2:
+                    raise ValueError('Malformed .scp line, expected "<id> <path>": ' + line.strip())
+                wav_files.append(fields[1])
+    else:
+        wav_files = glob.glob(directory_path)
+    if len(wav_files) == 0:
+        raise ValueError('No files with this extension in this path!')
+    model = openl3.models.load_audio_embedding_model(input_repr="mel256", content_type=content_type, embedding_size=512)
+
+    # one array per file; joined once at the end instead of re-copying on every file
+    file_embeddings = []
+    for file in tqdm(wav_files):
+        # librosa downmixes to mono by default, which made the stereo branch below
+        # unreachable: keep the channels whenever stereo embeddings are requested
+        audio, sr = librosa.load(file, sr=None, mono=(channels == 1))
+        # librosa returns (channels, samples); the code below indexes (samples, channels)
+        audio = audio.T
+        audio = pyln.normalize.peak(audio, -1.0)
+        if audio.shape[0] < sr:
+            print('Audio shorter than 1 sec, openl3 will zero-pad it:', file, audio.shape, sr)
+
+        # resample to the desired evaluation bandwidth
+        audio = soxr.resample(audio, sr, samplingrate) # mono/stereo <- mono/stereo, input sr, output sr
+
+        # extracting stereo embeddings (dim=1024), since we concatenate L (dim=512) and R (dim=512) embeddings
+        if channels == 2:
+            if audio.ndim == 1:
+                audio_l3, sr_l3 = audio, samplingrate
+            elif audio.ndim == 2:
+                # if it's stereo separate channels for openl3
+                audio_l3 = [audio[:,0], audio[:,1]]
+                sr_l3 = [samplingrate, samplingrate]
+            emb, _ = openl3.get_audio_embedding(audio_l3, sr_l3, model=model, verbose=False, hop_size=openl3_hop_size)
+            if audio.ndim == 1:
+                # if mono audio, "fake" stereo by concatenating mono embedding as L and R embeddings
+                emb = np.concatenate([emb, emb],axis=1)
+            elif audio.ndim == 2:
+                emb = np.concatenate(emb,axis=1)
+
+        # or extracting mono embeddings (dim=512)
+        else:
+            emb, _ = openl3.get_audio_embedding(audio, samplingrate, model=model, verbose=False, hop_size=openl3_hop_size)
+
+        file_embeddings.append(emb)
+
+    embeddings = np.concatenate(file_embeddings, axis=0)
+
+    # return as a list of embeddings: [np.array[], ...]
+    return list(embeddings)
+
+
+def openl3_fd(channels, samplingrate, content_type, openl3_hop_size, eval_path, 
+              eval_files_extension='.wav', ref_path=None, ref_files_extension='.wav', load_ref_embeddings=None, batching=False):
+    """
+    Compute the Fréchet Distance between files in eval_path and ref_path.
+
+    Fréchet distance computed on top of openl3 embeddings.
+
+    GPU-based computation.
+
+    Extracting the embeddings is time-consuming. After being computed once, we store them.
+    We store pre-computed reference embedding statistics in load/openl3_fd/
+    To load those and save computation, just set the path in load_ref_embeddings.
+    If load_ref_embeddings is set, ref_path is not required.
+
+    Params:
+    -- channels: 1 (mono), or 2 (stereo) to get the Fréchet Distance over mono or stereo embeddings.
+    -- samplingrate: max bandwidth at which we evaluate the given signals. Up to 48kHz.
+    -- content_type: 'music' or 'env' to select a content type for openl3.
+    -- openl3_hop_size: analysis resolution of openl3 in seconds. Openl3's input window is 1 sec.
+    -- eval_path: directory where the generated audio files to evaluate are available.
+    -- eval_files_extension: files extension (default .wav) in eval_path.
+    -- ref_path: directory where the reference audio files are available, or a single
+                 file that is handed as-is to the embedding extractor. (instead of load_ref_embeddings)
+    -- ref_files_extension: files extension (default .wav) in ref_path.
+    -- load_ref_embeddings: path to the reference embedding statistics. (instead of ref_path)
+    -- batching: set batch size (with an int) or set to False (default False).
+    Returns:
+    -- Fréchet distance.
+    Raises:
+    -- ValueError if eval_path is not a directory, if the reference location does
+       not exist, or if neither ref_path nor load_ref_embeddings is given.
+    """
+
+    if not os.path.isdir(eval_path):
+        raise ValueError('eval_path does not exist')
+
+    if load_ref_embeddings:
+        if not os.path.exists(load_ref_embeddings):
+            raise ValueError('load_ref_embeddings does not exist')
+        print('[LOADING REFERENCE EMBEDDINGS] ', load_ref_embeddings)
+        # context manager so the archive's file handle is released once both arrays are read
+        with np.load(load_ref_embeddings) as loaded:
+            mu_ref = loaded['mu_ref']
+            sigma_ref = loaded['sigma_ref']
+
+    else:
+        if ref_path:
+            ref_is_file = os.path.isfile(ref_path)
+            if not ref_is_file and not os.path.isdir(ref_path):
+                raise ValueError("ref_path does not exist")
+            if ref_is_file:
+                path = ref_path
+            else:
+                path = os.path.join(ref_path, '*'+ref_files_extension)
+            print('[EXTRACTING REFERENCE EMBEDDINGS] ', path)
+            if batching:
+                ref_embeddings = extract_embeddings(path, channels, samplingrate, content_type, openl3_hop_size, batch_size=batching)
+            else:
+                ref_embeddings = extract_embeddings_nobatching(path, channels, samplingrate, content_type, openl3_hop_size)
+            mu_ref, sigma_ref = calculate_embd_statistics(ref_embeddings)
+
+            # store statistics to load later on
+            # (exist_ok avoids a crash when another process creates the directory in between)
+            os.makedirs('load/openl3_fd/', exist_ok=True)
+            save_ref_embeddings_path = (
+                'load/openl3_fd/' +
+                path.replace('/', '_') +
+                '__channels' + str(channels) +
+                '__' + str(samplingrate) +
+                '__openl3' + str(content_type) +
+                '__openl3hopsize' + str(openl3_hop_size) +
+                '__batch' + str(batching) +
+                '.npz'
+            )
+            np.savez(save_ref_embeddings_path, mu_ref=mu_ref, sigma_ref=sigma_ref)
+            print('[REFERENCE EMBEDDINGS][SAVED] ', save_ref_embeddings_path)
+
+        else:
+            raise ValueError('Must specify ref_path or load_ref_embeddings')
+
+    path = os.path.join(eval_path, '*'+eval_files_extension)
+    print('[EXTRACTING EVALUATION EMBEDDINGS] ', path)
+    if batching:
+        eval_embeddings = extract_embeddings(path, channels, samplingrate, content_type, openl3_hop_size, batch_size=batching)
+    else:
+        eval_embeddings = extract_embeddings_nobatching(path, channels, samplingrate, content_type, openl3_hop_size)
+    mu_eval, sigma_eval = calculate_embd_statistics(eval_embeddings)
+
+    fd = calculate_frechet_distance(mu_eval, sigma_eval, mu_ref, sigma_ref)
+    if load_ref_embeddings:
+        print('[FRéCHET DISTANCE] ', eval_path, load_ref_embeddings, fd)
+    else:
+        print('[FRéCHET DISTANCE] ', eval_path, ref_path, fd)
+
+    return fd
\ No newline at end of file
--- a/inspiremusic/metrics/passt_kld.py
+++ b/inspiremusic/metrics/passt_kld.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
+
+import os
+import contextlib
+from functools import partial
+from tqdm import tqdm
+import pickle
+import numpy as np
+import librosa
+from hear21passt.base import get_basic_model
+import pyloudnorm as pyln
+
+import torch
+import torch.nn.functional as F
+
+
+# Sampling rate (in Hz) at which return_probabilities() loads the audio with librosa.
+SAMPLING_RATE = 32000
+
+
+class _patch_passt_stft:
+    """
+    Context manager that temporarily replaces torch.stft with a version that
+    defaults to return_complex=False, for PaSST code written against an old stft.
+
+    From version 1.8.0, return_complex must always be given explicitly
+    for real inputs and return_complex=False has been deprecated.
+
+    Adapted from: https://github.com/facebookresearch/audiocraft/blob/a2b96756956846e194c9255d0cdadc2b47c93f1b/audiocraft/metrics/kld.py
+    """
+    def __init__(self):
+        # remember the function found at construction time, to put it back on exit
+        self.old_stft = torch.stft
+
+    def __enter__(self):
+        # return_complex is a mandatory parameter in latest torch versions.
+        # torch is throwing RuntimeErrors when not set.
+        # see: https://pytorch.org/docs/1.7.1/generated/torch.stft.html?highlight=stft#torch.stft
+        # see: https://github.com/kkoutini/passt_hear21/commit/dce83183674e559162b49924d666c0a916dc967a
+        legacy_stft = partial(torch.stft, return_complex=False)
+        torch.stft = legacy_stft
+
+    def __exit__(self, *exc):
+        # restore the original function, whether or not the body raised
+        torch.stft = self.old_stft
+
+
+def return_probabilities(model, audio_path, window_size=10, overlap=5, collect='mean'):
+    """
+    Given an audio and the PaSST model, return the probabilities of each AudioSet class.
+
+    Audio is converted to mono at 32kHz.
+
+    PaSST model is trained with 10 sec inputs. We refer to this parameter as the window_size.
+    We set it to 10 sec for consistency with PaSST training.
+
+    For longer audios, we split audio into overlapping analysis windows of window_size and overlap of 10 and 5 seconds.
+    PaSST supports 10, 20 or 30 sec inputs. Not longer inputs: https://github.com/kkoutini/PaSST/issues/19
+
+    Note that AudioSet taggers normally use sigmoid output layers. Yet, to compute the
+    KL we work with normalized probabilities by running a softmax over logits as in MusicGen:
+    https://github.com/facebookresearch/audiocraft/blob/a2b96756956846e194c9255d0cdadc2b47c93f1b/audiocraft/metrics/kld.py
+
+    This implementation assumes run will be on GPU.
+
+    Params:
+    -- model: PaSST model on a GPU.
+    -- audio_path: path to the audio to be loaded with librosa.
+    -- window_size (default=10 sec): analysis window (and receptive field) of PaSST.
+    -- overlap (default=5 sec): overlap of the running analysis window for inputs longer than window_size (10 sec).
+    -- collect (default='mean'): for longer inputs, aggregate/collect via 'mean' or 'max' pooling along logits vector.
+    Returns:
+    --  probabilities of the model classes (after softmax, no logarithm), on CPU.
+    Raises:
+    -- ValueError if collect is neither 'mean' nor 'max', or if overlap >= window_size.
+    """
+    # an unknown pooling mode used to skip the pooling and softmax across windows
+    if collect not in ('mean', 'max'):
+        raise ValueError("collect must be 'mean' or 'max'")
+    # the analysis window must advance, otherwise no window is ever produced
+    if overlap >= window_size:
+        raise ValueError('overlap must be smaller than window_size')
+
+    # load the audio using librosa
+    audio, _ = librosa.load(audio_path, sr=SAMPLING_RATE, mono=True)
+    audio = pyln.normalize.peak(audio, -1.0)
+
+    window_len = int(window_size * SAMPLING_RATE)
+    # windows at or below this length are not zero-padded
+    min_pad_len = int(window_size * SAMPLING_RATE * 0.15)
+
+    # calculate the step size for the analysis windows with the specified overlap
+    step_size = int((window_size - overlap) * SAMPLING_RATE)
+
+    # iterate over the audio, creating analysis windows
+    window_logits = []
+    for i in range(0, max(step_size, len(audio) - step_size), step_size):
+        # extract the current analysis window
+        window = audio[i:i + window_len]
+
+        # pad the window with zeros if it's shorter than the desired window size.
+        # Very short windows (<= 15% of window_size) are NOT padded, to avoid inputs that
+        # are mostly zeros being predicted as silence; they are fed to the model as they are.
+        # Threshold as in MusicGen:
+        # https://github.com/facebookresearch/audiocraft/blob/a2b96756956846e194c9255d0cdadc2b47c93f1b/audiocraft/metrics/kld.py
+        if min_pad_len < len(window) < window_len:
+            tmp = np.zeros(window_len)
+            tmp[:len(window)] = window
+            window = tmp
+
+        # convert to a PyTorch tensor and move to GPU
+        audio_wave = torch.from_numpy(window.astype(np.float32)).unsqueeze(0).cuda()
+
+        # get the logits for this analysis window (stdout of the model is silenced)
+        with open(os.devnull, 'w') as f, contextlib.redirect_stdout(f):
+            with torch.no_grad(), _patch_passt_stft():
+                logits = model(audio_wave)
+                window_logits.append(torch.squeeze(logits))
+
+    # pool the logits across windows, then normalize with a softmax
+    stacked = torch.stack(window_logits)
+    if collect == 'mean':
+        pooled = torch.mean(stacked, dim=0)
+    else:
+        pooled, _ = torch.max(stacked, dim=0)
+
+    return F.softmax(pooled, dim=0).squeeze().cpu()
+
+
+def passt_kld(ids, eval_path, eval_files_extension='.wav', ref_path=None, ref_files_extension='.wav', load_ref_probabilities=None, no_ids=None, collect='mean'):
+    """
+    Compute KL-divergence between the label probabilities of the generated audio with respect to the original audio.
+    Both generated audio (in eval_path) and original audio (in ref_path) are represented by the same prompt/description.
+    Audios are identified by an id, that is the name of the file in both directories and links the audio with the prompt/description.
+
+    For inputs longer than the 10 sec PaSST was trained on, we aggregate/collect via 'mean' (default) or 'max' pooling along the logits vector.
+    We split the input into overlapping analysis windows. Subsequently, we aggregate/collect (across windows) the generated logits and then apply a softmax.
+
+    This evaluation script assumes that ids are in both ref_path and eval_path.
+
+    We label probabilities via the PaSST model: https://github.com/kkoutini/PaSST
+
+    GPU-based computation.
+
+    Extracting the probabilities is time-consuming. After being computed once, we store them.
+    We store pre-computed reference probabilities in load/passt_kld/
+    To load those and save computation, just set the path in load_ref_probabilities.
+    If load_ref_probabilities is set, ref_path is not required.
+
+    Params:
+    -- ids: list of ids present in both eval_path and ref_path.
+    -- eval_path: directory with the generated audio files to evaluate (named <id><extension>),
+                  or a text file with one "<id> <audio path>" entry per line.
+    -- eval_files_extension: files extension (default .wav) in eval_path.
+    -- ref_path: directory with the reference audio files, or a text file with one
+                 "<id> <audio path>" entry per line. (instead of load_ref_probabilities)
+    -- ref_files_extension: files extension (default .wav) in ref_path.
+    -- load_ref_probabilities: path to the pickled reference probabilities. (instead of ref_path)
+    -- no_ids: ids to ignore, e.g. because the reference audio is corrupted or not present (default: none).
+    -- collect (default='mean'): for longer inputs, aggregate/collect via 'mean' or 'max' pooling along the logits vector.
+    Returns:
+    -- KL divergence averaged over the evaluated ids (0 if no id could be evaluated).
+    Raises:
+    -- ValueError if a given path does not exist, or if neither ref_path nor
+       load_ref_probabilities is given.
+    """
+    def _read_id_to_path(scp_path):
+        # parse "<id> <audio path>" lines; blank or incomplete lines are ignored
+        mapping = {}
+        with open(scp_path, "r") as scp:
+            for line in scp:
+                fields = line.split()
+                if len(fields) >= 2:
+                    mapping[fields[0]] = fields[1]
+        return mapping
+
+    skipped_ids = set(no_ids) if no_ids else set()
+
+    # cheap argument checks first, so that we fail before loading the model
+    if not os.path.isdir(eval_path) and not os.path.isfile(eval_path):
+        raise ValueError('eval_path does not exist')
+    if load_ref_probabilities:
+        if not os.path.exists(load_ref_probabilities):
+            raise ValueError('load_ref_probabilities does not exist')
+    elif ref_path:
+        if not os.path.isdir(ref_path) and not os.path.isfile(ref_path):
+            raise ValueError("ref_path does not exist")
+    else:
+        raise ValueError('Must specify ref_path or load_ref_probabilities')
+
+    with open(os.devnull, 'w') as f, contextlib.redirect_stdout(f): # capturing all useless outputs from passt
+        # load model
+        model = get_basic_model(mode="logits")
+        model.eval()
+        model = model.cuda()
+
+    if load_ref_probabilities:
+        print('[LOADING REFERENCE PROBABILITIES] ', load_ref_probabilities)
+        # NOTE: unpickling can execute arbitrary code; only load files you produced or trust
+        with open(load_ref_probabilities, 'rb') as fp:
+            ref_p = pickle.load(fp)
+
+    else:
+        # a file is read as an id -> audio path table, a directory is searched by file name
+        ref_id2path = _read_id_to_path(ref_path) if os.path.isfile(ref_path) else None
+        print('[EXTRACTING REFERENCE PROBABILITIES] ', ref_path)
+        ref_p = {}
+        for utt_id in tqdm(ids):
+            if utt_id in skipped_ids:
+                continue
+            try:
+                if ref_id2path is not None:
+                    if str(utt_id) not in ref_id2path:
+                        raise ValueError(f"id: {utt_id} not in {ref_path}!")
+                    audio_path = ref_id2path[str(utt_id)]
+                else:
+                    audio_path = os.path.join(ref_path, str(utt_id)+ref_files_extension)
+                if os.path.isfile(audio_path):
+                    ref_p[utt_id] = return_probabilities(model, audio_path, collect=collect)
+            except Exception as e:
+                # best effort: report the failing id and keep going with the others
+                print(f"An unexpected error occurred with {utt_id}: {e}\nIf you failed to download it you can add it to no_ids list.")
+
+        # store reference probabilities to load later on
+        # (exist_ok avoids a crash when another process creates the directory in between)
+        os.makedirs('load/passt_kld/', exist_ok=True)
+        save_ref_probabilities_path = 'load/passt_kld/'+ref_path.replace('/', '_')+'_collect'+str(collect)+'__reference_probabilities.pkl'
+        with open(save_ref_probabilities_path, 'wb') as fp:
+            pickle.dump(ref_p, fp)
+        print('[REFERENCE EMBEDDINGS][SAVED] ', save_ref_probabilities_path)
+
+    print('[EVALUATING GENERATIONS] ', eval_path)
+
+    # eval_path is accepted as a file above, so honour it the same way as ref_path
+    # instead of silently finding no audio and returning 0
+    eval_id2path = _read_id_to_path(eval_path) if os.path.isfile(eval_path) else None
+
+    passt_kl = 0
+    count = 0
+    for utt_id in tqdm(ids):
+        if utt_id in skipped_ids:
+            continue
+        try:
+            if eval_id2path is not None:
+                if str(utt_id) not in eval_id2path:
+                    raise ValueError(f"id: {utt_id} not in {eval_path}!")
+                audio_path = eval_id2path[str(utt_id)]
+            else:
+                audio_path = os.path.join(eval_path, str(utt_id)+eval_files_extension)
+            if os.path.isfile(audio_path):
+                eval_p = return_probabilities(model, audio_path, collect=collect)
+                # note: F.kl_div(x, y) is KL(y||x)
+                # see: https://github.com/pytorch/pytorch/issues/7337
+                # see: https://discuss.pytorch.org/t/kl-divergence-different-results-from-tf/56903/2
+                passt_kl += F.kl_div((ref_p[utt_id] + 1e-6).log(), eval_p, reduction='sum', log_target=False)
+                count += 1
+        except Exception as e:
+            # best effort: report the failing id and keep going with the others
+            print(f"An unexpected error occurred with {utt_id}: {e}\nIf you failed to download it you can add it to no_ids list.")
+    return passt_kl / count if count > 0 else 0
--- a/inspiremusic/music_tokenizer/__init__.py
+++ b/inspiremusic/music_tokenizer/__init__.py
--- a/inspiremusic/music_tokenizer/__pycache__/__init__.cpython-310.pyc
+++ b/inspiremusic/music_tokenizer/__pycache__/__init__.cpython-310.pyc
--- a/inspiremusic/music_tokenizer/__pycache__/env.cpython-310.pyc
+++ b/inspiremusic/music_tokenizer/__pycache__/env.cpython-310.pyc
--- a/inspiremusic/music_tokenizer/__pycache__/models.cpython-310.pyc
+++ b/inspiremusic/music_tokenizer/__pycache__/models.cpython-310.pyc
--- a/inspiremusic/music_tokenizer/__pycache__/vqvae.cpython-310.pyc
+++ b/inspiremusic/music_tokenizer/__pycache__/vqvae.cpython-310.pyc
--- a/inspiremusic/music_tokenizer/env.py
+++ b/inspiremusic/music_tokenizer/env.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import shutil
+
+
+class AttrDict(dict):
+    """Dictionary whose entries can also be read and written as attributes."""
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # the instance namespace IS the mapping: d.key and d['key'] share storage
+        self.__dict__ = self
+
+
+def build_env(config, config_name, path):
+    """
+    Copy the config file into the directory `path` under the name `config_name`.
+
+    Nothing is done when `config` is already spelled exactly as the target path;
+    otherwise `path` is created if needed and the file is copied.
+    """
+    target = os.path.join(path, config_name)
+    if config == target:
+        return
+    os.makedirs(path, exist_ok=True)
+    shutil.copyfile(config, target)
--- a/inspiremusic/music_tokenizer/meldataset.py
+++ b/inspiremusic/music_tokenizer/meldataset.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# code based on https://github.com/b04901014/MQTTS
+import math
+import os
+import random
+
+import librosa
+import numpy as np
+import torch.utils.data
+from librosa.filters import mel as librosa_mel_fn
+
+def load_wav(full_path, sr):
+    """Load the audio at full_path with librosa at rate sr; returns (waveform, sampling rate)."""
+    waveform, loaded_sr = librosa.load(full_path, sr=sr)
+    return waveform, loaded_sr
+
+def dynamic_range_compression(x, C=1, clip_val=1e-5):
+    """Return log(C * x), with x floored at clip_val to keep the logarithm finite."""
+    floored = np.clip(x, a_min=clip_val, a_max=None)
+    return np.log(floored * C)
+
+def dynamic_range_decompression(x, C=1):
+    """Undo the log compression: return exp(x) / C."""
+    expanded = np.exp(x)
+    return expanded / C
+
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+    """Return log(C * x) for a tensor, with x floored at clip_val to keep the logarithm finite."""
+    floored = torch.clamp(x, min=clip_val)
+    return torch.log(floored * C)
+
+def dynamic_range_decompression_torch(x, C=1):
+    """Undo the log compression of a tensor: return exp(x) / C."""
+    expanded = torch.exp(x)
+    return expanded / C
+
+def spectral_normalize_torch(magnitudes):
+    """Log-compress spectral magnitudes with the default compression settings."""
+    return dynamic_range_compression_torch(magnitudes)
+
+def spectral_de_normalize_torch(magnitudes):
+    """Invert spectral_normalize_torch with the default decompression settings."""
+    return dynamic_range_decompression_torch(magnitudes)
+
+# Caches shared by all calls to mel_spectrogram(), keyed by every parameter the
+# cached tensor depends on (including the device it lives on).
+mel_basis = {}
+hann_window = {}
+
+## modified to call torch.stft with return_complex=True, for pytorch ver2.0
+def mel_spectrogram(y,
+                    n_fft,
+                    num_mels,
+                    sampling_rate,
+                    hop_size,
+                    win_size,
+                    fmin,
+                    fmax,
+                    center=False):
+    """
+    Compute the log-compressed mel spectrogram of y.
+
+    Params:
+    -- y: waveform tensor (assumed to be (batch, samples) — TODO confirm with callers).
+    -- n_fft: FFT size.
+    -- num_mels: number of mel bands.
+    -- sampling_rate: sampling rate used to build the mel filterbank.
+    -- hop_size: hop length of the STFT.
+    -- win_size: length of the Hann window.
+    -- fmin, fmax: frequency range of the mel filterbank.
+    -- center: forwarded to torch.stft (default False, the signal is padded here instead).
+    Returns:
+    -- log-compressed mel spectrogram.
+    """
+    device_key = str(y.device)
+    mel_key = (sampling_rate, n_fft, num_mels, fmin, fmax, device_key)
+    window_key = (win_size, device_key)
+
+    # The previous test (`fmax not in mel_basis`) never matched the stored keys, so the
+    # filterbank and the window were rebuilt on every call: look up the real keys.
+    if mel_key not in mel_basis:
+        # keyword arguments: positional ones are rejected by librosa >= 0.10
+        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
+        mel_basis[mel_key] = torch.from_numpy(mel).float().to(y.device)
+    if window_key not in hann_window:
+        hann_window[window_key] = torch.hann_window(win_size).to(y.device)
+
+    # reflect-pad both ends by half of (n_fft - hop_size) before the STFT
+    pad = int((n_fft - hop_size) / 2)
+    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')
+    y = y.squeeze(1)
+
+    spec = torch.view_as_real(torch.stft(
+        y,
+        n_fft,
+        hop_length=hop_size,
+        win_length=win_size,
+        window=hann_window[window_key],
+        center=center,
+        pad_mode='reflect',
+        normalized=False,
+        onesided=True,
+        return_complex=True
+    ))
+
+    # magnitude from the (real, imag) pairs; the small constant keeps sqrt away from zero
+    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
+
+    spec = torch.matmul(mel_basis[mel_key], spec)
+    spec = spectral_normalize_torch(spec)
+
+    return spec
+
+
+def get_dataset_filelist(a):
+    """
+    Read the training and validation file lists named by
+    a.input_training_file and a.input_validation_file.
+
+    Returns (training_files, validation_files): one whitespace-stripped entry per line.
+    """
+    def _read_stripped_lines(list_path):
+        with open(list_path, 'r') as f:
+            return [entry.strip() for entry in f]
+
+    training_files = _read_stripped_lines(a.input_training_file)
+    validation_files = _read_stripped_lines(a.input_validation_file)
+    return training_files, validation_files
+
+
+class MelDataset(torch.utils.data.Dataset):
+    """Dataset yielding audio segments together with their mel spectrograms.
+
+    Every item is a tuple ``(mel, audio, filename, mel_loss)``; see
+    ``__getitem__`` for how each element is produced.
+    """
+
+    def __init__(self,
+                 training_files,
+                 segment_size,
+                 n_fft,
+                 num_mels,
+                 hop_size,
+                 win_size,
+                 sampling_rate,
+                 fmin,
+                 fmax,
+                 split=True,
+                 shuffle=True,
+                 n_cache_reuse=1,
+                 device=None,
+                 fmax_loss=None,
+                 fine_tuning=False,
+                 base_mels_path=None):
+        """Store the configuration and optionally shuffle the file list.
+
+        Note that ``random.seed(1234)`` reseeds the global ``random`` module
+        each time a dataset is constructed.
+
+        Args:
+            training_files: sequence of audio file paths.
+            segment_size: number of samples per returned segment when
+                ``split`` is true.
+            n_fft, num_mels, hop_size, win_size, sampling_rate, fmin, fmax:
+                forwarded to ``mel_spectrogram``.
+            split: crop or zero-pad items to ``segment_size`` when true.
+            shuffle: shuffle the file list after seeding.
+            n_cache_reuse: how many following items reuse the last loaded
+                waveform before a new file is read.
+            device: stored on the instance; not used by this class.
+            fmax_loss: ``fmax`` used for the ``mel_loss`` spectrogram.
+            fine_tuning: load precomputed mels from ``base_mels_path`` instead
+                of computing them from the audio.
+            base_mels_path: directory containing ``<basename>.npy`` mel files.
+        """
+        # Work on a copy so that shuffling does not reorder the caller's list.
+        self.audio_files = list(training_files)
+        random.seed(1234)
+        if shuffle:
+            random.shuffle(self.audio_files)
+        self.segment_size = segment_size
+        self.sampling_rate = sampling_rate
+        self.split = split
+        self.n_fft = n_fft
+        self.num_mels = num_mels
+        self.hop_size = hop_size
+        self.win_size = win_size
+        self.fmin = fmin
+        self.fmax = fmax
+        self.fmax_loss = fmax_loss
+        self.cached_wav = None
+        self.n_cache_reuse = n_cache_reuse
+        self._cache_ref_count = 0
+        self.device = device
+        self.fine_tuning = fine_tuning
+        self.base_mels_path = base_mels_path
+
+    def __getitem__(self, index):
+        """Return ``(mel, audio, filename, mel_loss)`` for ``index``.
+
+        NOTE(review): while ``_cache_ref_count`` is non-zero the previously
+        loaded waveform is reused, yet ``filename`` still names the requested
+        index -- confirm this is intended when ``n_cache_reuse > 0``.
+
+        Raises:
+            ValueError: if the loaded sampling rate differs from the
+                configured one.
+        """
+        filename = self.audio_files[index]
+        if self._cache_ref_count == 0:
+            try:
+                # Load with the sampling rate from the configuration.
+                audio, sampling_rate = load_wav(filename, sr=self.sampling_rate)
+            except Exception:
+                # Deliberate best effort: an unreadable file is replaced by
+                # low-amplitude Gaussian noise so iteration can continue.
+                print(f"Error on audio: {filename}")
+                audio = np.random.normal(size=(160000, )) * 0.05
+                sampling_rate = self.sampling_rate
+            self.cached_wav = audio
+            if sampling_rate != self.sampling_rate:
+                raise ValueError("{} SR doesn't match target {} SR".format(
+                    sampling_rate, self.sampling_rate))
+            self._cache_ref_count = self.n_cache_reuse
+        else:
+            audio = self.cached_wav
+            self._cache_ref_count -= 1
+
+        audio = torch.FloatTensor(audio)
+        audio = audio.unsqueeze(0)
+
+        if not self.fine_tuning:
+            if self.split:
+                if audio.size(1) >= self.segment_size:
+                    # Random crop of exactly segment_size samples.
+                    max_audio_start = audio.size(1) - self.segment_size
+                    audio_start = random.randint(0, max_audio_start)
+                    audio = audio[:, audio_start:audio_start +
+                                  self.segment_size]
+                else:
+                    # Too short: zero-pad on the right up to segment_size.
+                    audio = torch.nn.functional.pad(audio, (
+                        0, self.segment_size - audio.size(1)), 'constant')
+
+            mel = mel_spectrogram(
+                audio,
+                self.n_fft,
+                self.num_mels,
+                self.sampling_rate,
+                self.hop_size,
+                self.win_size,
+                self.fmin,
+                self.fmax,
+                center=False)
+        else:
+            # Precomputed mel stored as <base_mels_path>/<basename>.npy.
+            mel = np.load(
+                os.path.join(self.base_mels_path,
+                             os.path.splitext(os.path.split(filename)[-1])[0] +
+                             '.npy'))
+            mel = torch.from_numpy(mel)
+
+            if len(mel.shape) < 3:
+                mel = mel.unsqueeze(0)
+
+            if self.split:
+                frames_per_seg = math.ceil(self.segment_size / self.hop_size)
+
+                if audio.size(1) >= self.segment_size:
+                    # Clamp at zero: a mel with no more frames than one
+                    # segment would otherwise hand randint an empty range.
+                    max_mel_start = max(0, mel.size(2) - frames_per_seg - 1)
+                    mel_start = random.randint(0, max_mel_start)
+                    mel = mel[:, :, mel_start:mel_start + frames_per_seg]
+                    audio = audio[:, mel_start * self.hop_size:(
+                        mel_start + frames_per_seg) * self.hop_size]
+                else:
+                    mel = torch.nn.functional.pad(mel, (
+                        0, frames_per_seg - mel.size(2)), 'constant')
+                    audio = torch.nn.functional.pad(audio, (
+                        0, self.segment_size - audio.size(1)), 'constant')
+
+        # Second spectrogram of the same audio, computed with fmax_loss.
+        mel_loss = mel_spectrogram(
+            audio,
+            self.n_fft,
+            self.num_mels,
+            self.sampling_rate,
+            self.hop_size,
+            self.win_size,
+            self.fmin,
+            self.fmax_loss,
+            center=False)
+
+        return (mel.squeeze(), audio.squeeze(0), filename, mel_loss.squeeze())
+
+    def __len__(self):
+        """Return the number of audio files."""
+        return len(self.audio_files)
--- a/inspiremusic/music_tokenizer/models.py
+++ b/inspiremusic/music_tokenizer/models.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import AvgPool1d
+from torch.nn import Conv1d
+from torch.nn import Conv2d
+from torch.nn import ConvTranspose1d
+from torch.nn.utils import remove_weight_norm
+from torch.nn.utils import spectral_norm
+from torch.nn.utils import weight_norm
+
+from inspiremusic.utils.tokenizer_utils import get_padding
+from inspiremusic.utils.tokenizer_utils import init_weights
+
+# Negative slope handed to the F.leaky_relu calls below that reference it.
+LRELU_SLOPE = 0.1
+
+class ResBlock1(torch.nn.Module):
+    """Residual block built from three pairs of weight-normalized 1-D convs.
+
+    In each pair the first conv uses one of the first three entries of
+    ``dilation`` and the second uses dilation 1; the pair's output is added to
+    its input.
+    """
+
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super().__init__()
+        self.h = h
+
+        def _conv(rate):
+            # Stride-1 conv keeping the channel count, padding from get_padding.
+            return weight_norm(
+                Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    1,
+                    dilation=rate,
+                    padding=get_padding(kernel_size, rate)))
+
+        self.convs1 = nn.ModuleList([_conv(dilation[pos]) for pos in range(3)])
+        self.convs1.apply(init_weights)
+
+        self.convs2 = nn.ModuleList([_conv(1) for _ in range(3)])
+        self.convs2.apply(init_weights)
+
+    def forward(self, x):
+        """Run the three residual pairs in sequence and return the result."""
+        for dilated, plain in zip(self.convs1, self.convs2):
+            skip = x
+            x = dilated(F.leaky_relu(x, LRELU_SLOPE))
+            x = plain(F.leaky_relu(x, LRELU_SLOPE))
+            x = x + skip
+        return x
+
+    def remove_weight_norm(self):
+        """Strip weight normalization from every conv in the block."""
+        for layer in list(self.convs1) + list(self.convs2):
+            remove_weight_norm(layer)
+
+
+class ResBlock2(torch.nn.Module):
+    """Residual block with two weight-normalized dilated 1-D convs.
+
+    Each conv uses one of the first two entries of ``dilation`` and its output
+    is added to its input.
+    """
+
+    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
+        super().__init__()
+        self.h = h
+
+        def _conv(rate):
+            # Stride-1 conv keeping the channel count, padding from get_padding.
+            return weight_norm(
+                Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    1,
+                    dilation=rate,
+                    padding=get_padding(kernel_size, rate)))
+
+        self.convs = nn.ModuleList([_conv(dilation[pos]) for pos in range(2)])
+        self.convs.apply(init_weights)
+
+    def forward(self, x):
+        """Apply each conv as a residual update and return the result."""
+        for conv in self.convs:
+            update = conv(F.leaky_relu(x, LRELU_SLOPE))
+            x = update + x
+        return x
+
+    def remove_weight_norm(self):
+        """Strip weight normalization from every conv in the block."""
+        for layer in self.convs:
+            remove_weight_norm(layer)
+
+
+class Generator(torch.nn.Module):
+    """Upsampling generator: 512-channel input, single-channel tanh output.
+
+    The layer layout is read from ``h`` (``upsample_rates``,
+    ``upsample_kernel_sizes``, ``upsample_initial_channel``, ``resblock``,
+    ``resblock_kernel_sizes``, ``resblock_dilation_sizes``).
+    """
+
+    def __init__(self, h):
+        super().__init__()
+        self.h = h
+        self.num_kernels = len(h.resblock_kernel_sizes)
+        self.num_upsamples = len(h.upsample_rates)
+        self.conv_pre = weight_norm(
+            Conv1d(512, h.upsample_initial_channel, 7, 1, padding=3))
+        if h.resblock == '1':
+            resblock = ResBlock1
+        else:
+            resblock = ResBlock2
+
+        # One transposed conv per upsampling stage; channels halve each stage.
+        self.ups = nn.ModuleList()
+        stage_specs = zip(h.upsample_rates, h.upsample_kernel_sizes)
+        for stage, (rate, kernel) in enumerate(stage_specs):
+            in_ch = h.upsample_initial_channel // (2**stage)
+            out_ch = h.upsample_initial_channel // (2**(stage + 1))
+            layer = ConvTranspose1d(
+                in_ch, out_ch, kernel, rate, padding=(kernel - rate) // 2)
+            self.ups.append(weight_norm(layer))
+
+        # For every stage, one residual block per (kernel, dilations) entry.
+        self.resblocks = nn.ModuleList()
+        for stage in range(len(self.ups)):
+            ch = h.upsample_initial_channel // (2**(stage + 1))
+            block_specs = zip(h.resblock_kernel_sizes,
+                              h.resblock_dilation_sizes)
+            for kernel, dilations in block_specs:
+                self.resblocks.append(resblock(h, ch, kernel, dilations))
+
+        # `ch` is the channel count left over from the last stage.
+        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
+        self.ups.apply(init_weights)
+        self.conv_post.apply(init_weights)
+
+    def forward(self, x):
+        """Upsample ``x`` stage by stage and return the tanh-squashed output."""
+        x = self.conv_pre(x)
+        for stage in range(self.num_upsamples):
+            x = self.ups[stage](F.leaky_relu(x, LRELU_SLOPE))
+            first = stage * self.num_kernels
+            xs = None
+            for offset in range(self.num_kernels):
+                branch = self.resblocks[first + offset](x)
+                if xs is None:
+                    xs = branch
+                else:
+                    xs += branch
+            # Average the outputs of this stage's residual blocks.
+            x = xs / self.num_kernels
+        x = self.conv_post(F.leaky_relu(x, LRELU_SLOPE))
+        return torch.tanh(x)
+
+    def remove_weight_norm(self):
+        """Strip weight normalization from all layers of the generator."""
+        print('Removing weight norm...')
+        for layer in self.ups:
+            remove_weight_norm(layer)
+        for block in self.resblocks:
+            block.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+        remove_weight_norm(self.conv_post)
+
+
+class DiscriminatorP(torch.nn.Module):
+    """Discriminator that folds a 1-D signal into 2-D using a fixed period.
+
+    Args:
+        period: width of the folded 2-D view.
+        kernel_size: kernel height of the stacked 2-D convolutions.
+        stride: stride (along the folded time axis) of the first four convs.
+        use_spectral_norm: use spectral norm instead of weight norm.
+    """
+
+    def __init__(self, period, kernel_size=5, stride=3,
+                 use_spectral_norm=False):
+        super(DiscriminatorP, self).__init__()
+        self.period = period
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
+
+        # Derive the padding from kernel_size instead of assuming the default
+        # kernel of 5; for kernel_size=5 the values are unchanged.
+        strided_padding = (get_padding(kernel_size, 1), 0)
+        final_padding = ((kernel_size - 1) // 2, 0)
+
+        channel_plan = (1, 32, 128, 512, 1024)
+        convs = [
+            norm_f(
+                Conv2d(
+                    c_in,
+                    c_out, (kernel_size, 1), (stride, 1),
+                    padding=strided_padding))
+            for c_in, c_out in zip(channel_plan[:-1], channel_plan[1:])
+        ]
+        convs.append(
+            norm_f(
+                Conv2d(1024, 1024, (kernel_size, 1), 1,
+                       padding=final_padding)))
+        self.convs = nn.ModuleList(convs)
+        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+    def forward(self, x):
+        """Score ``x`` and collect the intermediate feature maps.
+
+        Args:
+            x: tensor unpacked as ``(batch, channels, time)``.
+
+        Returns:
+            ``(score, fmap)`` where ``score`` is the flattened output of the
+            final conv and ``fmap`` lists every layer's activation.
+        """
+        fmap = []
+
+        # 1d to 2d: reflect-pad so the length is a multiple of the period,
+        # then fold time into (time // period, period).
+        b, c, t = x.shape
+        if t % self.period != 0:
+            n_pad = self.period - (t % self.period)
+            x = F.pad(x, (0, n_pad), "reflect")
+            t = t + n_pad
+        x = x.view(b, c, t // self.period, self.period)
+
+        for layer in self.convs:
+            x = layer(x)
+            x = F.leaky_relu(x, LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+
+        return x, fmap
+
+
+class MultiPeriodDiscriminator(torch.nn.Module):
+    """Bundle of ``DiscriminatorP`` instances with periods 2, 3, 5, 7 and 11."""
+
+    def __init__(self):
+        super().__init__()
+        self.discriminators = nn.ModuleList(
+            [DiscriminatorP(period) for period in (2, 3, 5, 7, 11)])
+
+    def forward(self, y, y_hat):
+        """Run every sub-discriminator on ``y`` and on ``y_hat``.
+
+        Returns:
+            Four lists, one entry per sub-discriminator: scores for ``y``,
+            scores for ``y_hat``, feature maps for ``y``, feature maps for
+            ``y_hat``.
+        """
+        y_d_rs, y_d_gs = [], []
+        fmap_rs, fmap_gs = [], []
+        for disc in self.discriminators:
+            real_score, real_fmap = disc(y)
+            gen_score, gen_fmap = disc(y_hat)
+            y_d_rs.append(real_score)
+            fmap_rs.append(real_fmap)
+            y_d_gs.append(gen_score)
+            fmap_gs.append(gen_fmap)
+
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+class DiscriminatorS(torch.nn.Module):
+    """Stack of seven normalized 1-D convolutions followed by a scoring conv.
+
+    Args:
+        use_spectral_norm: use spectral norm instead of weight norm.
+    """
+
+    def __init__(self, use_spectral_norm=False):
+        super().__init__()
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
+        # (in_channels, out_channels, kernel, stride, groups, padding)
+        layer_specs = (
+            (1, 128, 15, 1, 1, 7),
+            (128, 128, 41, 2, 4, 20),
+            (128, 256, 41, 2, 16, 20),
+            (256, 512, 41, 4, 16, 20),
+            (512, 1024, 41, 4, 16, 20),
+            (1024, 1024, 41, 1, 16, 20),
+            (1024, 1024, 5, 1, 1, 2),
+        )
+        self.convs = nn.ModuleList([
+            norm_f(
+                Conv1d(c_in, c_out, kernel, stride, groups=groups,
+                       padding=pad))
+            for c_in, c_out, kernel, stride, groups, pad in layer_specs
+        ])
+        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+    def forward(self, x):
+        """Return ``(score, fmap)``: the flattened final output and the list
+        of every layer's activation."""
+        fmap = []
+        for layer in self.convs:
+            x = F.leaky_relu(layer(x), LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        return torch.flatten(x, 1, -1), fmap
+
+
+class MultiScaleDiscriminator(torch.nn.Module):
+    """Three ``DiscriminatorS`` instances applied to progressively pooled input.
+
+    The first uses spectral norm; before the second and third, both inputs
+    are passed through an average pool.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.discriminators = nn.ModuleList(
+            [DiscriminatorS(use_spectral_norm=True)] +
+            [DiscriminatorS() for _ in range(2)])
+        self.meanpools = nn.ModuleList(
+            [AvgPool1d(4, 2, padding=2) for _ in range(2)])
+
+    def forward(self, y, y_hat):
+        """Run every sub-discriminator on ``y`` and on ``y_hat``.
+
+        Returns:
+            Four lists, one entry per sub-discriminator: scores for ``y``,
+            scores for ``y_hat``, feature maps for ``y``, feature maps for
+            ``y_hat``.
+        """
+        y_d_rs, y_d_gs = [], []
+        fmap_rs, fmap_gs = [], []
+        for position, disc in enumerate(self.discriminators):
+            if position > 0:
+                # Pool the (already pooled) inputs once more for this scale.
+                pool = self.meanpools[position - 1]
+                y = pool(y)
+                y_hat = pool(y_hat)
+            real_score, real_fmap = disc(y)
+            gen_score, gen_fmap = disc(y_hat)
+            y_d_rs.append(real_score)
+            fmap_rs.append(real_fmap)
+            y_d_gs.append(gen_score)
+            fmap_gs.append(gen_fmap)
+
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+
+
+def feature_loss(fmap_r, fmap_g):
+    """Return twice the summed mean absolute difference between paired maps.
+
+    ``fmap_r`` and ``fmap_g`` are iterated in lockstep at two levels; every
+    inner pair contributes the mean of its absolute difference.
+    """
+    total = 0
+    for real_maps, gen_maps in zip(fmap_r, fmap_g):
+        for real, gen in zip(real_maps, gen_maps):
+            total = total + (real - gen).abs().mean()
+    return total * 2
+
+
+def discriminator_loss(disc_real_outputs, disc_generated_outputs):
+    """Least-squares discriminator loss over paired outputs.
+
+    For each pair, the real term is ``mean((1 - dr) ** 2)`` and the generated
+    term is ``mean(dg ** 2)``.
+
+    Returns:
+        ``(loss, r_losses, g_losses)``: the summed loss plus the per-pair real
+        and generated terms as Python floats.
+    """
+    total = 0
+    real_terms, gen_terms = [], []
+    for real_out, gen_out in zip(disc_real_outputs, disc_generated_outputs):
+        real_term = ((1 - real_out)**2).mean()
+        gen_term = (gen_out**2).mean()
+        total = total + (real_term + gen_term)
+        real_terms.append(real_term.item())
+        gen_terms.append(gen_term.item())
+
+    return total, real_terms, gen_terms
+
+
+def generator_loss(disc_outputs):
+    """Least-squares generator loss: sum of ``mean((1 - dg) ** 2)`` per output.
+
+    Returns:
+        ``(loss, gen_losses)``: the summed loss and the list of per-output
+        terms (kept as tensors).
+    """
+    gen_losses = [((1 - gen_out)**2).mean() for gen_out in disc_outputs]
+    total = 0
+    for term in gen_losses:
+        total = total + term
+
+    return total, gen_losses
+
+
+class Encoder(torch.nn.Module):
+    """Strided-conv encoder: single-channel input, 512-channel output.
+
+    It walks ``h.upsample_rates`` / ``h.upsample_kernel_sizes`` in reverse as
+    strided convolutions, doubling the channel count from 32 at every stage,
+    and follows each stage with group-normalized residual blocks.
+    """
+
+    def __init__(self, h):
+        super().__init__()
+        self.h = h
+        self.num_kernels = len(h.resblock_kernel_sizes)
+        self.num_upsamples = len(h.upsample_rates)
+        self.conv_pre = weight_norm(Conv1d(1, 32, 7, 1, padding=3))
+        self.normalize = nn.ModuleList()
+        if h.resblock == '1':
+            resblock = ResBlock1
+        else:
+            resblock = ResBlock2
+
+        # Strided convs, using the (rate, kernel) pairs in reverse order.
+        self.ups = nn.ModuleList()
+        stage_specs = list(zip(h.upsample_rates, h.upsample_kernel_sizes))[::-1]
+        for stage, (rate, kernel) in enumerate(stage_specs):
+            layer = Conv1d(
+                32 * (2**stage),
+                32 * (2**(stage + 1)),
+                kernel,
+                rate,
+                padding=((kernel - rate) // 2))
+            self.ups.append(weight_norm(layer))
+
+        # One residual block and one GroupNorm per (stage, kernel) slot.
+        self.resblocks = nn.ModuleList()
+        for stage in range(len(self.ups)):
+            ch = 32 * (2**(stage + 1))
+            block_specs = zip(
+                reversed(h.resblock_kernel_sizes),
+                reversed(h.resblock_dilation_sizes))
+            for kernel, dilations in block_specs:
+                self.resblocks.append(resblock(h, ch, kernel, dilations))
+                self.normalize.append(
+                    torch.nn.GroupNorm(ch // 16, ch, eps=1e-6, affine=True))
+        self.conv_post = Conv1d(512, 512, 3, 1, padding=1)
+        self.ups.apply(init_weights)
+        self.conv_post.apply(init_weights)
+
+    def forward(self, x):
+        """Encode ``x`` and return the output of the final convolution."""
+        x = self.conv_pre(x)
+        for stage in range(self.num_upsamples):
+            x = self.ups[stage](F.leaky_relu(x, LRELU_SLOPE))
+            xs = None
+            for offset in range(self.num_kernels):
+                slot = stage * self.num_kernels + offset
+                branch = self.resblocks[slot](x)
+                if xs is None:
+                    xs = branch
+                else:
+                    xs += branch
+                # The running sum is normalized after every block.
+                xs = self.normalize[slot](xs)
+            x = xs / self.num_kernels
+        # This activation uses F.leaky_relu's default slope, not LRELU_SLOPE.
+        x = F.leaky_relu(x)
+        return self.conv_post(x)
+
+    def remove_weight_norm(self):
+        """Strip weight normalization from the weight-normalized layers."""
+        print('Removing weight norm...')
+        for layer in self.ups:
+            remove_weight_norm(layer)
+        for block in self.resblocks:
+            block.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+
+
+class Quantizer_module(torch.nn.Module):
+    """Single codebook of ``n_e`` vectors of size ``e_dim``."""
+
+    def __init__(self, n_e, e_dim):
+        super().__init__()
+        self.embedding = nn.Embedding(n_e, e_dim)
+        bound = 1.0 / n_e
+        self.embedding.weight.data.uniform_(-bound, bound)
+
+    def forward(self, x):
+        """Return ``(z_q, indices)``: the nearest codebook vector for each row
+        of ``x`` and its index."""
+        codebook = self.embedding.weight
+        # Squared Euclidean distance, expanded as |x|^2 + |e|^2 - 2 x.e
+        x_sq = torch.sum(x ** 2, 1, keepdim=True)
+        code_sq = torch.sum(codebook ** 2, 1)
+        cross = torch.matmul(x, codebook.T)
+        d = x_sq + code_sq - 2 * cross
+        nearest = torch.argmin(d, 1)
+        return self.embedding(nearest), nearest
+
+
+class Quantizer(torch.nn.Module):
+    """Two-layer residual quantizer over 512-channel features.
+
+    The channels are split into ``h.n_code_groups`` groups; each layer owns
+    one ``Quantizer_module`` per group.
+    """
+
+    def __init__(self, h):
+        super().__init__()
+        assert 512 % h.n_code_groups == 0
+        group_dim = 512 // h.n_code_groups
+        self.quantizer_modules = nn.ModuleList(
+            [Quantizer_module(h.n_codes, group_dim)
+             for _ in range(h.n_code_groups)])
+        self.quantizer_modules2 = nn.ModuleList(
+            [Quantizer_module(h.n_codes, 512 // h.n_code_groups)
+             for _ in range(h.n_code_groups)])
+        self.h = h
+        self.codebook_loss_lambda = self.h.codebook_loss_lambda  # e.g., 1
+        self.commitment_loss_lambda = self.h.commitment_loss_lambda  # e.g., 0.25
+        self.residul_layer = 2
+        self.n_code_groups = h.n_code_groups
+
+    def for_one_step(self, xin, idx):
+        """Quantize ``xin`` with one layer's codebooks.
+
+        ``idx == 0`` selects ``quantizer_modules``; any other value selects
+        ``quantizer_modules2``.
+
+        Returns:
+            ``(z_q, loss, min_indicies)``: the straight-through quantized
+            tensor (transposed back to the input layout), the weighted
+            codebook + commitment loss, and one index tensor per group.
+        """
+        xin = xin.transpose(1, 2)
+        flat = xin.reshape(-1, 512)
+        groups = torch.split(flat, 512 // self.h.n_code_groups, dim=-1)
+        if idx == 0:
+            codebooks = self.quantizer_modules
+        else:
+            codebooks = self.quantizer_modules2
+
+        z_q = []
+        min_indicies = []
+        for chunk, module in zip(groups, codebooks):
+            chunk_q, chunk_ids = module(chunk)
+            z_q.append(chunk_q)
+            min_indicies.append(chunk_ids)  # B * T,
+        z_q = torch.cat(z_q, -1).reshape(xin.shape)
+
+        codebook_term = torch.mean((z_q - xin.detach()) ** 2)
+        commitment_term = torch.mean((z_q.detach() - xin) ** 2)
+        loss = self.codebook_loss_lambda * codebook_term \
+            + self.commitment_loss_lambda * commitment_term
+
+        # Straight-through: forward value is z_q, gradient flows to xin.
+        z_q = xin + (z_q - xin).detach()
+        return z_q.transpose(1, 2), loss, min_indicies
+
+    def forward(self, xin):
+        """Residually quantize ``xin`` (commented as B, C, T in the original).
+
+        Returns:
+            ``(quantized_out, loss, all_indices)``: the sum of the per-layer
+            quantized tensors, the mean of the per-layer losses, and the
+            concatenated per-group index lists of all layers.
+        """
+        quantized_out = 0.0
+        residual = xin
+        layer_losses = []
+        all_indices = []
+        for layer in range(self.residul_layer):
+            quantized, layer_loss, indices = self.for_one_step(residual, layer)
+            residual = residual - quantized
+            quantized_out = quantized_out + quantized
+            all_indices.extend(indices)
+            layer_losses.append(layer_loss)
+        loss = torch.mean(torch.stack(layer_losses))
+        return quantized_out, loss, all_indices
+
+    def embed(self, x):
+        """Look up and sum codebook vectors for the code indices in ``x``.
+
+        ``x`` is commented as (N, T, 4) in the original; its last axis holds
+        the first layer's group indices followed by the second layer's.
+
+        Returns:
+            The summed embeddings with axes 1 and 2 swapped (N, C, T).
+        """
+        quantized_out = torch.tensor(0.0, device=x.device)
+        # Split along the last dimension; each slice belongs to one index group.
+        x = torch.split(x, 1, 2)
+        for layer in range(self.residul_layer):
+            if layer == 0:
+                codebooks, shift = self.quantizer_modules, 0
+            else:
+                codebooks, shift = self.quantizer_modules2, self.n_code_groups
+            parts = []
+            for group in range(self.n_code_groups):
+                codes = x[group + shift]
+                module = codebooks[group]
+                parts.append(module.embedding(codes.squeeze(-1).long()))
+            quantized_out = quantized_out + torch.cat(parts, -1)
+        return quantized_out.transpose(1, 2)  # N, C, T
--- a/inspiremusic/music_tokenizer/vqvae.py
+++ b/inspiremusic/music_tokenizer/vqvae.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+
+import torch
+import torch.nn as nn
+from inspiremusic.music_tokenizer.env import AttrDict
+from inspiremusic.music_tokenizer.models import Encoder
+from inspiremusic.music_tokenizer.models import Generator
+from inspiremusic.music_tokenizer.models import Quantizer
+
+
+class VQVAE(nn.Module):
+    """Wrapper that loads a quantizer, a generator and optionally an encoder
+    from a checkpoint.
+
+    Args:
+        config_path: path to a JSON file whose contents become ``self.h``.
+        ckpt_path: checkpoint with ``'generator'`` and ``'quantizer'`` state
+            dicts (and ``'encoder'`` when ``with_encoder`` is true).
+        with_encoder: also build and load the encoder; required by
+            ``encode``.
+    """
+
+    def __init__(self,
+                 config_path,
+                 ckpt_path,
+                 with_encoder=False):
+        super(VQVAE, self).__init__()
+        # Load onto CPU so a checkpoint saved from a GPU can be read on a
+        # host without that device; load_state_dict copies the values into
+        # this module's own parameters either way.
+        ckpt = torch.load(ckpt_path, map_location='cpu')
+        with open(config_path) as f:
+            json_config = json.load(f)
+        self.h = AttrDict(json_config)
+        self.quantizer = Quantizer(self.h)
+        self.generator = Generator(self.h)
+        self.generator.load_state_dict(ckpt['generator'])
+        self.quantizer.load_state_dict(ckpt['quantizer'])
+        if with_encoder:
+            self.encoder = Encoder(self.h)
+            self.encoder.load_state_dict(ckpt['encoder'])
+
+    def forward(self, x):
+        """Decode code indices ``x`` (commented as (B, T, Nq) in the original)
+        by embedding them and running the generator."""
+        quant_emb = self.quantizer.embed(x)
+        return self.generator(quant_emb)
+
+    def encode(self, x):
+        """Encode ``x`` into code indices.
+
+        A trailing singleton axis on a 3-D input is dropped before a channel
+        axis is inserted at position 1 for the encoder.  Requires the module
+        to have been built with ``with_encoder=True``.
+
+        Returns:
+            The per-group indices reshaped to ``(batch, -1)`` and stacked on a
+            new last axis.
+        """
+        batch_size = x.size(0)
+        if len(x.shape) == 3 and x.shape[-1] == 1:
+            x = x.squeeze(-1)
+        features = self.encoder(x.unsqueeze(1))
+        _, _, codes = self.quantizer(features)
+        codes = [code.reshape(batch_size, -1) for code in codes]
+        # shape: [N, T, 4]
+        return torch.stack(codes, -1)
--- a/inspiremusic/text/__pycache__/abs_tokenizer.cpython-310.pyc
+++ b/inspiremusic/text/__pycache__/abs_tokenizer.cpython-310.pyc
--- a/inspiremusic/text/__pycache__/tokenizer.cpython-310.pyc
+++ b/inspiremusic/text/__pycache__/tokenizer.cpython-310.pyc
--- a/inspiremusic/text/abs_tokenizer.py
+++ b/inspiremusic/text/abs_tokenizer.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC
+from abc import abstractmethod
+from typing import Iterable
+from typing import List
+
+
+class AbsTokenizer(ABC):
+    """Abstract interface for converting between text and tokens."""
+
+    @abstractmethod
+    def text2tokens(self, line: str) -> List[str]:
+        """Convert ``line`` into tokens; subclasses must override."""
+        raise NotImplementedError
+
+    @abstractmethod
+    def tokens2text(self, tokens: Iterable[str]) -> str:
+        """Convert ``tokens`` back into text; subclasses must override."""
+        raise NotImplementedError
+
+    def encode(self, line: str, **kwargs) -> List[str]:
+        """Return ``text2tokens(line)``; extra keyword arguments are ignored."""
+        tokens = self.text2tokens(line)
+        return tokens
\ No newline at end of file
--- a/inspiremusic/text/tokenizer.py
+++ b/inspiremusic/text/tokenizer.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import os
+import re
+from typing import Iterable, List, Union
+import numpy as np
+import torch
+
+from inspiremusic.text.abs_tokenizer import AbsTokenizer
+from transformers import AutoTokenizer
+
+def get_tokenizer(tokenizer_name, tokenizer_path):
+    """Return a ``QwenTokenizer`` when ``tokenizer_name`` contains "qwen".
+
+    The match is case-sensitive; any other name yields ``None``.
+    """
+    if "qwen" not in tokenizer_name:
+        return None
+    return QwenTokenizer(tokenizer_path, skip_special_tokens=True)
+
+class QwenTokenizer(AbsTokenizer):
+    """Tokenizer backed by ``AutoTokenizer.from_pretrained`` with extra
+    special tokens registered."""
+
+    def __init__(
+            self,
+            token_path: str,
+            skip_special_tokens: bool = True,
+    ):
+        super().__init__()
+        # NOTE: non-chat model, all these special tokens keep randomly initialized.
+        extra_tokens = [
+            '<|im_start|>', '<|im_end|>', '<|endofprompt|>',
+            '[breath]', '<strong>', '</strong>', '[noise]',
+            '[laughter]', '[cough]', '[clucking]', '[accent]',
+            '[quick_breath]',
+        ]
+        special_tokens = {
+            'eos_token': '<|endoftext|>',
+            'pad_token': '<|endoftext|>',
+            'additional_special_tokens': extra_tokens,
+        }
+        self.tokenizer = AutoTokenizer.from_pretrained(token_path)
+        self.tokenizer.add_special_tokens(special_tokens)
+        self.skip_special_tokens = skip_special_tokens
+
+    def get_vocab_size(self):
+        """Return the wrapped tokenizer's ``vocab_size`` attribute."""
+        return self.tokenizer.vocab_size
+
+    def text2tokens(self, line: str) -> List:
+        """Tokenize ``line`` and return its input ids as a Python list."""
+        encoded = self.tokenizer([line], return_tensors="pt")
+        return encoded["input_ids"][0].cpu().tolist()
+
+    def tokens2text(self, tokens) -> str:
+        """Decode ``tokens`` to text, honouring ``skip_special_tokens``."""
+        ids = torch.tensor(tokens, dtype=torch.int64)
+        decoded = self.tokenizer.batch_decode(
+            [ids], skip_special_tokens=self.skip_special_tokens)
+        return decoded[0]
+
+
+
+def get_qwen_vocab_size(token_type: str):
+    """Return the vocabulary size for a known Qwen tokenizer family.
+
+    The name is matched case-insensitively against "qwen1.5", "qwen2.0" and
+    "qwen2.5".
+
+    Raises:
+        ValueError: if ``token_type`` matches none of them.
+    """
+    lowered = token_type.lower()
+    known_families = ("qwen1.5", "qwen2.0", "qwen2.5")
+    if not any(family in lowered for family in known_families):
+        raise ValueError(f"Unknown tokenizer {token_type}")
+    # 293 for special and extra tokens, including endoftext, im_start, im_end, endofprompt and others in the future.
+    # model.vocab_size = 151936, tokenizer.vocab_size = 151643
+    # NOTE: the first three special tokens (endoftext, im_start, im_end) are trained in Chat series models,
+    # others are kept in random initialization state.
+    return 151643 + 293
\ No newline at end of file
--- a/inspiremusic/transformer/__init__.py
+++ b/inspiremusic/transformer/__init__.py
--- a/inspiremusic/transformer/__pycache__/__init__.cpython-310.pyc
+++ b/inspiremusic/transformer/__pycache__/__init__.cpython-310.pyc
--- a/inspiremusic/transformer/__pycache__/activation.cpython-310.pyc
+++ b/inspiremusic/transformer/__pycache__/activation.cpython-310.pyc
--- a/inspiremusic/transformer/__pycache__/attention.cpython-310.pyc
+++ b/inspiremusic/transformer/__pycache__/attention.cpython-310.pyc
--- a/inspiremusic/transformer/__pycache__/convolution.cpython-310.pyc
+++ b/inspiremusic/transformer/__pycache__/convolution.cpython-310.pyc