v1.0

0112b0f0 · chenzk · 0112b0f0 · 0112b0f0 · 0112b0f0 · 0112b0f0
Commit 0112b0f0 authored Feb 14, 2025 by chenzk
20 changed files
--- a/examples/music_generation/inspiremusic/flow/flow.py
+++ b/examples/music_generation/inspiremusic/flow/flow.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import random
+from typing import Dict, Optional
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from omegaconf import DictConfig
+from inspiremusic.utils.mask import make_pad_mask
+from inspiremusic.music_tokenizer.vqvae import VQVAE
+
+class MaskedDiff(torch.nn.Module):
+    """Flow model that turns token sequences into quantizer-embedding features.
+
+    Tokens are embedded, run through ``encoder``, resized by
+    ``length_regulator`` and handed to ``decoder`` as its ``mu`` input. The
+    training target comes from the quantizer of a ``VQVAE`` that is loaded
+    from ``generator_model_dir`` when the module is constructed.
+    """
+
+    def __init__(self,
+                 input_size: int = 512,
+                 output_size: int = 128,
+                 output_type: str = "mel",
+                 vocab_size: int = 4096,
+                 input_frame_rate: int = 50,
+                 only_mask_loss: bool = True,
+                 encoder: torch.nn.Module = None,
+                 length_regulator: torch.nn.Module = None,
+                 decoder: torch.nn.Module = None,
+                 decoder_conf: Dict = {'in_channels': 240, 'out_channel': 80,
+                                       'cfm_params': DictConfig({'sigma_min': 1e-06, 'solver': 'euler', 't_scheduler': 'cosine',
+                                                                 'training_cfg_rate': 0.2, 'inference_cfg_rate': 0.7, 'reg_loss_type': 'l1'}),
+                                       'decoder_params': {'channels': [256, 256], 'dropout': 0.0, 'attention_head_dim': 64,
+                                                          'n_blocks': 4, 'num_mid_blocks': 12, 'num_heads': 8, 'act_fn': 'gelu'}},
+                 mel_feat_conf: Dict = {'n_fft': 1024, 'num_mels': 128, 'sampling_rate': 48000,
+                                        'hop_size': 256, 'win_size': 1024, 'fmin': 0, 'fmax': 48000},
+                 generator_model_dir: str = "../../pretrained_models/InspireMusic-Base/music_tokenizer",
+                 num_codebooks: int = 4
+                 ):
+        """Store the configuration and build/attach the sub-modules.
+
+        Args:
+            input_size: width of the token embedding.
+            output_size: output width of ``encoder_proj``.
+            output_type: stored as-is; not read in this class.
+            vocab_size: number of rows in the token embedding table.
+            input_frame_rate: stored and logged; not otherwise read here.
+            only_mask_loss: stored as-is; not read in this class.
+            encoder: module called as ``encoder(token, token_len)``; must
+                expose ``output_size()``.
+            length_regulator: module called as ``length_regulator(h, lens)``.
+            decoder: module providing ``compute_loss(...)`` and ``__call__``.
+            decoder_conf: stored as-is.
+            mel_feat_conf: stored as-is.
+            generator_model_dir: directory holding ``config.json`` and
+                ``model.pt`` for the ``VQVAE``.
+            num_codebooks: number of codebook entries per frame used to
+                reshape the acoustic tokens.
+        """
+        super().__init__()
+
+        # Plain (non-module) settings.
+        self.input_size = input_size
+        self.output_size = output_size
+        self.decoder_conf = decoder_conf
+        self.mel_feat_conf = mel_feat_conf
+        self.vocab_size = vocab_size
+        self.output_type = output_type
+        self.input_frame_rate = input_frame_rate
+        logging.info(f"input frame rate={self.input_frame_rate}")
+
+        # Sub-modules; the assignment order below fixes the registration order.
+        self.input_embedding = nn.Embedding(vocab_size, input_size)
+        self.encoder = encoder
+        self.encoder_proj = torch.nn.Linear(self.encoder.output_size(), output_size)
+        self.decoder = decoder
+        self.length_regulator = length_regulator
+        self.only_mask_loss = only_mask_loss
+
+        # Only the quantizer of the pretrained tokenizer is kept.
+        # NOTE(review): if the quantizer is an nn.Module, a later
+        # ``self.train()`` call switches it back to training mode - confirm
+        # that this is acceptable.
+        tokenizer = VQVAE(f'{generator_model_dir}/config.json',
+                          f'{generator_model_dir}/model.pt', with_encoder=True)
+        self.quantizer = tokenizer.quantizer
+        self.quantizer.eval()
+
+        self.num_codebooks = num_codebooks
+        self.cond = None
+        self.interpolate = False
+
+    def forward(
+            self,
+            batch: dict,
+            device: torch.device,
+    ) -> Dict[str, Optional[torch.Tensor]]:
+        """Compute the decoder training loss for one batch.
+
+        Args:
+            batch: must hold ``'acoustic_token'`` and ``'acoustic_token_len'``;
+                may hold ``'semantic_token'`` and ``'semantic_token_len'``.
+                Without semantic tokens the first codebook of the acoustic
+                tokens is used as the model input.
+            device: device the batch tensors are moved to.
+
+        Returns:
+            ``{'loss': loss}`` where ``loss`` is the first value returned by
+            ``decoder.compute_loss``.
+        """
+        acoustic = batch['acoustic_token'].to(device)
+        acoustic_len = batch['acoustic_token_len'].to(device)
+        # Regroup the token stream into (batch, frames, num_codebooks).
+        acoustic = acoustic.view(acoustic.size(0), -1, self.num_codebooks)
+
+        if "semantic_token" in batch:
+            token = batch['semantic_token'].to(device)
+            token_len = batch['semantic_token_len'].to(device)
+        else:
+            token = acoustic[:, :, 0]
+            token_len = (acoustic_len / self.num_codebooks).long()
+
+        # The target features come from the quantizer and receive no gradient.
+        with torch.no_grad():
+            feat = self.quantizer.embed(acoustic)
+            feat_len = (acoustic_len / self.num_codebooks).long()
+
+        embedded = self.input_embedding(token)
+        h, h_lengths = self.encoder(embedded, token_len)
+        h, h_lengths = self.length_regulator(h, feat_len)
+
+        # Optional prompt conditioning (self.cond is None unless set externally).
+        conds = None
+        if self.cond:
+            conds = torch.zeros(feat.shape, device=embedded.device)
+            for row, length in enumerate(feat_len):
+                # Half of the items keep an all-zero condition.
+                if random.random() >= 0.5:
+                    prefix = random.randint(0, int(0.3 * length))
+                    conds[row, :prefix] = feat[row, :prefix]
+            conds = conds.transpose(1, 2)
+
+        valid = (~make_pad_mask(feat_len)).to(h)
+        mu = h.transpose(1, 2).contiguous()
+
+        loss, _ = self.decoder.compute_loss(
+            feat,
+            valid.unsqueeze(1),
+            mu,
+            None,
+            cond=conds
+        )
+        return {'loss': loss}
+
+    @torch.inference_mode()
+    def inference(self,
+                  token,
+                  token_len,
+                  sample_rate):
+        """Generate features for a single token sequence.
+
+        Args:
+            token: token ids with a leading batch dimension of size 1;
+                negative ids are clamped to 0 before the embedding lookup.
+            token_len: length(s) passed to the encoder; doubled for the
+                length regulator when ``sample_rate == 48000``.
+            sample_rate: output sample rate selector.
+
+        Returns:
+            Whatever ``decoder`` returns for 10 solver steps.
+        """
+        assert token.shape[0] == 1
+
+        embedded = self.input_embedding(torch.clamp(token, min=0))
+        h, h_lengths = self.encoder(embedded, token_len)
+
+        if sample_rate == 48000:
+            token_len = 2 * token_len
+
+        h, h_lengths = self.length_regulator(h, token_len)
+
+        valid = (~make_pad_mask(token_len)).to(h)
+        # No speaker embedding and no prompt condition are used at inference.
+        return self.decoder(
+            mu=h.transpose(1, 2).contiguous(),
+            mask=valid.unsqueeze(1),
+            spks=None,
+            cond=None,
+            n_timesteps=10
+        )
\ No newline at end of file
--- a/examples/music_generation/inspiremusic/flow/flow_matching.py
+++ b/examples/music_generation/inspiremusic/flow/flow_matching.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn.functional as F
+from matcha.models.components.flow_matching import BASECFM
+
+
+class ConditionalCFM(BASECFM):
+    """Conditional flow-matching decoder built on Matcha-TTS' ``BASECFM``.
+
+    The velocity field is predicted by ``estimator``, which may be a
+    ``torch.nn.Module``, an ``onnxruntime.InferenceSession`` or (fallback
+    branch) an object exposing ``set_input_shape`` / ``execute_v2``.
+    """
+
+    def __init__(self, in_channels, cfm_params, estimator: torch.nn.Module = None):
+        """Store scheduler / classifier-free-guidance settings and the estimator.
+
+        Args:
+            in_channels: forwarded to ``BASECFM`` as ``n_feats``.
+            cfm_params: config object providing ``t_scheduler``,
+                ``training_cfg_rate`` and ``inference_cfg_rate`` (plus whatever
+                ``BASECFM`` itself reads).
+            estimator: model that predicts the flow derivative.
+        """
+        super().__init__(
+            n_feats=in_channels,
+            cfm_params=cfm_params,
+        )
+        self.t_scheduler = cfm_params.t_scheduler
+        self.training_cfg_rate = cfm_params.training_cfg_rate
+        self.inference_cfg_rate = cfm_params.inference_cfg_rate
+        # Just change the architecture of the estimator here
+        self.estimator = estimator
+
+    @torch.inference_mode()
+    def forward(self, mu, mask, n_timesteps, temperature=1.0, spks=None, cond=None):
+        """Forward diffusion
+
+        Args:
+            mu (torch.Tensor): output of encoder
+                shape: (batch_size, n_feats, mel_timesteps)
+            mask (torch.Tensor): output_mask
+                shape: (batch_size, 1, mel_timesteps)
+            n_timesteps (int): number of diffusion steps
+            temperature (float, optional): temperature for scaling noise. Defaults to 1.0.
+            spks (torch.Tensor, optional): speaker ids. Defaults to None.
+                shape: (batch_size, spk_emb_dim)
+            cond (torch.Tensor, optional): extra condition forwarded unchanged
+                to the estimator. Defaults to None.
+
+        Returns:
+            sample: generated mel-spectrogram
+                shape: (batch_size, n_feats, mel_timesteps)
+        """
+        z = torch.randn_like(mu) * temperature
+        t_span = torch.linspace(0, 1, n_timesteps + 1, device=mu.device, dtype=mu.dtype)
+        if self.t_scheduler == 'cosine':
+            # Warp the uniform grid so that steps are denser near t = 0.
+            t_span = 1 - torch.cos(t_span * 0.5 * torch.pi)
+        return self.solve_euler(z, t_span=t_span, mu=mu, mask=mask, spks=spks, cond=cond)
+
+    def solve_euler(self, x, t_span, mu, mask, spks, cond):
+        """
+        Fixed euler solver for ODEs.
+        Args:
+            x (torch.Tensor): random noise
+            t_span (torch.Tensor): n_timesteps interpolated
+                shape: (n_timesteps + 1,)
+            mu (torch.Tensor): output of encoder
+                shape: (batch_size, n_feats, mel_timesteps)
+            mask (torch.Tensor): output_mask
+                shape: (batch_size, 1, mel_timesteps)
+            spks (torch.Tensor, optional): speaker ids. Defaults to None.
+                shape: (batch_size, spk_emb_dim)
+            cond (torch.Tensor, optional): extra condition forwarded to the
+                estimator; replaced by zeros for the unconditional
+                guidance pass.
+
+        Returns:
+            The state ``x`` after the last Euler step.
+        """
+        t = t_span[0].unsqueeze(dim=0)
+        dt = t_span[1] - t_span[0]
+
+        # I am storing this because I can later plot it by putting a debugger here and saving it to a file
+        # Or in future might add like a return_all_steps flag
+        sol = []
+
+        for step in range(1, len(t_span)):
+            dphi_dt = self.forward_estimator(x, mask, mu, t, spks, cond)
+            # Classifier-Free Guidance inference introduced in VoiceBox
+            if self.inference_cfg_rate > 0:
+                # Second pass with every condition zeroed out.
+                cfg_dphi_dt = self.forward_estimator(
+                    x, mask,
+                    torch.zeros_like(mu), t,
+                    torch.zeros_like(spks) if spks is not None else None,
+                    torch.zeros_like(cond) if cond is not None else None
+                )
+                dphi_dt = ((1.0 + self.inference_cfg_rate) * dphi_dt -
+                           self.inference_cfg_rate * cfg_dphi_dt)
+            x = x + dt * dphi_dt
+            t = t + dt
+            sol.append(x)
+            if step < len(t_span) - 1:
+                # The grid may be non-uniform (cosine schedule), so refresh dt.
+                dt = t_span[step + 1] - t
+
+        return sol[-1]
+
+    def forward_estimator(self, x, mask, mu, t, spks, cond):
+        """Evaluate the estimator, dispatching on the backend type.
+
+        Args:
+            x, mask, mu, t, spks, cond: estimator inputs, passed through in
+                this order.
+
+        Returns:
+            The predicted derivative as a tensor. In the fallback
+            (``execute_v2``) branch the engine is given ``x``'s own buffer as
+            output, so ``x`` is overwritten and returned.
+        """
+        if isinstance(self.estimator, torch.nn.Module):
+            return self.estimator.forward(x, mask, mu, t, spks, cond)
+
+        # onnxruntime is optional and is not imported at module level; without
+        # this local import the isinstance check below raised NameError for
+        # every estimator that is not an nn.Module.
+        try:
+            import onnxruntime
+        except ImportError:
+            onnxruntime = None
+
+        if onnxruntime is not None and isinstance(self.estimator, onnxruntime.InferenceSession):
+            # NOTE(review): spks / cond must be tensors on this path; None
+            # (as passed by callers that do not use them) fails on `.cpu()`.
+            ort_inputs = {
+                'x': x.cpu().numpy(),
+                'mask': mask.cpu().numpy(),
+                'mu': mu.cpu().numpy(),
+                't': t.cpu().numpy(),
+                'spks': spks.cpu().numpy(),
+                'cond': cond.cpu().numpy()
+            }
+            output = self.estimator.run(None, ort_inputs)[0]
+            return torch.tensor(output, dtype=x.dtype, device=x.device)
+        else:
+            # NOTE(review): presumably a TensorRT execution context; the input
+            # shapes are hard-coded to batch 2 and 80 channels - confirm
+            # against the exported engine.
+            self.estimator.set_input_shape('x', (2, 80, x.size(2)))
+            self.estimator.set_input_shape('mask', (2, 1, x.size(2)))
+            self.estimator.set_input_shape('mu', (2, 80, x.size(2)))
+            self.estimator.set_input_shape('t', (2,))
+            self.estimator.set_input_shape('spks', (2, 80))
+            self.estimator.set_input_shape('cond', (2, 80, x.size(2)))
+            # run trt engine
+            self.estimator.execute_v2([x.contiguous().data_ptr(),
+                                       mask.contiguous().data_ptr(),
+                                       mu.contiguous().data_ptr(),
+                                       t.contiguous().data_ptr(),
+                                       spks.contiguous().data_ptr(),
+                                       cond.contiguous().data_ptr(),
+                                       x.data_ptr()])
+            return x
+
+    def compute_loss(self, x1, mask, mu, spks=None, cond=None):
+        """Computes diffusion loss
+
+        Args:
+            x1 (torch.Tensor): Target
+                shape: (batch_size, n_feats, mel_timesteps)
+            mask (torch.Tensor): target mask
+                shape: (batch_size, 1, mel_timesteps)
+            mu (torch.Tensor): output of encoder
+                shape: (batch_size, n_feats, mel_timesteps)
+            spks (torch.Tensor, optional): speaker embedding. Defaults to None.
+                shape: (batch_size, spk_emb_dim)
+            cond (torch.Tensor, optional): extra condition for the estimator;
+                dropped together with ``mu`` for classifier-free guidance.
+                Defaults to None.
+
+        Returns:
+            loss: conditional flow matching loss
+            y: conditional flow
+                shape: (batch_size, n_feats, mel_timesteps)
+        """
+        # Only the batch size is needed; the unpacking still requires mu to be 3-D.
+        b, _, _ = mu.shape
+
+        # One random time per batch item, broadcast over features and frames.
+        t = torch.rand([b, 1, 1], device=mu.device, dtype=mu.dtype)
+        if self.t_scheduler == 'cosine':
+            t = 1 - torch.cos(t * 0.5 * torch.pi)
+
+        # Noise sample, interpolated point y and target velocity u.
+        # (self.sigma_min is not set in this class; presumably by BASECFM.)
+        z = torch.randn_like(x1)
+        y = (1 - (1 - self.sigma_min) * t) * z + t * x1
+        u = x1 - (1 - self.sigma_min) * z
+
+        # during training, we randomly drop condition to trade off mode coverage and sample fidelity
+        if self.training_cfg_rate > 0:
+            cfg_mask = torch.rand(b, device=x1.device) > self.training_cfg_rate
+            mu = mu * cfg_mask.view(-1, 1, 1)
+            if cond is not None:
+                cond = cond * cfg_mask.view(-1, 1, 1)
+
+        pred = self.estimator(y, mask, mu, t.squeeze(), spks, cond)
+        # Masked MSE, normalised by the number of valid frames times channels.
+        loss = F.mse_loss(pred * mask, u * mask, reduction="sum") / (torch.sum(mask) * u.shape[1])
+        return loss, y
+
--- a/examples/music_generation/inspiremusic/flow/length_regulator.py
+++ b/examples/music_generation/inspiremusic/flow/length_regulator.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Tuple
+import torch.nn as nn
+import torch
+from torch.nn import functional as F
+from inspiremusic.utils.mask import make_pad_mask
+
+
+class InterpolateRegulator(nn.Module):
+    """Resize a feature sequence by linear interpolation, then refine it
+    with a small stack of 1-D convolutions."""
+
+    def __init__(
+            self,
+            channels: int,
+            sampling_ratios: Tuple,
+            out_channels: int = None,
+            groups: int = 1,
+    ):
+        """Build the convolution stack.
+
+        Args:
+            channels: channel count of the input features.
+            sampling_ratios: only its length is used; one
+                Conv1d/GroupNorm/Mish triple is created per entry.
+            out_channels: channels of the final 1x1 convolution; falls back
+                to ``channels`` when falsy.
+            groups: number of groups for each ``GroupNorm``.
+        """
+        super().__init__()
+        self.sampling_ratios = sampling_ratios
+        out_channels = out_channels or channels
+
+        layers = []
+        for _ in sampling_ratios:
+            layers += [
+                nn.Conv1d(channels, channels, 3, 1, 1),
+                nn.GroupNorm(groups, channels),
+                nn.Mish(),
+            ]
+        layers.append(nn.Conv1d(channels, out_channels, 1, 1))
+        self.model = nn.Sequential(*layers)
+
+    @staticmethod
+    def _stretch(seq, size):
+        """Linearly interpolate ``seq`` (B, T, D) to ``size`` frames; returns (B, D, size)."""
+        return F.interpolate(seq.transpose(1, 2).contiguous(), size=size, mode='linear')
+
+    def forward(self, x, ylens=None):
+        """Resize ``x`` to the longest target length and zero the padded frames.
+
+        Args:
+            x: features in (B, T, D).
+            ylens: target length per batch item.
+
+        Returns:
+            Tuple of the masked output in (B, max(ylens), D_out) and ``ylens``.
+        """
+        keep = (~make_pad_mask(ylens)).to(x).unsqueeze(-1)
+        stretched = self._stretch(x, ylens.max())
+        refined = self.model(stretched).transpose(1, 2).contiguous()
+        return refined * keep, ylens
+
+    def inference(self, x1, x2, mel_len1, mel_len2, input_frame_rate=50):
+        """Resize prompt features ``x1`` and features ``x2`` separately.
+
+        In inference mode the prompt and the head/mid/tail parts of ``x2``
+        are interpolated separately, so the boundary between prompt and
+        generated frames stays sharp.
+
+        Args:
+            x1: prompt features in (B, T1, D); may have ``T1 == 0``.
+            x2: features in (B, T2, D).
+            mel_len1: target length for ``x1``.
+            mel_len2: target length for ``x2``.
+            input_frame_rate: frame rate of the inputs, used to size the
+                head and tail segments.
+
+        Returns:
+            Tuple of the output in (B, T_out, D_out) and ``mel_len1 + mel_len2``.
+        """
+        if x2.shape[1] > 40:
+            # The first and last 20 input frames are mapped to a fixed number
+            # of output frames; the middle absorbs the remaining length.
+            edge = int(20 / input_frame_rate * 22050 / 256)
+            pieces = [
+                self._stretch(x2[:, :20], edge),
+                self._stretch(x2[:, 20:-20], mel_len2 - edge * 2),
+                self._stretch(x2[:, -20:], edge),
+            ]
+            x2 = torch.concat(pieces, dim=2)
+        else:
+            x2 = self._stretch(x2, mel_len2)
+
+        if x1.shape[1] == 0:
+            joined = x2
+        else:
+            joined = torch.concat([self._stretch(x1, mel_len1), x2], dim=2)
+
+        out = self.model(joined).transpose(1, 2).contiguous()
+        return out, mel_len1 + mel_len2
--- a/examples/music_generation/inspiremusic/hifigan/__pycache__/f0_predictor.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/hifigan/__pycache__/f0_predictor.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/hifigan/__pycache__/generator.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/hifigan/__pycache__/generator.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/hifigan/discriminator.py
+++ b/examples/music_generation/inspiremusic/hifigan/discriminator.py
+import torch
+import torch.nn as nn
+from torch.nn.utils import weight_norm
+from typing import List, Optional, Tuple
+from einops import rearrange
+from torchaudio.transforms import Spectrogram
+
+
+class MultipleDiscriminator(nn.Module):
+    """Runs two discriminators on the same real/generated pair and
+    concatenates their outputs."""
+
+    def __init__(
+            self, mpd: nn.Module, mrd: nn.Module
+    ):
+        """Keep both sub-discriminators.
+
+        Args:
+            mpd: discriminator called with a channel axis inserted at dim 1.
+            mrd: discriminator called with the inputs unchanged.
+        """
+        super().__init__()
+        self.mpd = mpd
+        self.mrd = mrd
+
+    def forward(self, y: torch.Tensor, y_hat: torch.Tensor):
+        """Evaluate ``mpd`` then ``mrd`` and merge their results.
+
+        Args:
+            y: real signal.
+            y_hat: generated signal.
+
+        Returns:
+            Four lists ``(y_d_rs, y_d_gs, fmap_rs, fmap_gs)``; each holds the
+            ``mpd`` entries followed by the ``mrd`` entries.
+        """
+        merged = ([], [], [], [])
+        calls = (
+            (self.mpd, y.unsqueeze(dim=1), y_hat.unsqueeze(dim=1)),
+            (self.mrd, y, y_hat),
+        )
+        for disc, real, fake in calls:
+            d_real, d_fake, maps_real, maps_fake = disc(real, fake)
+            for bucket, part in zip(merged, (d_real, d_fake, maps_real, maps_fake)):
+                bucket += part
+        return merged
+
+
+class MultiResolutionDiscriminator(nn.Module):
+    """Bank of ``DiscriminatorR`` instances, one per FFT size."""
+
+    def __init__(
+        self,
+        fft_sizes: Tuple[int, ...] = (2048, 1024, 512),
+        num_embeddings: Optional[int] = None,
+    ):
+        """
+        Multi-Resolution Discriminator module adapted from https://github.com/descriptinc/descript-audio-codec.
+        Additionally, it allows incorporating conditional information with a learned embeddings table.
+
+        Args:
+            fft_sizes (tuple[int]): Tuple of window lengths for FFT. Defaults to (2048, 1024, 512).
+            num_embeddings (int, optional): Number of embeddings. None means non-conditional discriminator.
+                Defaults to None.
+        """
+        super().__init__()
+        per_resolution = [
+            DiscriminatorR(window_length=size, num_embeddings=num_embeddings)
+            for size in fft_sizes
+        ]
+        self.discriminators = nn.ModuleList(per_resolution)
+
+    def forward(
+        self, y: torch.Tensor, y_hat: torch.Tensor, bandwidth_id: torch.Tensor = None
+    ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[List[torch.Tensor]], List[List[torch.Tensor]]]:
+        """Score the real and the generated signal with every discriminator.
+
+        Args:
+            y: real signal.
+            y_hat: generated signal.
+            bandwidth_id: optional id passed to each discriminator as
+                ``cond_embedding_id``.
+
+        Returns:
+            ``(real_scores, generated_scores, real_fmaps, generated_fmaps)``,
+            each a list with one entry per discriminator.
+        """
+        real_scores, fake_scores = [], []
+        real_maps, fake_maps = [], []
+
+        for disc in self.discriminators:
+            # The real signal is always evaluated before the generated one.
+            (score_r, maps_r), (score_g, maps_g) = (
+                disc(x=y, cond_embedding_id=bandwidth_id),
+                disc(x=y_hat, cond_embedding_id=bandwidth_id),
+            )
+            real_scores.append(score_r)
+            real_maps.append(maps_r)
+            fake_scores.append(score_g)
+            fake_maps.append(maps_g)
+
+        return real_scores, fake_scores, real_maps, fake_maps
+
+
+class DiscriminatorR(nn.Module):
+    """Spectrogram discriminator that processes several frequency bands
+    with separate 2-D convolution stacks."""
+
+    def __init__(
+        self,
+        window_length: int,
+        num_embeddings: Optional[int] = None,
+        channels: int = 32,
+        hop_factor: float = 0.25,
+        bands: Tuple[Tuple[float, float], ...] = ((0.0, 0.1), (0.1, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)),
+    ):
+        """Create the spectrogram transform and one conv stack per band.
+
+        Args:
+            window_length: FFT size and window length of the spectrogram.
+            num_embeddings: size of the optional conditioning embedding
+                table; ``None`` creates no table.
+            channels: channel width of all hidden convolutions.
+            hop_factor: hop length as a fraction of ``window_length``.
+            bands: (low, high) band edges as fractions of the number of
+                frequency bins.
+        """
+        super().__init__()
+        self.window_length = window_length
+        self.hop_factor = hop_factor
+        self.spec_fn = Spectrogram(
+            n_fft=window_length, hop_length=int(window_length * hop_factor), win_length=window_length, power=None
+        )
+
+        # Convert the fractional band edges into frequency-bin indices.
+        n_bins = window_length // 2 + 1
+        self.bands = [(int(edge[0] * n_bins), int(edge[1] * n_bins)) for edge in bands]
+
+        def make_stack():
+            # Stride (1, 2) in the three middle layers halves the last axis each time.
+            specs = [
+                (2, (3, 9), (1, 1), (1, 4)),
+                (channels, (3, 9), (1, 2), (1, 4)),
+                (channels, (3, 9), (1, 2), (1, 4)),
+                (channels, (3, 9), (1, 2), (1, 4)),
+                (channels, (3, 3), (1, 1), (1, 1)),
+            ]
+            return nn.ModuleList(
+                [
+                    weight_norm(nn.Conv2d(in_ch, channels, kernel, stride, padding=pad))
+                    for in_ch, kernel, stride, pad in specs
+                ]
+            )
+
+        self.band_convs = nn.ModuleList([make_stack() for _ in range(len(self.bands))])
+
+        if num_embeddings is not None:
+            # Zero-initialised, so the conditional term starts at zero.
+            self.emb = torch.nn.Embedding(num_embeddings=num_embeddings, embedding_dim=channels)
+            torch.nn.init.zeros_(self.emb.weight)
+
+        self.conv_post = weight_norm(nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1)))
+
+    def spectrogram(self, x):
+        """Normalise ``x``, compute its complex spectrogram and split it into bands.
+
+        Returns:
+            One tensor per band, laid out as (b, c, t, f) with the real and
+            imaginary parts on the ``c`` axis.
+        """
+        # Remove DC offset
+        centred = x - x.mean(dim=-1, keepdims=True)
+        # Peak normalize the volume of input audio
+        peak = centred.abs().max(dim=-1, keepdim=True)[0]
+        scaled = 0.8 * centred / (peak + 1e-9)
+        spec = torch.view_as_real(self.spec_fn(scaled))
+        spec = rearrange(spec, "b f t c -> b c t f")
+        # Split into bands
+        return [spec[..., lo: hi] for lo, hi in self.bands]
+
+    def forward(self, x: torch.Tensor, cond_embedding_id: torch.Tensor = None):
+        """Score ``x`` and collect intermediate feature maps.
+
+        Args:
+            x: input signal.
+            cond_embedding_id: optional index into the embedding table
+                (requires ``num_embeddings`` to have been given).
+
+        Returns:
+            Tuple ``(score, fmap)``. The last entry of ``fmap`` is the same
+            tensor object as ``score``.
+        """
+        fmap = []
+        band_outputs = []
+        for band, stack in zip(self.spectrogram(x), self.band_convs):
+            for depth, conv in enumerate(stack):
+                band = torch.nn.functional.leaky_relu(conv(band), 0.1)
+                # The first layer's output is not kept as a feature map.
+                if depth > 0:
+                    fmap.append(band)
+            band_outputs.append(band)
+        merged = torch.cat(band_outputs, dim=-1)
+
+        if cond_embedding_id is None:
+            h = 0
+        else:
+            emb = self.emb(cond_embedding_id)
+            h = (emb.view(1, -1, 1, 1) * merged).sum(dim=1, keepdims=True)
+
+        score = self.conv_post(merged)
+        fmap.append(score)
+        # In-place add: also updates the tensor just stored in fmap.
+        score += h
+
+        return score, fmap
--- a/examples/music_generation/inspiremusic/hifigan/f0_predictor.py
+++ b/examples/music_generation/inspiremusic/hifigan/f0_predictor.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+import torch.nn as nn
+from torch.nn.utils import weight_norm
+
+
+class ConvRNNF0Predictor(nn.Module):
+    """Predicts a non-negative value per frame with five weight-normalised
+    1-D convolutions followed by a linear layer."""
+
+    def __init__(self,
+                 num_class: int = 1,
+                 in_channels: int = 80,
+                 cond_channels: int = 512
+                 ):
+        """Build the convolution stack and the classifier.
+
+        Args:
+            num_class: output width of the final linear layer.
+            in_channels: channels of the input features.
+            cond_channels: hidden width of every convolution.
+        """
+        super().__init__()
+
+        self.num_class = num_class
+
+        # Five Conv1d + ELU pairs; only the first conv changes the channel count.
+        layers = []
+        for width in [in_channels] + [cond_channels] * 4:
+            layers.append(
+                weight_norm(nn.Conv1d(width, cond_channels, kernel_size=3, padding=1))
+            )
+            layers.append(nn.ELU())
+        self.condnet = nn.Sequential(*layers)
+
+        self.classifier = nn.Linear(in_features=cond_channels, out_features=self.num_class)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Map (B, in_channels, T) features to absolute classifier outputs.
+
+        Returns:
+            The absolute value of the classifier output, with the trailing
+            axis squeezed away when ``num_class`` is 1.
+        """
+        hidden = self.condnet(x).transpose(1, 2)
+        logits = self.classifier(hidden)
+        return torch.abs(logits.squeeze(-1))
--- a/examples/music_generation/inspiremusic/hifigan/generator.py
+++ b/examples/music_generation/inspiremusic/hifigan/generator.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""HIFI-GAN"""
+
+from typing import Dict, Optional, List
+import numpy as np
+from scipy.signal import get_window
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import Conv1d
+from torch.nn import ConvTranspose1d
+from torch.nn.utils import remove_weight_norm
+from torch.nn.utils import weight_norm
+from torch.distributions.uniform import Uniform
+
+from inspiremusic.transformer.activation import Snake
+from inspiremusic.utils.common import get_padding
+from inspiremusic.utils.common import init_weights
+
+
+"""HiFi-GAN based generator implementation.
+
+This code is modified from https://github.com/jik876/hifi-gan,
+https://github.com/kan-bayashi/ParallelWaveGAN and
+https://github.com/NVIDIA/BigVGAN.
+
+"""
+
+
+class ResBlock(torch.nn.Module):
+    """Residual block module in HiFiGAN/BigVGAN."""
+    def __init__(
+        self,
+        channels: int = 512,
+        kernel_size: int = 3,
+        dilations: List[int] = [1, 3, 5],
+    ):
+        """Create one (dilated conv, plain conv) pair per dilation rate.
+
+        Args:
+            channels: channel count of every convolution and activation.
+            kernel_size: kernel size of every convolution.
+            dilations: dilation rate of the first convolution in each pair.
+        """
+        super(ResBlock, self).__init__()
+        self.convs1 = nn.ModuleList()
+        self.convs2 = nn.ModuleList()
+
+        for rate in dilations:
+            dilated = weight_norm(
+                Conv1d(channels, channels, kernel_size, 1,
+                       dilation=rate, padding=get_padding(kernel_size, rate))
+            )
+            self.convs1.append(dilated)
+            plain = weight_norm(
+                Conv1d(channels, channels, kernel_size, 1,
+                       dilation=1, padding=get_padding(kernel_size, 1))
+            )
+            self.convs2.append(plain)
+
+        self.convs1.apply(init_weights)
+        self.convs2.apply(init_weights)
+
+        # One Snake activation in front of every convolution.
+        self.activations1 = nn.ModuleList(
+            [Snake(channels, alpha_logscale=False) for _ in self.convs1]
+        )
+        self.activations2 = nn.ModuleList(
+            [Snake(channels, alpha_logscale=False) for _ in self.convs2]
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Apply every activation/conv pair, adding a skip connection after each."""
+        stages = zip(self.activations1, self.convs1, self.activations2, self.convs2)
+        for act_a, conv_a, act_b, conv_b in stages:
+            branch = conv_a(act_a(x))
+            branch = conv_b(act_b(branch))
+            x = branch + x
+        return x
+
+    def remove_weight_norm(self):
+        """Strip weight normalisation from all convolutions of the block."""
+        for conv_a, conv_b in zip(self.convs1, self.convs2):
+            remove_weight_norm(conv_a)
+            remove_weight_norm(conv_b)
+
+
+class SineGen(torch.nn.Module):
+    """ Definition of sine generator
+    SineGen(samp_rate, harmonic_num = 0,
+            sine_amp = 0.1, noise_std = 0.003,
+            voiced_threshold = 0)
+    samp_rate: sampling rate in Hz
+    harmonic_num: number of harmonic overtones (default 0)
+    sine_amp: amplitude of the sine waveform (default 0.1)
+    noise_std: std of Gaussian noise (default 0.003)
+    voiced_threshold: F0 threshold for U/V classification (default 0)
+    """
+
+    def __init__(self, samp_rate, harmonic_num=0,
+                 sine_amp=0.1, noise_std=0.003,
+                 voiced_threshold=0):
+        super(SineGen, self).__init__()
+        self.sampling_rate = samp_rate
+        self.harmonic_num = harmonic_num
+        self.sine_amp = sine_amp
+        self.noise_std = noise_std
+        self.voiced_threshold = voiced_threshold
+
+    def _f02uv(self, f0):
+        """Return a float32 mask that is 1 where ``f0`` exceeds the voiced threshold."""
+        return (f0 > self.voiced_threshold).type(torch.float32)
+
+    @torch.no_grad()
+    def forward(self, f0):
+        """
+        :param f0: [B, 1, sample_len], Hz
+        :return: tuple (sine_waves, uv, noise); sine_waves and noise are
+            [B, harmonic_num + 1, sample_len], uv is [B, 1, sample_len]
+        """
+        n_waves = self.harmonic_num + 1
+
+        # Per-sample normalised frequency of the fundamental and each overtone.
+        F_mat = torch.zeros((f0.size(0), n_waves, f0.size(-1))).to(f0.device)
+        for order in range(1, n_waves + 1):
+            F_mat[:, order - 1: order, :] = f0 * order / self.sampling_rate
+
+        # Integrate frequency into phase, wrapped to one period.
+        theta_mat = 2 * np.pi * (torch.cumsum(F_mat, dim=-1) % 1)
+
+        # Random initial phase per overtone; the fundamental starts at zero.
+        phase_dist = Uniform(low=-np.pi, high=np.pi)
+        phase_vec = phase_dist.sample(sample_shape=(f0.size(0), n_waves, 1)).to(F_mat.device)
+        phase_vec[:, 0, :] = 0
+
+        # generate sine waveforms
+        sine_waves = self.sine_amp * torch.sin(theta_mat + phase_vec)
+
+        # generate uv signal
+        uv = self._f02uv(f0)
+
+        # Noise std: noise_std in voiced regions, sine_amp / 3 in unvoiced
+        # regions (so its peak is comparable to sine_amp).
+        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
+        noise = noise_amp * torch.randn_like(sine_waves)
+
+        # Zero the unvoiced part via uv, then add the noise.
+        sine_waves = sine_waves * uv + noise
+        return sine_waves, uv, noise
+
+
+class SourceModuleHnNSF(torch.nn.Module):
+    """ SourceModule for hn-nsf
+    SourceModule(sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0)
+    sampling_rate: sampling_rate in Hz
+    upsample_scale: accepted but not used by this class
+    harmonic_num: number of harmonics above F0 (default: 0)
+    sine_amp: amplitude of sine source signal (default: 0.1)
+    add_noise_std: std of additive Gaussian noise (default: 0.003)
+        note that the amplitude of noise in unvoiced regions is decided
+        by sine_amp
+    voiced_threshod: threshold to set U/V given F0 (default: 0)
+    Sine_source, noise_source, uv = SourceModuleHnNSF(F0_sampled)
+    F0_sampled (batchsize, length, 1)
+    Sine_source (batchsize, length, 1)
+    noise_source (batchsize, length, 1)
+    uv (batchsize, length, 1)
+    """
+
+    def __init__(self, sampling_rate, upsample_scale, harmonic_num=0, sine_amp=0.1,
+                 add_noise_std=0.003, voiced_threshod=0):
+        super(SourceModuleHnNSF, self).__init__()
+
+        self.sine_amp = sine_amp
+        self.noise_std = add_noise_std
+
+        # Sine generator for the fundamental and its overtones.
+        self.l_sin_gen = SineGen(
+            samp_rate=sampling_rate,
+            harmonic_num=harmonic_num,
+            sine_amp=sine_amp,
+            noise_std=add_noise_std,
+            voiced_threshold=voiced_threshod,
+        )
+
+        # Collapse all harmonics into a single excitation channel.
+        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
+        self.l_tanh = torch.nn.Tanh()
+
+    def forward(self, x):
+        """
+        Sine_source, noise_source, uv = SourceModuleHnNSF(F0_sampled)
+        F0_sampled (batchsize, length, 1)
+        Sine_source (batchsize, length, 1)
+        noise_source (batchsize, length, 1)
+        uv (batchsize, length, 1)
+        """
+        # Harmonic branch: the sine generator works channel-first, so
+        # transpose on the way in and on the way out.
+        with torch.no_grad():
+            harmonics, voiced, _ = self.l_sin_gen(x.transpose(1, 2))
+            harmonics = harmonics.transpose(1, 2)
+            voiced = voiced.transpose(1, 2)
+        excitation = self.l_tanh(self.l_linear(harmonics))
+
+        # Noise branch, same shape as the voiced/unvoiced mask.
+        noise_source = torch.randn_like(voiced) * self.sine_amp / 3
+        return excitation, noise_source, voiced
+
+
+class HiFTGenerator(nn.Module):
+    """
+    HiFTNet Generator: Neural Source Filter + ISTFTNet
+    https://arxiv.org/abs/2309.09493
+    """
+    def __init__(
+            self,
+            in_channels: int = 80,
+            base_channels: int = 512,
+            nb_harmonics: int = 8,
+            sampling_rate: int = 22050,
+            nsf_alpha: float = 0.1,
+            nsf_sigma: float = 0.003,
+            nsf_voiced_threshold: float = 10,
+            upsample_rates: List[int] = [8, 8],
+            upsample_kernel_sizes: List[int] = [16, 16],
+            istft_params: Dict[str, int] = {"n_fft": 16, "hop_len": 4},
+            resblock_kernel_sizes: List[int] = [3, 7, 11],
+            resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+            source_resblock_kernel_sizes: List[int] = [7, 11],
+            source_resblock_dilation_sizes: List[List[int]] = [[1, 3, 5], [1, 3, 5]],
+            lrelu_slope: float = 0.1,
+            audio_limit: float = 0.99,
+            f0_predictor: torch.nn.Module = None,
+    ):
+        """Build the source module, the upsampling stack and the iSTFT head.
+
+        Args:
+            in_channels: channel count of the input feature fed to ``conv_pre``.
+            base_channels: channel count after ``conv_pre``; halved at every
+                upsampling stage.
+            nb_harmonics: passed to the source module as ``harmonic_num``.
+            sampling_rate: passed to the source module.
+            nsf_alpha: passed to the source module as ``sine_amp``.
+            nsf_sigma: passed to the source module as ``add_noise_std``.
+            nsf_voiced_threshold: passed to the source module as
+                ``voiced_threshod``.
+            upsample_rates: stride of each transposed convolution.
+            upsample_kernel_sizes: kernel size of each transposed convolution.
+            istft_params: dict with ``"n_fft"`` and ``"hop_len"``.
+            resblock_kernel_sizes: kernel sizes of the residual blocks that
+                follow every upsampling stage.
+            resblock_dilation_sizes: dilations matching ``resblock_kernel_sizes``.
+            source_resblock_kernel_sizes: kernel sizes of the residual blocks
+                on the source branch (one per upsampling stage).
+            source_resblock_dilation_sizes: dilations matching
+                ``source_resblock_kernel_sizes``.
+            lrelu_slope: negative slope stored for the leaky ReLUs in ``decode``.
+            audio_limit: absolute bound used to clamp the output waveform.
+            f0_predictor: module called on the input feature to predict F0.
+
+        NOTE(review): the list/dict defaults are shared between calls; they are
+        only read here, never mutated.
+        """
+        super(HiFTGenerator, self).__init__()
+
+        self.out_channels = 1
+        self.nb_harmonics = nb_harmonics
+        self.sampling_rate = sampling_rate
+        self.istft_params = istft_params
+        self.lrelu_slope = lrelu_slope
+        self.audio_limit = audio_limit
+
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        # Source module; its upsample_scale is the product of all upsample
+        # rates times the iSTFT hop length.
+        self.m_source = SourceModuleHnNSF(
+            sampling_rate=sampling_rate,
+            upsample_scale=np.prod(upsample_rates) * istft_params["hop_len"],
+            harmonic_num=nb_harmonics,
+            sine_amp=nsf_alpha,
+            add_noise_std=nsf_sigma,
+            voiced_threshod=nsf_voiced_threshold)
+        # F0 is upsampled by the same overall factor before entering m_source.
+        self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates) * istft_params["hop_len"])
+
+        # Input projection (kernel 7, padding 3 keeps the length unchanged).
+        self.conv_pre = weight_norm(
+            Conv1d(in_channels, base_channels, 7, 1, padding=3)
+        )
+
+        # Upsampling path: stage i maps base_channels // 2**i channels to
+        # base_channels // 2**(i + 1) with stride u and kernel k.
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        base_channels // (2**i),
+                        base_channels // (2**(i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2,
+                    )
+                )
+            )
+
+        # Source branch: one strided Conv1d + ResBlock per upsampling stage.
+        # Each conv takes n_fft + 2 input channels and emits the channel count
+        # of the matching upsampling stage.
+        self.source_downs = nn.ModuleList()
+        self.source_resblocks = nn.ModuleList()
+        # Strides are the reversed cumulative products of
+        # [1] + reversed(upsample_rates)[:-1]; e.g. rates [8, 8] give [8, 1].
+        downsample_rates = [1] + upsample_rates[::-1][:-1]
+        downsample_cum_rates = np.cumprod(downsample_rates)
+        for i, (u, k, d) in enumerate(zip(downsample_cum_rates[::-1], source_resblock_kernel_sizes, source_resblock_dilation_sizes)):
+            if u == 1:
+                # stride 1: a pointwise convolution
+                self.source_downs.append(
+                    Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), 1, 1)
+                )
+            else:
+                # stride u with kernel 2 * u
+                self.source_downs.append(
+                    Conv1d(istft_params["n_fft"] + 2, base_channels // (2 ** (i + 1)), u * 2, u, padding=(u // 2))
+                )
+
+            self.source_resblocks.append(
+                ResBlock(base_channels // (2 ** (i + 1)), k, d)
+            )
+
+        # Main residual blocks, stored flat: num_kernels blocks per upsampling
+        # stage, indexed as i * num_kernels + j in decode().
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = base_channels // (2**(i + 1))
+            for _, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+                self.resblocks.append(ResBlock(ch, k, d))
+
+        # Output projection to n_fft + 2 channels; `ch` is the channel count
+        # of the last upsampling stage (left over from the loop above).
+        self.conv_post = weight_norm(Conv1d(ch, istft_params["n_fft"] + 2, 7, 1, padding=3))
+        self.ups.apply(init_weights)
+        self.conv_post.apply(init_weights)
+        # Pads one sample on the left; applied after the last upsampling stage.
+        self.reflection_pad = nn.ReflectionPad1d((1, 0))
+        # Hann window of length n_fft, kept as a plain tensor attribute and
+        # moved to the input's device on every (i)STFT call.
+        self.stft_window = torch.from_numpy(get_window("hann", istft_params["n_fft"], fftbins=True).astype(np.float32))
+        self.f0_predictor = f0_predictor
+
+    def remove_weight_norm(self):
+        print('Removing weight norm...')
+        for l in self.ups:
+            remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+        remove_weight_norm(self.conv_pre)
+        remove_weight_norm(self.conv_post)
+        self.m_source.remove_weight_norm()
+        for l in self.source_downs:
+            remove_weight_norm(l)
+        for l in self.source_resblocks:
+            l.remove_weight_norm()
+
+    def _stft(self, x):
+        spec = torch.stft(
+            x,
+            self.istft_params["n_fft"], self.istft_params["hop_len"], self.istft_params["n_fft"], window=self.stft_window.to(x.device),
+            return_complex=True)
+        spec = torch.view_as_real(spec)  # [B, F, TT, 2]
+        return spec[..., 0], spec[..., 1]
+
+    def _istft(self, magnitude, phase):
+        magnitude = torch.clip(magnitude, max=1e2)
+        real = magnitude * torch.cos(phase)
+        img = magnitude * torch.sin(phase)
+        inverse_transform = torch.istft(torch.complex(real, img), self.istft_params["n_fft"], self.istft_params["hop_len"],
+                                        self.istft_params["n_fft"], window=self.stft_window.to(magnitude.device))
+        return inverse_transform
+
+    def decode(self, x: torch.Tensor, s: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
+        s_stft_real, s_stft_imag = self._stft(s.squeeze(1))
+        s_stft = torch.cat([s_stft_real, s_stft_imag], dim=1)
+
+        x = self.conv_pre(x)
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, self.lrelu_slope)
+            x = self.ups[i](x)
+
+            if i == self.num_upsamples - 1:
+                x = self.reflection_pad(x)
+
+            # fusion
+            si = self.source_downs[i](s_stft)
+            si = self.source_resblocks[i](si)
+            x = x + si
+
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        magnitude = torch.exp(x[:, :self.istft_params["n_fft"] // 2 + 1, :])
+        phase = torch.sin(x[:, self.istft_params["n_fft"] // 2 + 1:, :])  # actually, sin is redundancy
+
+        x = self._istft(magnitude, phase)
+        x = torch.clamp(x, -self.audio_limit, self.audio_limit)
+        return x
+
+    def forward(
+            self,
+            batch: dict,
+            device: torch.device,
+    ) -> Dict[str, Optional[torch.Tensor]]:
+        speech_feat = batch['speech_feat'].transpose(1, 2).to(device)
+        # mel->f0
+        f0 = self.f0_predictor(speech_feat)
+        # f0->source
+        s = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
+        s, _, _ = self.m_source(s)
+        s = s.transpose(1, 2)
+        # mel+source->speech
+        generated_speech = self.decode(x=speech_feat, s=s)
+        return generated_speech, f0
+
+    @torch.inference_mode()
+    def inference(self, speech_feat: torch.Tensor, cache_source: torch.Tensor = torch.zeros(1, 1, 0)) -> torch.Tensor:
+        # mel->f0
+        f0 = self.f0_predictor(speech_feat)
+        # f0->source
+        s = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
+        s, _, _ = self.m_source(s)
+        s = s.transpose(1, 2)
+        # use cache_source to avoid glitch
+        if cache_source.shape[2] != 0:
+            s[:, :, :cache_source.shape[2]] = cache_source
+        generated_speech = self.decode(x=speech_feat, s=s)
+        return generated_speech, s
--- a/examples/music_generation/inspiremusic/hifigan/hifigan.py
+++ b/examples/music_generation/inspiremusic/hifigan/hifigan.py
+from typing import Dict, Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from matcha.hifigan.models import feature_loss, generator_loss, discriminator_loss
+from inspiremusic.utils.losses import tpr_loss, mel_loss
+
+class HiFiGan(nn.Module):
+    def __init__(self, generator, discriminator, mel_spec_transform,
+                 multi_mel_spectral_recon_loss_weight=45, feat_match_loss_weight=2.0,
+                 tpr_loss_weight=1.0, tpr_loss_tau=0.04):
+        super(HiFiGan, self).__init__()
+        self.generator = generator
+        self.discriminator = discriminator
+        self.mel_spec_transform = mel_spec_transform
+        self.multi_mel_spectral_recon_loss_weight = multi_mel_spectral_recon_loss_weight
+        self.feat_match_loss_weight = feat_match_loss_weight
+        self.tpr_loss_weight = tpr_loss_weight
+        self.tpr_loss_tau = tpr_loss_tau
+
+    def forward(
+            self,
+            batch: dict,
+            device: torch.device,
+    ) -> Dict[str, Optional[torch.Tensor]]:
+        if batch['turn'] == 'generator':
+            return self.forward_generator(batch, device)
+        else:
+            return self.forward_discriminator(batch, device)
+
+    def forward_generator(self, batch, device):
+        real_speech = batch['speech'].to(device)
+        pitch_feat = batch['pitch_feat'].to(device)
+        # 1. calculate generator outputs
+        generated_speech, generated_f0 = self.generator(batch, device)
+        # 2. calculate discriminator outputs
+        y_d_rs, y_d_gs, fmap_rs, fmap_gs = self.discriminator(real_speech, generated_speech)
+        # 3. calculate generator losses, feature loss, mel loss, tpr losses [Optional]
+        loss_gen, _ = generator_loss(y_d_gs)
+        loss_fm = feature_loss(fmap_rs, fmap_gs)
+        loss_mel = mel_loss(real_speech, generated_speech, self.mel_spec_transform)
+        if self.tpr_loss_weight != 0:
+            loss_tpr = tpr_loss(y_d_rs, y_d_gs, self.tpr_loss_tau)
+        else:
+            loss_tpr = torch.zeros(1).to(device)
+        loss_f0 = F.l1_loss(generated_f0, pitch_feat)
+        loss = loss_gen + self.feat_match_loss_weight * loss_fm + \
+            self.multi_mel_spectral_recon_loss_weight * loss_mel + \
+            self.tpr_loss_weight * loss_tpr + loss_f0
+        return {'loss': loss, 'loss_gen': loss_gen, 'loss_fm': loss_fm, 'loss_mel': loss_mel, 'loss_tpr': loss_tpr, 'loss_f0': loss_f0}
+
+    def forward_discriminator(self, batch, device):
+        real_speech = batch['speech'].to(device)
+        # 1. calculate generator outputs
+        with torch.no_grad():
+            generated_speech, generated_f0 = self.generator(batch, device)
+        # 2. calculate discriminator outputs
+        y_d_rs, y_d_gs, fmap_rs, fmap_gs = self.discriminator(real_speech, generated_speech)
+        # 3. calculate discriminator losses, tpr losses [Optional]
+        loss_disc, _, _ = discriminator_loss(y_d_rs, y_d_gs)
+        if self.tpr_loss_weight != 0:
+            loss_tpr = tpr_loss(y_d_rs, y_d_gs, self.tpr_loss_tau)
+        else:
+            loss_tpr = torch.zeros(1).to(device)
+        loss = loss_disc + self.tpr_loss_weight * loss_tpr
+        return {'loss': loss, 'loss_disc': loss_disc, 'loss_tpr': loss_tpr}
--- a/examples/music_generation/inspiremusic/llm/__pycache__/llm.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/llm/__pycache__/llm.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/llm/llm.py
+++ b/examples/music_generation/inspiremusic/llm/llm.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict, Optional, Callable, List, Generator
+import torch
+from torch import nn
+from torch.nn.utils.rnn import pad_sequence, unpad_sequence
+from inspiremusic.utils.common import IGNORE_ID
+from inspiremusic.transformer.label_smoothing_loss import LabelSmoothingLoss
+from inspiremusic.utils.common import th_accuracy, DTYPES
+from torch import Tensor
+from math import log
+from einops import rearrange, reduce, repeat
+import logging
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+class SinusoidalEmbedding(nn.Module):
+    def __init__(self, dim: int):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x: Tensor) -> Tensor:
+        device, half_dim = x.device, self.dim // 2
+        emb = torch.tensor(log(10000) / (half_dim - 1), device=device)
+        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
+        emb = rearrange(x, "i -> i 1") * rearrange(emb, "j -> 1 j")
+        return torch.cat((emb.sin(), emb.cos()), dim=-1).to(torch.float16)
+
+class LLM(torch.nn.Module):
+    def __init__(
+            self,
+            text_encoder_input_size: int,
+            llm_input_size: int,
+            llm_output_size: int,
+            audio_token_size: int,
+            llm: torch.nn.Module,
+            sampling: Callable,
+            text_encoder_conf: Dict = None,
+            length_normalized_loss: bool = True,
+            lsm_weight: float = 0.0,
+            frozen_input_embed: bool = False,
+            dtype: str = "fp16",
+            **kwargs,
+    ):
+        super().__init__()
+        self.dtype = DTYPES.get(dtype, torch.float32)
+        self.llm_input_size = llm_input_size
+        self.audio_token_size = audio_token_size
+        # 1. build text token inputs related modules
+
+        if llm is None:
+            self.text_embedding = torch.nn.Embedding(text_token_size, text_encoder_input_size)
+        else:
+            self.text_embedding = llm.model.model.embed_tokens
+            if frozen_input_embed:
+                print("Freezing input embedding layer")
+                for p in self.text_embedding.parameters():
+                    p.requires_grad = False
+        self.chorus_embedding = torch.nn.Embedding(5, llm_input_size)  # intro, chorus, verse1, verse2 , outro
+
+        self.text_encoder_conf = text_encoder_conf
+        self.text_encoder = self.build_encoder(text_encoder_conf)
+        self.infer_cfg_ratio = kwargs.get("infer_cfg_ratio", None)
+        logging.info(f"infer_cfg_ratio: {self.infer_cfg_ratio}")
+        self.train_cfg_ratio = kwargs.get("train_cfg_ratio", None)
+        logging.info(f"train_cfg_ratio: {self.train_cfg_ratio}")
+        # 2. build audio token language model related modules
+        self.sos_eos = 0
+        self.task_id = 1
+
+        self.llm_embedding = torch.nn.Embedding(2, llm_input_size)
+        self.llm = llm
+        self.llm_decoder = nn.Linear(llm_output_size, audio_token_size + 1)
+        self.criterion_ce = LabelSmoothingLoss(
+                size=audio_token_size + 1,
+                padding_idx=IGNORE_ID,
+                smoothing=lsm_weight,
+                normalize_length=length_normalized_loss,
+        )
+
+        # 3. [Optional] build audio token related modules
+        self.speech_embedding = torch.nn.Embedding(audio_token_size, llm_input_size)
+        self.spk_embed_affine_layer = torch.nn.Linear(192, llm_input_size)
+        self.num_codebooks = 4
+        # 4. sampling method
+        self.sampling = sampling
+        self.time_embedding = SinusoidalEmbedding(llm_input_size)
+
+    def cfg_dropout(self, text_token, text_token_len, p):
+        # Classifier-Free Guidance Dropout
+        B = text_token.size(0)
+        num_samples_to_mask = int(p * B)
+        if num_samples_to_mask == 0:
+            num_samples_to_mask = 1
+        indices_to_mask = torch.randperm(B, device=text_token.device)[:num_samples_to_mask]
+        text_token[indices_to_mask] = 0
+        text_token_len[indices_to_mask] = 0
+
+        return text_token, text_token_len
+
+    def build_encoder(self, encoder_conf=None):
+        if encoder_conf is None:
+            assert hasattr(self, "encoder_conf"), \
+                "function param encoder_conf is None and model doesn't has encoder_conf attribute either."
+            encoder_conf = self.encoder_conf
+
+        encoder_name = encoder_conf.pop("name", "transformer")
+        model = None
+        if encoder_name == "transformer":
+            from inspiremusic.transformer.encoder.conformer_encoder import ConformerEncoder
+            model = ConformerEncoder(
+                    **encoder_conf,
+                    input_size=self.input_size,
+                    use_cnn_module=False,
+                    macaron_style=False,
+            )
+        elif encoder_name == "conformer":
+            from inspiremusic.transformer.encoder.conformer_encoder import ConformerEncoder
+            model = ConformerEncoder(
+                    **encoder_conf,
+                    input_size=self.input_size,
+            )
+        elif encoder_name == "llama_encoder":
+            from inspiremusic.transformer.encoder.llama_encoder import LlamaEncoder
+            model = LlamaEncoder(
+                    **encoder_conf,
+                    input_size=self.input_size,
+            )
+        elif encoder_name == "qwen2":
+            from inspiremusic.transformer.encoder.qwen_encoder import QwenEncoder
+            model = QwenEncoder(
+                    **encoder_conf,
+                    input_size=self.input_size,
+            )
+        elif encoder_name == "qwen2.5":
+            from inspiremusic.transformer.encoder.qwen_encoder import QwenEncoder
+            model = QwenEncoder(
+                    **encoder_conf,
+                    input_size=self.input_size,
+            )
+
+        encoder_conf["name"] = encoder_name
+
+        return model
+
+    def encode(self,
+            text: torch.Tensor,
+            text_lengths: torch.Tensor):
+        if self.text_encoder is not None:
+            encoder_out, encoder_mask = self.text_encoder(text, text_lengths,
+                                                          decoding_chunk_size=1,
+                                                          num_decoding_left_chunks=-1)
+            encoder_out_lens = encoder_mask.squeeze(1).sum(1)
+            encoder_out = self.text_encoder_affine_layer(encoder_out)
+        else:
+            encoder_out, encoder_out_lens = text, text_lengths
+        return encoder_out, encoder_out_lens
+
+    def pad_unpad_sequence(self, sos_eos_emb, embeddings, text_token,
+                           text_token_len, task_id_emb, audio_token,
+                           audio_token_len, seg_len):
+        text_token = unpad_sequence(text_token, text_token_len.cpu(),
+                                    batch_first=True)
+
+        audio_token = unpad_sequence(audio_token, audio_token_len.cpu(),
+                                     batch_first=True)
+
+        for i in range(len(embeddings)):
+            embeddings[i] = unpad_sequence(embeddings[i], seg_len.cpu(), batch_first=True)
+
+        lm_input = [torch.concat([sos_eos_emb.squeeze(dim=0)] + [embedding[i] for embedding in embeddings] + [text_token[i], task_id_emb.squeeze(dim=0), audio_token[i]], dim=0) for i in range(len(text_token))]
+        lm_input_len = torch.tensor([i.size(0) for i in lm_input], dtype=torch.int32)
+        lm_input = pad_sequence(lm_input, batch_first=True, padding_value=IGNORE_ID)
+        return lm_input, lm_input_len
+
+    def forward(
+            self,
+            batch: dict,
+            device: torch.device,
+    ) -> Dict[str, Optional[torch.Tensor]]:
+        """
+        Args:
+            text: (B, L, D)
+            text_lengths: (B,)
+            audio: (B, T, N) or (B, T)
+            audio_lengths: (B,)
+        """
+        mask = True
+        text_token = batch['text_token'].to(device)
+        text_token_len = batch['text_token_len'].to(device)
+        if "semantic_token" not in batch:
+            audio_token = batch['acoustic_token'].to(device)
+            audio_token_len = batch['acoustic_token_len'].to(device)
+            audio_token = audio_token.view(audio_token.size(0), -1, self.num_codebooks)
+            audio_token = audio_token[:, :, 0]
+            audio_token_len = (audio_token_len / self.num_codebooks).long()
+
+        else:
+            audio_token = batch['semantic_token'].to(device)
+            audio_token_len = batch['semantic_token_len'].to(device)
+
+        time_start = batch['time_start'].to(device)
+        time_end = batch['time_end'].to(device)
+        chorus = batch['chorus'].to(device)
+        # 1. encode text_token
+
+        if self.train_cfg_ratio > 0:
+            # Classifier-Free Guidance
+            text_token, _ = self.cfg_dropout(text_token, text_token_len, self.train_cfg_ratio)
+
+        # 2. Time Embedding & chorus embedding
+        text_token = self.text_embedding(text_token)
+        text_token, text_token_len = self.encode(text_token, text_token_len)
+        if mask:
+            time_mask = time_start != -1.0
+            seg_len = time_mask.sum(-1)
+            time_start = time_start.masked_fill(~time_mask, 0.0)
+            time_end = time_end.masked_fill(~time_mask, 0.0)
+            chorus = chorus.masked_fill(~time_mask, 0)
+            time_start_embed = self.time_embedding(time_start.view(-1)).to(text_token.dtype)
+            time_end_embed = self.time_embedding(time_end.view(-1)).to(text_token.dtype)
+            time_start_embed = time_start_embed.view(chorus.size(0), chorus.size(1), -1)
+            time_end_embed = time_end_embed.view(chorus.size(0), chorus.size(1), -1)
+            chorus_embed = self.chorus_embedding(chorus)
+            lm_target = [torch.tensor([IGNORE_ID] * (1 + 3 * seg_len[i] + text_token_len[i]) + audio_token[i,:audio_token_len[i]].tolist() + [self.audio_token_size]) for i in range(text_token.size(0))]
+        else:
+            time_start_embed = self.time_embedding(time_start).to(text_token.dtype)
+            time_end_embed = self.time_embedding(time_end).to(text_token.dtype)
+            chorus_embed = self.chorus_embedding(chorus)
+
+            lm_target = [torch.tensor(
+                [IGNORE_ID] * (4 + text_token_len[i]) + audio_token[i,:audio_token_len[i]].tolist() + [self.audio_token_size]) for i in range(text_token.size(0))]
+
+        lm_target = pad_sequence(lm_target, batch_first=True, padding_value=IGNORE_ID).to(device)
+
+        # 3. eos and task_id
+        sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
+        task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
+
+        # 4. encode audio_token
+        audio_token = self.speech_embedding(audio_token)
+
+        # 5. unpad and pad
+        lm_input, lm_input_len = self.pad_unpad_sequence(sos_eos_emb,
+                                                         [time_start_embed,
+                                                          time_end_embed,
+                                                          chorus_embed],
+                                                         text_token,
+                                                         text_token_len,
+                                                         task_id_emb,
+                                                         audio_token,
+                                                         audio_token_len,
+                                                         seg_len)
+        # 6. run lm forward
+        lm_output, lm_output_mask = self.llm(lm_input.to(self.dtype), lm_input_len.to(device))
+        logits = self.llm_decoder(lm_output)
+        loss = self.criterion_ce(logits, lm_target)
+
+        acc = th_accuracy(logits.view(-1, self.audio_token_size + 1), lm_target, ignore_label=IGNORE_ID)
+
+        return {'loss': loss, 'acc': acc}
+
+    def sampling_ids(
+            self,
+            weighted_scores: torch.Tensor,
+            decoded_tokens: List,
+            ignore_eos: bool = True,
+    ):
+        top_ids = self.sampling(weighted_scores, decoded_tokens)
+        return top_ids
+
+    @torch.inference_mode()
+    def inference(
+            self,
+            text: torch.Tensor,
+            text_len: torch.Tensor,
+            audio_token: torch.Tensor,
+            audio_token_len: torch.Tensor,
+            prompt_text: torch.Tensor,
+            prompt_text_len: torch.Tensor,
+            prompt_audio_token: torch.Tensor,
+            prompt_audio_token_len: torch.Tensor,
+            embeddings: List,
+            duration_to_gen: float = 30,
+            task: str = "continuation",
+            token_rate: int = 75,
+            limit_audio_prompt_len: int = 5,
+    ) -> Generator[torch.Tensor, None, None]:
+        device = text.device
+
+        if text is not None:
+            text = torch.concat([prompt_text, text], dim=1)
+            text_len += prompt_text_len
+            infer_cfg = self.infer_cfg_ratio >= 0.0
+            if infer_cfg:
+                text_cfg = self.text_embedding(text.new_zeros(text.shape))
+            text = self.text_embedding(text)
+
+            # 1. encode text
+            text, text_len = self.encode(text, text_len)
+
+        # 2. encode embedding
+        if embeddings is not None:
+            time_start, time_end, chorus = embeddings
+
+            if len(chorus.shape) == 1:
+                time_start_embed = self.time_embedding(time_start).reshape(1, 1, -1)  # .half()
+                time_end_embed = self.time_embedding(time_end).reshape(1, 1, -1)  # .half()
+                chorus_embed = self.chorus_embedding(chorus).reshape(1, 1, -1)  # .half()
+            else:
+                time_start_embed = self.time_embedding(
+                    time_start.view(-1)).reshape(1, chorus.size(1), -1)  # .half()
+                time_end_embed = self.time_embedding(time_end.view(-1)).reshape(1, chorus.size(1), -1)  # .half()
+                chorus_embed = self.chorus_embedding(chorus)  # .half()
+
+        # 3. concat llm_input
+        sos_eos_emb = self.llm_embedding.weight[self.sos_eos].reshape(1, 1, -1)
+        task_id_emb = self.llm_embedding.weight[self.task_id].reshape(1, 1, -1)
+
+        if audio_token_len:
+            audio_token = audio_token[:, :(limit_audio_prompt_len * token_rate)]
+            audio_token_emb = self.speech_embedding(audio_token)
+        else:
+            audio_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
+
+        if prompt_audio_token_len:
+            prompt_audio_token_emb = self.speech_embedding(prompt_audio_token)
+        else:
+            prompt_audio_token_emb = torch.zeros(1, 0, self.llm_input_size, dtype=text.dtype).to(device)
+        # Check if removing prompt audio token will fail decoding.
+
+        if task == "continuation":
+            lm_input = torch.concat(
+                    [sos_eos_emb, time_start_embed, time_end_embed,
+                     chorus_embed, text, task_id_emb, audio_token_emb], dim=1)
+
+            if infer_cfg:
+                audio_cfg = self.speech_embedding(
+                    audio_token.new_zeros(audio_token.shape))
+                lm_cf_input = torch.concat(
+                        [sos_eos_emb, torch.rand_like(time_start_embed),
+                         torch.rand_like(time_end_embed),
+                         torch.rand_like(chorus_embed), text_cfg, task_id_emb,
+                         audio_cfg], dim=1)
+                lm_input = torch.cat([lm_input, lm_cf_input], 0)
+        else:
+            lm_input = torch.concat(
+                    [sos_eos_emb, time_start_embed, time_end_embed,
+                     chorus_embed, text, task_id_emb], dim=1)
+            if infer_cfg:
+                lm_cf_input = torch.concat(
+                        [sos_eos_emb, torch.rand_like(time_start_embed),
+                         torch.rand_like(time_end_embed),
+                         torch.rand_like(chorus_embed), text_cfg, task_id_emb],
+                        dim=1)
+                lm_input = torch.cat([lm_input, lm_cf_input], 0)
+
+        # 4. cal min/max_length
+        min_len = int(0.9 * duration_to_gen * token_rate)
+        max_len = duration_to_gen * token_rate
+
+        # 5. step by step decode
+        out_tokens = []
+        offset = 0
+        state = None
+
+        for i in range(int(max_len)):
+            y_pred, _, state = self.llm.forward_one_step(lm_input.to(self.dtype), torch.ones(lm_input.shape[0], lm_input.shape[1], device=lm_input.device).to(torch.bool), cache=state)
+            logits = self.llm_decoder(y_pred[:, -1])
+            if infer_cfg:
+                # perform context free guidance
+                logits_cf = logits[1]
+                logits = logits[0]
+                infer_cfg_ratio = self.infer_cfg_ratio
+                logits = infer_cfg_ratio * logits + (1 - infer_cfg_ratio) * logits_cf
+
+            logp = logits.log_softmax(dim=-1)
+            logp = logp.squeeze(dim=0)
+
+            if i < int(min_len):
+                logp[self.audio_token_size] = torch.tensor(float('-inf'), dtype=self.dtype)
+
+            top_ids = self.sampling_ids(logp, out_tokens, ignore_eos=i < min_len).item()
+
+            if top_ids == self.audio_token_size:
+                break
+
+            # # in stream mode, yield token one by one
+
+            yield torch.tensor([[top_ids]], dtype=torch.int64, device=device)
+            out_tokens.append(top_ids)
+            offset += lm_input.size(1)
+            lm_input = self.speech_embedding.weight[top_ids].reshape(1, 1, -1)
+            if infer_cfg:
+                lm_input = lm_input.repeat(2, 1, 1)
--- a/examples/music_generation/inspiremusic/metrics/clap_score.py
+++ b/examples/music_generation/inspiremusic/metrics/clap_score.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import requests
+from tqdm import tqdm
+import torch
+import numpy as np
+import laion_clap
+from clap_module.factory import load_state_dict
+import librosa
+import pyloudnorm as pyln
+
+# following documentation from https://github.com/LAION-AI/CLAP
+def int16_to_float32(x):
+    return (x / 32767.0).astype(np.float32)
+
+def float32_to_int16(x):
+    x = np.clip(x, a_min=-1., a_max=1.)
+    return (x * 32767.).astype(np.int16)
+
+
+def clap_score(id2text, audio_path, audio_files_extension='.wav', clap_model='music_audioset_epoch_15_esc_90.14.pt'):
+    """
+    Cosine similarity is computed between the LAION-CLAP text embedding of the given prompt and 
+    the LAION-CLAP audio embedding of the generated audio. LION-CLAP: https://github.com/LAION-AI/CLAP
+    
+    This evaluation script assumes that audio_path files are identified with the ids in id2text.
+    
+    clap_score() evaluates all ids in id2text.
+
+    GPU-based computation.
+
+    Select one of the following models from https://github.com/LAION-AI/CLAP:
+        - music_speech_audioset_epoch_15_esc_89.98.pt (used by musicgen)
+        - music_audioset_epoch_15_esc_90.14.pt
+        - music_speech_epoch_15_esc_89.25.pt
+        - 630k-audioset-fusion-best.pt (our default, with "fusion" to handle longer inputs)
+
+    Params:
+    -- id2text: dictionary with the mapping between id (generated audio filenames in audio_path) 
+                and text (prompt used to generate audio). clap_score() evaluates all ids in id2text.
+    -- audio_path: path where the generated audio files to evaluate are available.
+    -- audio_files_extension: files extension (default .wav) in eval_path.
+    -- clap_model: choose one of the above clap_models (default: '630k-audioset-fusion-best.pt').
+    Returns:
+    -- CLAP-LION score
+    """
+    # load model
+    if clap_model == 'music_speech_audioset_epoch_15_esc_89.98.pt':
+        url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/music_speech_audioset_epoch_15_esc_89.98.pt'
+        clap_path = 'CLAP/music_speech_audioset_epoch_15_esc_89.98.pt'
+        model = laion_clap.CLAP_Module(enable_fusion=False, amodel='HTSAT-base',  device='cuda')
+    elif clap_model == 'music_audioset_epoch_15_esc_90.14.pt':
+        url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/music_audioset_epoch_15_esc_90.14.pt'
+        clap_path = 'CLAP/music_audioset_epoch_15_esc_90.14.pt'
+        model = laion_clap.CLAP_Module(enable_fusion=False, amodel='HTSAT-base',  device='cuda')
+    elif clap_model == 'music_speech_epoch_15_esc_89.25.pt':
+        url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/music_speech_epoch_15_esc_89.25.pt'
+        clap_path = 'CLAP/music_speech_epoch_15_esc_89.25.pt'
+        model = laion_clap.CLAP_Module(enable_fusion=False, amodel='HTSAT-base',  device='cuda')
+    elif clap_model == '630k-audioset-fusion-best.pt':
+        url = 'https://huggingface.co/lukewys/laion_clap/resolve/main/630k-audioset-fusion-best.pt'
+        clap_path = 'CLAP/630k-audioset-fusion-best.pt'
+        model = laion_clap.CLAP_Module(enable_fusion=True, device='cuda')
+    else:
+        raise ValueError('clap_model not implemented')
+
+    # download clap_model if not already downloaded
+    if not os.path.exists(clap_path):
+        print('Downloading ', clap_model, '...')
+        os.makedirs(os.path.dirname(clap_path), exist_ok=True)
+
+        response = requests.get(url, stream=True)
+        total_size = int(response.headers.get('content-length', 0))
+
+        with open(clap_path, 'wb') as file:
+            with tqdm(total=total_size, unit='B', unit_scale=True) as progress_bar:
+                for data in response.iter_content(chunk_size=8192):
+                    file.write(data)
+                    progress_bar.update(len(data))
+
+    # fixing CLAP-LION issue, see: https://github.com/LAION-AI/CLAP/issues/118
+    pkg = load_state_dict(clap_path)
+    pkg.pop('text_branch.embeddings.position_ids', None)
+    model.model.load_state_dict(pkg)
+    model.eval()
+
+    if not os.path.isdir(audio_path):        
+        raise ValueError(f'audio_path: {audio_path} does not exist')
+
+    if id2text:   
+        print('[EXTRACTING TEXT EMBEDDINGS] ')
+        batch_size = 64
+        text_emb = {}
+        for i in tqdm(range(0, len(id2text), batch_size)):
+            batch_ids = list(id2text.keys())[i:i+batch_size]
+            batch_texts = [id2text[id] for id in batch_ids]
+            with torch.no_grad():
+                embeddings = model.get_text_embedding(batch_texts, use_tensor=True)
+            for id, emb in zip(batch_ids, embeddings):
+                text_emb[id] = emb
+
+    else:
+        raise ValueError('Must specify id2text')
+
+    print('[EVALUATING GENERATIONS] ', audio_path)
+    score = 0
+    count = 0
+    for id in tqdm(id2text.keys()):
+        file_path = os.path.join(audio_path, str(id)+audio_files_extension)
+        if os.path.isfile(file_path):
+            with torch.no_grad():
+                audio, _ = librosa.load(file_path, sr=48000, mono=True) # sample rate should be 48000
+                audio = pyln.normalize.peak(audio, -1.0)
+                audio = audio.reshape(1, -1) # unsqueeze (1,T)
+                audio = torch.from_numpy(int16_to_float32(float32_to_int16(audio))).float()
+                audio_embeddings = model.get_audio_embedding_from_data(x = audio, use_tensor=True)
+            cosine_sim = torch.nn.functional.cosine_similarity(audio_embeddings, text_emb[id].unsqueeze(0), dim=1, eps=1e-8)[0]
+            print(f"{id} | CLAP score = {cosine_sim}") 
+            score += cosine_sim
+            count += 1
+
+    return score / count if count > 0 else 0
+
--- a/examples/music_generation/inspiremusic/metrics/openl3_fd.py
+++ b/examples/music_generation/inspiremusic/metrics/openl3_fd.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import openl3
+import librosa
+import numpy as np
+from scipy import linalg
+import glob
+from tqdm import tqdm
+import os
+import soxr
+import pyloudnorm as pyln
+
+
+def calculate_embd_statistics(embd_lst):
+    if isinstance(embd_lst, list):
+        embd_lst = np.array(embd_lst)
+    mu = np.mean(embd_lst, axis=0)
+    sigma = np.cov(embd_lst, rowvar=False)
+    return mu, sigma
+
+
+def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
+    """
+    Adapted from: https://github.com/mseitzer/pytorch-fid/blob/master/src/pytorch_fid/fid_score.py
+    Adapted from: https://github.com/gudgud96/frechet-audio-distance/blob/main/frechet_audio_distance/fad.py
+    
+    Numpy implementation of the Frechet Distance.
+    
+    The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
+    and X_2 ~ N(mu_2, C_2) is
+            d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
+
+    Params:
+    -- mu1: Embedding's mean statistics for generated samples.
+    -- mu2: Embedding's mean statistics for reference samples.
+    -- sigma1: Covariance matrix over embeddings for generated samples.
+    -- sigma2: Covariance matrix over embeddings for reference samples.
+    Returns:
+    --  Fréchet Distance.
+    """
+
+    mu1 = np.atleast_1d(mu1)
+    mu2 = np.atleast_1d(mu2)
+
+    sigma1 = np.atleast_2d(sigma1)
+    sigma2 = np.atleast_2d(sigma2)
+
+    assert mu1.shape == mu2.shape, \
+        'Training and test mean vectors have different lengths'
+    assert sigma1.shape == sigma2.shape, \
+        'Training and test covariances have different dimensions'
+
+    diff = mu1 - mu2
+
+    # product might be almost singular
+    covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
+    if not np.isfinite(covmean).all():
+        msg = ('fid calculation produces singular product; '
+            'adding %s to diagonal of cov estimates') % eps
+        print(msg)
+        offset = np.eye(sigma1.shape[0]) * eps
+        covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
+
+    # numerical error might give slight imaginary component
+    if np.iscomplexobj(covmean):
+        if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
+            m = np.max(np.abs(covmean.imag))
+            raise ValueError('Imaginary component {}'.format(m))
+        covmean = covmean.real
+
+    tr_covmean = np.trace(covmean)
+
+    return (diff.dot(diff) + np.trace(sigma1)
+            + np.trace(sigma2) - 2 * tr_covmean)
+
+
+def extract_embeddings(directory_path, channels, samplingrate, content_type, openl3_hop_size, batch_size=16):
+    """
+    Given a list of files, compute their embeddings in batches.
+
+    If channels == 1: stereo audio is downmixed to mono. Mono embeddings are of dim=512.
+
+    If channels == 2: mono audio is "faked" to stereo by copying the mono channel.
+    Stereo embeddings are of dim=1024, since we concatenate L (dim=512) and R (dim=512) embeddings.
+
+    Params:
+    -- directory_path: path where the generated audio files are available.
+    -- channels: 1 (mono), or 2 (stereo) to get mono or stereo embeddings.
+    -- samplingrate: max bandwidth at which we evaluate the given signals. Up to 48kHz.
+    -- content_type: 'music' or 'env' to select a content type specific openl3 model.
+    -- openl3_hop_size: analysis resolution of openl3 in seconds. Openl3's input window is 1 sec. 
+    -- batch_size: number of audio files to process in each batch.
+    Returns:
+    -- list of embeddings: [np.array[], ...], as expected by calculate_frechet_distance()
+    """
+    _, extension = os.path.splitext(directory_path)
+    if extension.lower() == ".scp":
+        wav_files = []
+        with open(directory_path, "r") as f:
+            for line in f:
+                sec = line.strip().split(" ")
+                wav_files.append(sec[1])
+    else:
+        wav_files = glob.glob(directory_path)
+    if len(wav_files) == 0:
+        raise ValueError('No files with this extension in this path!')
+    model = openl3.models.load_audio_embedding_model(input_repr="mel256", content_type=content_type, embedding_size=512)
+    
+    first = True
+    for i in tqdm(range(0, len(wav_files), batch_size)):
+        batch_files = wav_files[i:i+batch_size]
+        batch_audio_l = []
+        batch_audio_r = []
+        batch_sr = []
+        
+        for file in batch_files:
+            audio, sr = librosa.load(file, sr=None, mono=False)
+            audio = audio.T
+            audio = pyln.normalize.peak(audio, -1.0)            
+            if audio.shape[0] < sr: 
+                print('Audio shorter than 1 sec, openl3 will zero-pad it:', file, audio.shape, sr)
+
+            # resample to the desired evaluation bandwidth
+            audio = soxr.resample(audio, sr, samplingrate) # mono/stereo <- mono/stereo, input sr, output sr
+
+            # mono embeddings are stored in batch_audio_l (R channel not used)
+            if channels == 1:
+                batch_audio_l.append(audio)
+
+            elif channels == 2:
+                if audio.ndim == 1:
+                    # if mono, "fake" stereo by copying mono channel to L and R
+                    batch_audio_l.append(audio)
+                    batch_audio_r.append(audio)
+                elif audio.ndim == 2:
+                    # if it's stereo separate channels for openl3
+                    batch_audio_l.append(audio[:,0])
+                    batch_audio_r.append(audio[:,1])
+
+            batch_sr.append(samplingrate)
+
+        # extracting mono embeddings (dim=512) or the L channel for stereo embeddings
+        emb, _ = openl3.get_audio_embedding(batch_audio_l, batch_sr, model=model, verbose=False, hop_size=openl3_hop_size, batch_size=batch_size)
+
+        # format mono embedding
+        if channels == 1:
+            emb = np.concatenate(emb,axis=0)
+        
+        # extracting stereo embeddings (dim=1024), since we concatenate L (dim=512) and R (dim=512) embeddings
+        elif channels == 2:
+            # extract the missing R channel
+            emb_r, _ = openl3.get_audio_embedding(batch_audio_r, batch_sr, model=model, verbose=False, hop_size=openl3_hop_size, batch_size=batch_size)
+            emb = [np.concatenate([l, r], axis=1) for l, r in zip(emb, emb_r)]
+            emb = np.concatenate(emb, axis=0)
+
+        # concatenate embeddings
+        if first:
+            embeddings = emb
+            first = False
+        else:
+            embeddings = np.concatenate([embeddings, emb], axis=0)
+    
+    # return as a list of embeddings: [np.array[], ...]
+    return [e for e in embeddings]
+
+
+def extract_embeddings_nobatching(directory_path, channels, samplingrate, content_type, openl3_hop_size):
+    """
+    Given a list of files, compute their embeddings one by one.
+
+    If channels == 1: stereo audio is downmixed to mono. Mono embeddings are of dim=512.
+
+    If channels == 2: mono audio is "faked" to stereo by copying the mono channel.
+    Stereo embeddings are of dim=1024, since we concatenate L (dim=512) and R (dim=512) embeddings.
+
+    Params:
+    -- directory_path: path where the generated audio files are available.
+    -- channels: 1 (mono), or 2 (stereo) to get mono or stereo embeddings.
+    -- samplingrate: max bandwidth at which we evaluate the given signals. Up to 48kHz.
+    -- content_type: 'music' or 'env' to select a content type specific openl3 model.
+    -- openl3_hop_size: analysis resolution of openl3 in seconds. Openl3's input window is 1 sec. 
+    Returns:
+    -- list of embeddings: [np.array[], ...], as expected by calculate_frechet_distance()
+    """
+    _, extension = os.path.splitext(directory_path)
+    if extension.lower() == ".scp":
+        wav_files = []
+        with open(directory_path, "r") as f:
+            for line in f:
+                sec = line.strip().split(" ")
+                wav_files.append(sec[1])
+    else:
+        wav_files = glob.glob(directory_path)
+    if len(wav_files) == 0:
+        raise ValueError('No files with this extension in this path!')    
+    model = openl3.models.load_audio_embedding_model(input_repr="mel256", content_type=content_type, embedding_size=512)
+
+    first = True
+    for file in tqdm(wav_files):
+        audio, sr = librosa.load(file, sr=None)
+        audio = pyln.normalize.peak(audio, -1.0)
+        if audio.shape[0] < sr: 
+            print('Audio shorter than 1 sec, openl3 will zero-pad it:', file, audio.shape, sr)
+
+        # resample to the desired evaluation bandwidth
+        audio = soxr.resample(audio, sr, samplingrate) # mono/stereo <- mono/stereo, input sr, output sr
+
+        # extracting stereo embeddings (dim=1024), since we concatenate L (dim=512) and R (dim=512) embeddings
+        if channels == 2:
+            if audio.ndim == 1:
+                audio_l3, sr_l3 = audio, samplingrate
+            elif audio.ndim == 2:
+                # if it's stereo separate channels for openl3
+                audio_l3 = [audio[:,0], audio[:,1]]
+                sr_l3 = [samplingrate, samplingrate]
+            emb, _ = openl3.get_audio_embedding(audio_l3, sr_l3, model=model, verbose=False, hop_size=openl3_hop_size)
+            if audio.ndim == 1:
+                # if mono audio, "fake" stereo by concatenating mono embedding as L and R embeddings
+                emb = np.concatenate([emb, emb],axis=1)
+            elif audio.ndim == 2:
+                emb = np.concatenate(emb,axis=1)
+
+        # or extracting mono embeddings (dim=512)
+        elif channels == 1: 
+            emb, _ = openl3.get_audio_embedding(audio, samplingrate, model=model, verbose=False, hop_size=openl3_hop_size)
+
+        # concatenate embeddings
+        if first:
+            embeddings = emb
+            first = False
+        else:
+            embeddings = np.concatenate([embeddings, emb], axis=0)
+    
+    # return as a list of embeddings: [np.array[], ...]
+    return [e for e in embeddings]
+
+
+def openl3_fd(channels, samplingrate, content_type, openl3_hop_size, eval_path, 
+              eval_files_extension='.wav', ref_path=None, ref_files_extension='.wav', load_ref_embeddings=None, batching=False):
+    """
+    Compute the Fréchet Distance between files in eval_path and ref_path.
+    
+    Fréchet distance computed on top of openl3 embeddings.
+
+    GPU-based computation.
+
+    Extracting the embeddings is timeconsuming. After being computed once, we store them.
+    We store pre-computed reference embedding statistics in load/openl3_fd/ 
+    To load those and save computation, just set the path in load_ref_embeddings.
+    If load_ref_embeddings is set, ref_path is not required.
+
+    Params:
+    -- channels: 1 (mono), or 2 (stereo) to get the Fréchet Distance over mono or stereo embeddings.
+    -- samplingrate: max bandwith at wich we evaluate the given signals. Up to 48kHz.
+    -- content_type: 'music' or 'env' to select a content type for openl3.
+    -- openl3_hop_size: analysis resolution of openl3 in seconds. Openl3's input window is 1 sec.
+    -- eval_path: path where the generated audio files to evaluate are available.
+    -- eval_files_extenstion: files extension (default .wav) in eval_path.
+    -- ref_path: path where the reference audio files are available. (instead of load_ref_embeddings)
+    -- ref_files_extension: files extension (default .wav) in ref_path.
+    -- load_ref_embeddings: path to the reference embedding statistics. (inestead of ref_path)
+    -- batching: set batch size (with an int) or set to False (default False).
+    Returns:
+    -- Fréchet distance.
+    """
+
+    if not os.path.isdir(eval_path):        
+        raise ValueError('eval_path does not exist')
+
+    if load_ref_embeddings:
+        if not os.path.exists(load_ref_embeddings):
+            raise ValueError('load_ref_embeddings does not exist')
+        print('[LOADING REFERENCE EMBEDDINGS] ', load_ref_embeddings)
+        loaded = np.load(load_ref_embeddings)
+        mu_ref = loaded['mu_ref']
+        sigma_ref = loaded['sigma_ref']
+
+    else:
+        if ref_path:
+            if not os.path.isdir(ref_path):
+                if not os.path.isfile(ref_path):
+                    raise ValueError("ref_path does not exist")
+            if os.path.isfile(ref_path):
+                path = ref_path
+            else:
+                path = os.path.join(ref_path, '*'+ref_files_extension)
+            print('[EXTRACTING REFERENCE EMBEDDINGS] ', path)
+            if batching:
+                ref_embeddings = extract_embeddings(path, channels, samplingrate, content_type, openl3_hop_size, batch_size=batching)
+            else:
+                ref_embeddings = extract_embeddings_nobatching(path, channels, samplingrate, content_type, openl3_hop_size)            
+            mu_ref, sigma_ref = calculate_embd_statistics(ref_embeddings)
+
+            # store statistics to load later on
+            if not os.path.exists('load/openl3_fd'):
+                os.makedirs('load/openl3_fd/')
+            save_ref_embeddings_path = (
+                'load/openl3_fd/' +
+                path.replace('/', '_') +
+                '__channels' + str(channels) +
+                '__' + str(samplingrate) +
+                '__openl3' + str(content_type) +
+                '__openl3hopsize' + str(openl3_hop_size) +
+                '__batch' + str(batching) +
+                '.npz'
+            )                
+            np.savez(save_ref_embeddings_path, mu_ref=mu_ref, sigma_ref=sigma_ref)
+            print('[REFERENCE EMBEDDINGS][SAVED] ', save_ref_embeddings_path)
+
+        else:
+            raise ValueError('Must specify ref_path or load_ref_embeddings')
+
+    path = os.path.join(eval_path, '*'+eval_files_extension)
+    print('[EXTRACTING EVALUATION EMBEDDINGS] ', path)
+    if batching:
+        eval_embeddings = extract_embeddings(path, channels, samplingrate, content_type, openl3_hop_size, batch_size=batching)
+    else:
+        eval_embeddings = extract_embeddings_nobatching(path, channels, samplingrate, content_type, openl3_hop_size)    
+    mu_eval, sigma_eval = calculate_embd_statistics(eval_embeddings)
+
+    fd = calculate_frechet_distance(mu_eval, sigma_eval, mu_ref, sigma_ref)
+    if load_ref_embeddings:
+        print('[FRéCHET DISTANCE] ', eval_path, load_ref_embeddings, fd)
+    else:
+        print('[FRéCHET DISTANCE] ', eval_path, ref_path, fd)
+
+    return fd
\ No newline at end of file
--- a/examples/music_generation/inspiremusic/metrics/passt_kld.py
+++ b/examples/music_generation/inspiremusic/metrics/passt_kld.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
+
+import os
+import contextlib
+from functools import partial
+from tqdm import tqdm
+import pickle
+import numpy as np
+import librosa
+from hear21passt.base import get_basic_model
+import pyloudnorm as pyln
+
+import torch
+import torch.nn.functional as F
+
+
+SAMPLING_RATE = 32000
+
+
+class _patch_passt_stft:
+    """    
+    From version 1.8.0, return_complex must always be given explicitly 
+    for real inputs and return_complex=False has been deprecated.
+
+    Decorator to patch torch.stft in PaSST that uses an old stft version.
+
+    Adapted from: https://github.com/facebookresearch/audiocraft/blob/a2b96756956846e194c9255d0cdadc2b47c93f1b/audiocraft/metrics/kld.py
+    """
+    def __init__(self):
+        self.old_stft = torch.stft
+
+    def __enter__(self):
+        # return_complex is a mandatory parameter in latest torch versions.
+        # torch is throwing RuntimeErrors when not set.
+        # see: https://pytorch.org/docs/1.7.1/generated/torch.stft.html?highlight=stft#torch.stft
+        # see: https://github.com/kkoutini/passt_hear21/commit/dce83183674e559162b49924d666c0a916dc967a
+        torch.stft = partial(torch.stft, return_complex=False)
+
+    def __exit__(self, *exc):
+        torch.stft = self.old_stft
+
+
+def return_probabilities(model, audio_path, window_size=10, overlap=5, collect='mean'):
+    """
+    Given an audio and the PaSST model, return the probabilities of each AudioSet class.
+
+    Audio is converted to mono at 32kHz.
+
+    PaSST model is trained with 10 sec inputs. We refer to this parameter as the window_size.
+    We set it to 10 sec for consistency with PaSST training.
+
+    For longer audios, we split audio into overlapping analysis windows of window_size and overlap of 10 and 5 seconds.
+    PaSST supports 10, 20 or 30 sec inputs. Not longer inputs: https://github.com/kkoutini/PaSST/issues/19 
+
+    Note that AudioSet taggers normally use sigmoid output layers. Yet, to compute the
+    KL we work with normalized probabilities by running a softmax over logits as in MusicGen:
+    https://github.com/facebookresearch/audiocraft/blob/a2b96756956846e194c9255d0cdadc2b47c93f1b/audiocraft/metrics/kld.py
+
+    This implementation assumes run will be on GPU.
+
+    Params:
+    -- model: PaSST model on a GPU.
+    -- audio_path: path to the audio to be loaded with librosa.
+    -- window_size (default=10 sec): analysis window (and receptive field) of PaSST.
+    -- overlap (default=5 sec): overlap of the running analysis window for inputs longar than window_size (10 sec).
+    -- collect (default='mean'): for longer inputs, aggregate/collect via 'mean' or 'max' pooling along logits vector.
+    Returns:
+    --  527 probabilities (after softmax, no logarithm).
+    """
+    # load the audio using librosa
+    audio, _ = librosa.load(audio_path, sr=SAMPLING_RATE, mono=True)
+    audio = pyln.normalize.peak(audio, -1.0)
+
+    # calculate the step size for the analysis windows with the specified overlap
+    step_size = int((window_size - overlap) * SAMPLING_RATE)
+
+    # iterate over the audio, creating analysis windows
+    probabilities = []
+    for i in range(0, max(step_size, len(audio) - step_size), step_size):
+        # extract the current analysis window
+        window = audio[i:i + int(window_size * SAMPLING_RATE)]
+
+        # pad the window with zeros if it's shorter than the desired window size
+        if len(window) < int(window_size * SAMPLING_RATE):
+            # discard window if it's too small (avoid mostly zeros predicted as silence), as in MusicGen:
+            # https://github.com/facebookresearch/audiocraft/blob/a2b96756956846e194c9255d0cdadc2b47c93f1b/audiocraft/metrics/kld.py
+            if len(window) > int(window_size * SAMPLING_RATE * 0.15):
+                tmp = np.zeros(int(window_size * SAMPLING_RATE))
+                tmp[:len(window)] = window
+                window = tmp
+
+        # convert to a PyTorch tensor and move to GPU
+        audio_wave = torch.from_numpy(window.astype(np.float32)).unsqueeze(0).cuda()
+
+        # get the probabilities for this analysis window
+        with open(os.devnull, 'w') as f, contextlib.redirect_stdout(f):
+            with torch.no_grad(), _patch_passt_stft():
+                logits = model(audio_wave)
+                probabilities.append(torch.squeeze(logits))
+
+    probabilities = torch.stack(probabilities)
+    if collect == 'mean':
+        probabilities = torch.mean(probabilities, dim=0)
+    elif collect == 'max':
+        probabilities, _ = torch.max(probabilities, dim=0)
+
+    return F.softmax(probabilities, dim=0).squeeze().cpu()
+
+
+def passt_kld(ids, eval_path, eval_files_extension='.wav', ref_path=None, ref_files_extension='.wav', load_ref_probabilities=None, no_ids=[], collect='mean'):
+    """
+    Compute KL-divergence between the label probabilities of the generated audio with respect to the original audio.
+    Both generated audio (in eval_path) and original audio (in ref_path) are represented by the same prompt/description.
+    Audios are identified by an id, that is the name of the file in both directories and links the audio with the prompt/description.
+    segmenting the audio
+
+    For inputs longer that the 10 sec PaSST was trained on, we aggregate/collect via 'mean' (default) or 'max' pooling along the logits vector.
+    We split the inpot into overlapping analysis windows. Subsequently, we aggregate/collect (accross windows) the generated logits and then apply a softmax. 
+
+    This evaluation script assumes that ids are in both ref_path and eval_path.
+
+    We label probabilities via the PaSST model: https://github.com/kkoutini/PaSST
+
+    GPU-based computation.
+    
+    Extracting the probabilities is timeconsuming. After being computed once, we store them.
+    We store pre-computed reference probabilities in load/ 
+    To load those and save computation, just set the path in load_ref_probabilities.
+    If load_ref_probabilities is set, ref_path is not required.
+
+    Params:
+    -- ids: list of ids present in both eval_path and ref_path. 
+    -- eval_path: path where the generated audio files to evaluate are available.
+    -- eval_files_extenstion: files extension (default .wav) in eval_path.
+    -- ref_path: path where the reference audio files are available. (instead of load_ref_probabilities)
+    -- ref_files_extenstion: files extension (default .wav) in ref_path.
+    -- load_ref_probabilities: path to the reference probabilities. (inestead of ref_path)
+    -- no_ids: it is possible that some reference audio is corrupted or not present. Ignore some this list of ids.
+    -- collect (default='mean'): for longer inputs, aggregate/collect via 'mean' or 'max' pooling along the logits vector.
+    Returns:
+    -- KL divergence
+    """
+    with open(os.devnull, 'w') as f, contextlib.redirect_stdout(f): # capturing all useless outputs from passt
+        # load model
+        model = get_basic_model(mode="logits")
+        model.eval()
+        model = model.cuda()
+
+    if not os.path.isdir(eval_path):
+        if not os.path.isfile(eval_path): 
+            raise ValueError('eval_path does not exist')
+
+    if load_ref_probabilities:
+        if not os.path.exists(load_ref_probabilities):
+            raise ValueError('load_ref_probabilities does not exist')     
+        print('[LOADING REFERENCE PROBABILITIES] ', load_ref_probabilities)
+        with open(load_ref_probabilities, 'rb') as fp:
+            ref_p = pickle.load(fp)
+
+    else:
+        if ref_path:
+            if not os.path.isdir(ref_path):
+                if os.path.isfile(ref_path):
+                    id2utt = {}
+                    with open(ref_path, "r") as f:
+                        for line in f:
+                            sec = line.strip().split(" ")
+                            id2utt[sec[0]] = sec[1]
+                    f.close()
+                else:
+                    raise ValueError("ref_path does not exist")        
+            print('[EXTRACTING REFERENCE PROBABILITIES] ', ref_path)
+            ref_p = {}
+            for id in tqdm(ids):
+                if id not in no_ids:
+                    try:
+                        if os.path.isfile(ref_path):
+                            if id in id2utt.keys():
+                                audio_path = id2utt[id]
+                            else:
+                                raise ValueError(f"id: {id} not in {ref_path}!")  
+                        else:
+                            audio_path = os.path.join(ref_path, str(id)+ref_files_extension)
+                        if os.path.isfile(audio_path):
+                            ref_p[id] = return_probabilities(model, audio_path, collect=collect)
+                    except Exception as e:
+                        print(f"An unexpected error occurred with {id}: {e}\nIf you failed to download it you can add it to no_ids list.")
+
+            # store reference probabilities to load later on
+            if not os.path.exists('load/passt_kld/'):
+                os.makedirs('load/passt_kld/')
+            save_ref_probabilities_path = 'load/passt_kld/'+ref_path.replace('/', '_')+'_collect'+str(collect)+'__reference_probabilities.pkl'
+            with open(save_ref_probabilities_path, 'wb') as fp:
+                pickle.dump(ref_p, fp)        
+            print('[REFERENCE EMBEDDINGS][SAVED] ', save_ref_probabilities_path)
+
+        else:
+            raise ValueError('Must specify ref_path or load_ref_probabilities')
+
+    print('[EVALUATING GENERATIONS] ', eval_path)
+            
+    passt_kl = 0
+    count = 0
+    for id in tqdm(ids):
+        if id not in no_ids:
+            try:
+                audio_path = os.path.join(eval_path, str(id)+eval_files_extension)
+                if os.path.isfile(audio_path):
+                    eval_p = return_probabilities(model, audio_path, collect=collect)
+                    # note: F.kl_div(x, y) is KL(y||x)
+                    # see: https://github.com/pytorch/pytorch/issues/7337
+                    # see: https://discuss.pytorch.org/t/kl-divergence-different-results-from-tf/56903/2
+                    passt_kl += F.kl_div((ref_p[id] + 1e-6).log(), eval_p, reduction='sum', log_target=False)
+                    count += 1
+            except Exception as e:
+                print(f"An unexpected error occurred with {id}: {e}\nIf you failed to download it you can add it to no_ids list.")
+    return passt_kl / count if count > 0 else 0
--- a/examples/music_generation/inspiremusic/music_tokenizer/__init__.py
+++ b/examples/music_generation/inspiremusic/music_tokenizer/__init__.py
--- a/examples/music_generation/inspiremusic/music_tokenizer/__pycache__/__init__.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/music_tokenizer/__pycache__/__init__.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/music_tokenizer/__pycache__/env.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/music_tokenizer/__pycache__/env.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/music_tokenizer/__pycache__/models.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/music_tokenizer/__pycache__/models.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/music_tokenizer/__pycache__/vqvae.cpython-310.pyc
+++ b/examples/music_generation/inspiremusic/music_tokenizer/__pycache__/vqvae.cpython-310.pyc
--- a/examples/music_generation/inspiremusic/music_tokenizer/env.py
+++ b/examples/music_generation/inspiremusic/music_tokenizer/env.py
+# Copyright (c) 2024 Alibaba Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import shutil
+
+
+class AttrDict(dict):
+    """Dictionary whose entries can also be read and written as attributes.
+
+    Accepts the same constructor arguments as ``dict``.
+    """
+
+    def __init__(self, *args, **kwargs):
+        # Populate the mapping exactly as dict(*args, **kwargs) would.
+        dict.__init__(self, *args, **kwargs)
+        # Use the dict itself as the instance namespace, so ``obj.key`` and
+        # ``obj["key"]`` always refer to the same storage.
+        self.__dict__ = self
+
+
+def build_env(config, config_name, path):
+    """Copy the config file into the experiment directory.
+
+    Args:
+        config: Path of the source configuration file.
+        config_name: File name the copy gets inside ``path``.
+        path: Destination directory; created if it does not exist.
+
+    The copy is skipped when ``config`` already refers to the destination
+    file ``os.path.join(path, config_name)``.
+    """
+    t_path = os.path.join(path, config_name)
+    # Compare resolved paths rather than raw strings: a different spelling of
+    # the same file (relative vs. absolute, "./" segments, symlinks) would
+    # otherwise reach shutil.copyfile, which raises SameFileError.
+    if os.path.realpath(config) == os.path.realpath(t_path):
+        return
+    os.makedirs(path, exist_ok=True)
+    shutil.copyfile(config, t_path)