init push

ecd710b6 · wangzhengtao · ecd710b6 · ecd710b6 · ecd710b6 · ecd710b6
Commit ecd710b6 authored Apr 25, 2025 by wangzhengtao
20 changed files
--- a/kimia_infer/models/detokenizer/semantic_fm_prefix_streaming.py
+++ b/kimia_infer/models/detokenizer/semantic_fm_prefix_streaming.py
+import yaml
+import logging
+import time
+
+import os
+import torch
+
+from .flow_matching.ode_wrapper import StreamingODEWrapperForPrefix
+from .flow_matching.model import DiTPrefix
+from .flow_matching.scheduler import StreamingFlowMatchingScheduler
+
+
+# Module-level logger, named after this module's import path.
+logger = logging.getLogger(__name__)
+
+
+class StreamingSemanticFMWrapper:
+    def __init__(
+        self,
+        speech_model: DiTPrefix,
+        max_kv_cache_tokens=900,
+        max_prompt_chunk=2,
+        use_cfg=True,
+        use_cfg_rescale=True,
+        cfg_init=1.5,
+        cfg_scale=7.5,
+        cfg_schedule="linear",
+        cfg_token_id=0,
+        normalize_mel=False,
+        mel_mean=None,
+        mel_std=None,
+        device: torch.device = torch.device("cpu"),
+    ) -> None:
+
+        self.dtype = torch.bfloat16
+        self.speech_model = speech_model.to(device).to(self.dtype)
+        self.speech_model = self.speech_model.eval()
+        self.device = device
+        self.normalize_mel = normalize_mel
+        self.mel_mean = mel_mean
+        self.mel_std = mel_std
+
+        self.use_cfg = use_cfg
+        self.use_cfg_rescale = use_cfg_rescale
+        self.cfg_init = cfg_init
+        self.cfg_scale = cfg_scale
+        self.cfg_schedule = cfg_schedule
+
+        self.incremental_state = {}
+        self.condition_cache = {"previous_seqlen": 0}
+
+        logger.info(
+            f">>> SemanticFMWrapper initialized with use_cfg={use_cfg}, use_cfg_rescale={use_cfg_rescale}, cfg_init={cfg_init}, cfg_scale={cfg_scale}, cfg_schedule={cfg_schedule}"
+        )
+
+        self.scheduler = StreamingFlowMatchingScheduler()
+        self.ode_wrapper = StreamingODEWrapperForPrefix(
+            net=self.speech_model,
+            x_mask=None,
+            x_cond=None,
+            use_cfg=use_cfg,
+            use_cfg_rescale=use_cfg_rescale,
+            cfg_init=cfg_init,
+            cfg_scale=cfg_scale,
+            cfg_schedule=cfg_schedule,
+            cfg_token_id=cfg_token_id,
+        )
+
+        self.max_kv_cache_tokens = max_kv_cache_tokens
+        self.max_prompt_chunk = max_prompt_chunk
+        self.reserve_kv_cache_tokens = 0
+
+    @torch.inference_mode()
+    def infer_chunk(
+        self,
+        xt_chunk,
+        semantic_tokens_chunk,
+        start_position_id,
+        cache=None,
+        look_ahead_tokens=0,
+        ode_steps=15,
+        verbose=False,
+        ode_solver="neural_ode_euler",
+    ):
+        """
+        semantic_tokens: [T_1], torch.LongTensor
+        xt: [T_2, 80], torch.Tensor, DO NOT normalize it outside
+        ode_steps: int, number of ode steps, default 15
+        verbose: bool, default False
+        ode_solver: str, ode solver, expected in ("neural_ode_euler", "naive_euler"), default "neural_ode_euler"
+        """
+        bs = 1
+
+        self.scheduler.set_timesteps(ode_steps)
+
+        semantic_tokens_chunk = semantic_tokens_chunk.unsqueeze(0).to(self.device)
+        xt_chunk = xt_chunk.unsqueeze(0).to(self.device).to(self.dtype)
+
+        t_span = torch.linspace(0, 1, self.scheduler.timesteps)
+
+        x_mask = torch.zeros(bs, xt_chunk.shape[1], device=self.device).bool()
+
+        cache_ret = self.ode_wrapper.set_conditions(
+            x_mask=x_mask,
+            x_cond=semantic_tokens_chunk,
+            start_position_id=start_position_id,
+            cache=self.condition_cache,
+        )
+
+        if verbose:
+            t_start = time.time()
+        if ode_solver == "neural_ode_euler":
+            x_t = self.scheduler.sample_by_neuralode(
+                self.ode_wrapper, time_steps=t_span, xt=xt_chunk, verbose=False
+            )
+        elif ode_solver == "naive_euler":
+            x_t = self.scheduler.sample(
+                ode_wrapper=self.ode_wrapper,
+                time_steps=t_span,
+                xt=xt_chunk,
+                verbose=False,
+            )
+        else:
+            raise NotImplementedError(
+                "ode_solver should be in ('neural_ode_euler', 'naive_euler')"
+            )
+
+        if look_ahead_tokens > 0:
+            semantic_tokens_left = semantic_tokens_chunk.view(-1)[-look_ahead_tokens:]
+            cache["semantic_token"] = semantic_tokens_left
+            x_t_ret = x_t[:, :-look_ahead_tokens, :]
+        else:
+            x_t_ret = x_t
+
+        if look_ahead_tokens > 0:
+            x_mask = torch.zeros(
+                bs, xt_chunk.shape[1] - look_ahead_tokens, device=self.device
+            ).bool()
+            self.condition_cache = self.ode_wrapper.set_conditions(
+                x_mask=x_mask,
+                x_cond=semantic_tokens_chunk[:, :-look_ahead_tokens],
+                start_position_id=start_position_id,
+                cache=self.condition_cache,
+            )
+            self.ode_wrapper(torch.Tensor([0.999]).to(x_t_ret.device), x_t_ret)
+        else:
+            self.condition_cache = cache_ret
+
+        if verbose:
+            t_end = time.time()
+            logger.info(f"[ODE Chunk] Time cost: {t_end - t_start}")
+
+        if self.normalize_mel:
+            x_t_ret = x_t_ret * self.mel_std + self.mel_mean
+        return x_t_ret.squeeze(0)
+
+    @torch.inference_mode()
+    def infer_mel(
+        self,
+        semantic_tokens,
+        ode_steps=15,
+        chunk_size=150,
+        verbose=False,
+        ode_solver="neural_ode_euler",
+    ):
+        """
+        semantic_tokens: [T_1], torch.LongTensor
+        prompt: [T_2, 80], torch.Tensor, DO NOT normalize it outside
+        prompt_semantic_tokens, [T_2], torch.LongTensor
+        ode_steps: int, number of ode steps, default 15
+        verbose: bool, default False
+        ode_solver: str, ode solver, expected in ("neural_ode_euler", "naive_euler"), default "neural_ode_euler"
+        """
+        assert semantic_tokens.dim() == 1
+
+        x_t = torch.randn(semantic_tokens.shape[0], 80).to(self.device).to(self.dtype)
+
+        seq_len = semantic_tokens.shape[0]
+
+        num_chunks = seq_len // chunk_size
+        if seq_len % chunk_size != 0:
+            num_chunks += 1
+
+        x_pred_collect = []
+
+        if verbose:
+            t_start = time.time()
+
+        for chunk_id in range(num_chunks):
+            start = chunk_id * chunk_size
+            end = min(start + chunk_size, seq_len)
+            semantic_tokens_chunk = semantic_tokens[start:end]
+            x_t_chunk = x_t[start:end, :]
+
+            x_pred = self.infer_chunk(
+                xt_chunk=x_t_chunk,
+                semantic_tokens_chunk=semantic_tokens_chunk,
+                start_position_id=self.start_position_id,
+                ode_steps=ode_steps,
+                verbose=verbose,
+                ode_solver=ode_solver,
+            )
+            self.start_position_id += end - start
+            self.update_incremental_state()
+
+            x_pred_collect.append(x_pred)
+
+        if verbose:
+            t_end = time.time()
+            logger.info(f"[ODE] Time cost: {t_end - t_start}")
+
+        x_pred = torch.cat(x_pred_collect, dim=0)
+
+        return x_pred
+
+    def clear_all_states(self):
+        self.start_position_id = 0
+        self.condition_cache = {"previous_seqlen": 0}
+        self.ode_wrapper.clear_all_states()
+
+    def state_dict(self):
+        return {
+            "start_position_id": self.start_position_id,
+            "ode_wrapper": self.ode_wrapper.state_dict(),
+            "condition_cache": self.condition_cache,
+        }
+
+    def load_state_dict(self, state_dict):
+        if state_dict is not None:
+            self.start_position_id = state_dict["start_position_id"]
+            self.ode_wrapper.load_state_dict(state_dict["ode_wrapper"])
+            self.condition_cache = state_dict["condition_cache"]
+
+    def update_incremental_state(self):
+        self.ode_wrapper.update_incremental_state(
+            reserve_kv_cache_tokens=0,
+            max_kv_cache_tokens=self.max_kv_cache_tokens,
+            condition_cache=self.condition_cache,
+        )
+
+    @torch.inference_mode()
+    def prefill(self, mel, semantic_token, chunk_size=150, verbose=False):
+        """
+        mel: [T, 80], torch.Tensor
+        semantic_token: [T], torch.LongTensor
+        chunk_size: int, default 150
+        """
+        assert mel.dim() == 2
+        assert semantic_token.dim() == 1
+        assert (
+            semantic_token.shape[0] == mel.shape[0]
+        ), "Semantic token and mel shape mismatch"
+        seq_len = mel.shape[0]
+        num_chunks = min(seq_len // chunk_size, self.max_prompt_chunk)
+        start_pos = seq_len - num_chunks * chunk_size
+
+        res_mel = mel[:start_pos, :]
+        res_semantic_token = semantic_token[:start_pos]
+        self.prefill_chunk(
+            res_mel, res_semantic_token, start_position_id=self.start_position_id
+        )
+        self.start_position_id += start_pos
+        self.update_incremental_state()
+        self.reserve_kv_cache_tokens += self.ode_wrapper.kv_cache_tokens
+
+        if verbose:
+            logger.info("Prefilling prompt with {} chunks".format(num_chunks))
+            start_time = time.time()
+
+        for chunk_id in range(num_chunks):
+            start = start_pos + chunk_id * chunk_size
+            end = start + chunk_size
+            mel_chunk = mel[start:end, :]
+            semantic_token_chunk = semantic_token[start:end]
+
+            self.prefill_chunk(
+                mel_chunk,
+                semantic_token_chunk,
+                start_position_id=self.start_position_id,
+            )
+            self.start_position_id += end - start
+
+            self.update_incremental_state()
+            self.reserve_kv_cache_tokens += self.ode_wrapper.kv_cache_tokens
+
+        if verbose:
+            logger.info(
+                "Prefilling done in {:.2f} seconds".format(time.time() - start_time)
+            )
+
+    def prefill_chunk(self, mel_chunk, semantic_tokens_chunk, start_position_id=0):
+        """
+        mel_chunk: [T, 80], torch.Tensor, T is the chunk size
+        semantic_tokens_chunk: [T], torch.LongTensor
+        start_position_id: int, default 0
+        """
+        bs = 1
+
+        semantic_tokens_chunk = semantic_tokens_chunk.unsqueeze(0).to(self.device)
+        mel_chunk = mel_chunk.unsqueeze(0).to(self.device).to(self.dtype)
+
+        if self.normalize_mel:
+            mel_chunk = (mel_chunk - self.mel_mean) / self.mel_std
+
+        x_mask = torch.zeros(bs, mel_chunk.shape[1], device=self.device).bool()
+
+        self.condition_cache = self.ode_wrapper.set_conditions(
+            x_mask=x_mask,
+            x_cond=semantic_tokens_chunk,
+            start_position_id=start_position_id,
+            cache=self.condition_cache,
+        )
+
+        x_t = torch.Tensor([0.999]).to(self.device)
+
+        self.ode_wrapper(x_t, mel_chunk)
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_config,
+        ckpt_path,
+        device,
+        max_prompt_chunk=2,
+        max_kv_cache_tokens=900,
+        use_cfg=True,
+        use_cfg_rescale=True,
+        cfg_init=1.5,
+        cfg_scale=7.5,
+        cfg_schedule="linear",
+    ):
+
+        # open yaml file
+        with open(model_config, "r") as f:
+            config = yaml.safe_load(f)
+        model_config = config["model"]["dit"]
+        dit = DiTPrefix(
+            input_size=model_config["input_size"],
+            semantic_vocab_size=model_config["semantic_vocab_size"] + 1,
+            hidden_size=model_config["hidden_size"],
+            depth=model_config["depth"],
+            num_heads=model_config["num_heads"],
+            mlp_ratio=model_config["mlp_ratio"],
+            ffn_type=model_config.get("ffn_type", "conv1d_conv1d"),
+            ffn_gated_glu=model_config.get("ffn_gated_glu", True),
+            ffn_act_layer=model_config.get("ffn_act_layer", "gelu"),
+            ffn_conv_kernel_size=model_config.get("ffn_conv_kernel_size", 5),
+            use_rope=model_config.get("use_rope", False),
+            rope_params=model_config.get(
+                "rope_params",
+                {
+                    "max_position_embeddings": 4096,
+                    "rope_base": 10000,
+                    "rope_interpolation_factor": 1,
+                },
+            ),
+            position_embedding_type=model_config["position_embedding_type"],
+            max_seq_len=model_config["max_seq_len"],
+            output_size=model_config["input_size"],
+            prompt_cfg_dropout=0,
+        )
+        cfg_semantic_token_id = model_config["semantic_vocab_size"]
+
+        # load state_dict
+        state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)[
+            "state_dict"
+        ]
+        speech_model_params = {
+            k.replace("speech_model.", ""): v
+            for k, v in state_dict.items()
+            if "speech_model" in k
+        }
+        dit.load_state_dict(speech_model_params, strict=True)
+        logger.info(f">>> Loaded checkpoint from {ckpt_path}")
+
+        return cls(
+            speech_model=dit,
+            device=device,
+            normalize_mel=config["normalize_mel"],
+            mel_mean=config["mel_mean"],
+            mel_std=config["mel_std"],
+            max_prompt_chunk=max_prompt_chunk,
+            max_kv_cache_tokens=max_kv_cache_tokens,
+            use_cfg=use_cfg,
+            use_cfg_rescale=use_cfg_rescale,
+            cfg_init=cfg_init,
+            cfg_scale=cfg_scale,
+            cfg_schedule=cfg_schedule,
+            cfg_token_id=cfg_semantic_token_id,
+        )
--- a/kimia_infer/models/detokenizer/vocoder/activations.py
+++ b/kimia_infer/models/detokenizer/vocoder/activations.py
+import torch
+from torch import nn, sin, pow
+from torch.nn import Parameter
+
+
+class Snake(nn.Module):
+    """
+    Implementation of a sine-based periodic activation function
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    Parameters:
+        - alpha - trainable parameter
+    References:
+        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+        https://arxiv.org/abs/2006.08195
+    Examples:
+        >>> a1 = snake(256)
+        >>> x = torch.randn(256)
+        >>> x = a1(x)
+    """
+
+    def __init__(
+        self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
+    ):
+        """
+        Initialization.
+        INPUT:
+            - in_features: shape of the input
+            - alpha: trainable parameter
+            alpha is initialized to 1 by default, higher values = higher-frequency.
+            alpha will be trained along with the rest of your model.
+        """
+        super(Snake, self).__init__()
+        self.in_features = in_features
+
+        # Initialize alpha
+        self.alpha_logscale = alpha_logscale
+        if self.alpha_logscale:  # Log scale alphas initialized to zeros
+            self.alpha = Parameter(torch.zeros(in_features) * alpha)
+        else:  # Linear scale alphas initialized to ones
+            self.alpha = Parameter(torch.ones(in_features) * alpha)
+
+        self.alpha.requires_grad = alpha_trainable
+
+        self.no_div_by_zero = 0.000000001
+
+    def forward(self, x):
+        """
+        Forward pass of the function.
+        Applies the function to the input elementwise.
+        Snake ∶= x + 1/a * sin^2 (xa)
+        """
+        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # Line up with x to [B, C, T]
+        if self.alpha_logscale:
+            alpha = torch.exp(alpha)
+        x = x + (1.0 / (alpha + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
+
+        return x
+
+
+class SnakeBeta(nn.Module):
+    """
+    A modified Snake function which uses separate parameters for the magnitude of the periodic components
+    Shape:
+        - Input: (B, C, T)
+        - Output: (B, C, T), same shape as the input
+    Parameters:
+        - alpha - trainable parameter that controls frequency
+        - beta - trainable parameter that controls magnitude
+    References:
+        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
+        https://arxiv.org/abs/2006.08195
+    Examples:
+        >>> a1 = snakebeta(256)
+        >>> x = torch.randn(256)
+        >>> x = a1(x)
+    """
+
+    def __init__(
+        self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
+    ):
+        """
+        Initialization.
+        INPUT:
+            - in_features: shape of the input
+            - alpha - trainable parameter that controls frequency
+            - beta - trainable parameter that controls magnitude
+            alpha is initialized to 1 by default, higher values = higher-frequency.
+            beta is initialized to 1 by default, higher values = higher-magnitude.
+            alpha will be trained along with the rest of your model.
+        """
+        super(SnakeBeta, self).__init__()
+        self.in_features = in_features
+
+        # Initialize alpha
+        self.alpha_logscale = alpha_logscale
+        if self.alpha_logscale:  # Log scale alphas initialized to zeros
+            self.alpha = Parameter(torch.zeros(in_features) * alpha)
+            self.beta = Parameter(torch.zeros(in_features) * alpha)
+        else:  # Linear scale alphas initialized to ones
+            self.alpha = Parameter(torch.ones(in_features) * alpha)
+            self.beta = Parameter(torch.ones(in_features) * alpha)
+
+        self.alpha.requires_grad = alpha_trainable
+        self.beta.requires_grad = alpha_trainable
+
+        self.no_div_by_zero = 0.000000001
+
+    def forward(self, x):
+        """
+        Forward pass of the function.
+        Applies the function to the input elementwise.
+        SnakeBeta ∶= x + 1/b * sin^2 (xa)
+        """
+        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)  # Line up with x to [B, C, T]
+        beta = self.beta.unsqueeze(0).unsqueeze(-1)
+        if self.alpha_logscale:
+            alpha = torch.exp(alpha)
+            beta = torch.exp(beta)
+        x = x + (1.0 / (beta + self.no_div_by_zero)) * pow(sin(x * alpha), 2)
+
+        return x
--- a/kimia_infer/models/detokenizer/vocoder/alias_free_activation/__init__.py
+++ b/kimia_infer/models/detokenizer/vocoder/alias_free_activation/__init__.py
--- a/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/__init__.py
+++ b/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/__init__.py
--- a/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/activation1d.py
+++ b/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/activation1d.py
+# Copyright (c) 2024 NVIDIA CORPORATION.
+#   Licensed under the MIT license.
+
+import torch
+import torch.nn as nn
+from ..torch.resample import UpSample1d, DownSample1d
+
+# load fused CUDA kernel: this enables importing anti_alias_activation_cuda
+from . import load
+
+# Executed at import time: load.load() returns the extension object whose
+# .forward is called by FusedAntiAliasActivation.forward below.
+anti_alias_activation_cuda = load.load()
+
+
+class FusedAntiAliasActivation(torch.autograd.Function):
+    """
+    Assumes filter size 12, replication padding on upsampling/downsampling, and logscale alpha/beta parameters as inputs.
+    The hyperparameters are hard-coded in the kernel to maximize speed.
+    NOTE: The fused kenrel is incorrect for Activation1d with different hyperparameters.
+    """
+
+    @staticmethod
+    def forward(ctx, inputs, up_ftr, down_ftr, alpha, beta):
+        activation_results = anti_alias_activation_cuda.forward(
+            inputs, up_ftr, down_ftr, alpha, beta
+        )
+
+        return activation_results
+
+    @staticmethod
+    def backward(ctx, output_grads):
+        raise NotImplementedError
+        return output_grads, None, None
+
+
+class Activation1d(nn.Module):
+    def __init__(
+        self,
+        activation,
+        up_ratio: int = 2,
+        down_ratio: int = 2,
+        up_kernel_size: int = 12,
+        down_kernel_size: int = 12,
+        fused: bool = True,
+    ):
+        super().__init__()
+        self.up_ratio = up_ratio
+        self.down_ratio = down_ratio
+        self.act = activation
+        self.upsample = UpSample1d(up_ratio, up_kernel_size)
+        self.downsample = DownSample1d(down_ratio, down_kernel_size)
+
+        self.fused = fused  # Whether to use fused CUDA kernel or not
+
+    def forward(self, x):
+        if not self.fused:
+            x = self.upsample(x)
+            x = self.act(x)
+            x = self.downsample(x)
+            return x
+        else:
+            if self.act.__class__.__name__ == "Snake":
+                beta = self.act.alpha.data  # Snake uses same params for alpha and beta
+            else:
+                beta = (
+                    self.act.beta.data
+                )  # Snakebeta uses different params for alpha and beta
+            alpha = self.act.alpha.data
+            if (
+                not self.act.alpha_logscale
+            ):  # Exp baked into cuda kernel, cancel it out with a log
+                alpha = torch.log(alpha)
+                beta = torch.log(beta)
+
+            x = FusedAntiAliasActivation.apply(
+                x, self.upsample.filter, self.downsample.lowpass.filter, alpha, beta
+            )
+            return x
--- a/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/anti_alias_activation.cpp
+++ b/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/anti_alias_activation.cpp
+/* coding=utf-8
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ #include <torch/extension.h>
+
+// Forward declaration of the CUDA entry point; the definition is in
+// anti_alias_activation_cuda.cu.
+extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta);
+
+// Python binding: exposes fwd_cuda under the name "forward" on the extension module.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+    m.def("forward", &fwd_cuda, "Anti-Alias Activation forward (CUDA)");
+}
\ No newline at end of file
--- a/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/anti_alias_activation_cuda.cu
+++ b/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/anti_alias_activation_cuda.cu
+/* coding=utf-8
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cuda_profiler_api.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/extension.h>
+#include "type_shim.h"
+#include <assert.h>
+#include <cfloat>
+#include <limits>
+#include <stdint.h>
+#include <c10/macros/Macros.h>
+
+namespace
+{
+    // Hard-coded hyperparameters
+    // NOTE(review): the original comment here was cut off mid-sentence ("WARP_SIZE and
+    // WARP_BATCH must match the return values batches_per_warp and"); neither name is
+    // referenced anywhere in this file, so it looks like a leftover - confirm and remove.
+    constexpr int ELEMENTS_PER_LDG_STG = 1; //(WARP_ITERATIONS < 4) ? 1 : 4;  (stride of the final write loop)
+    constexpr int BUFFER_SIZE = 32;         // samples handled per thread
+    constexpr int FILTER_SIZE = 12;         // taps of both the up- and down-sampling filter
+    constexpr int HALF_FILTER_SIZE = 6;
+    constexpr int UPSAMPLE_REPLICATION_PAD = 5; // 5 on each side, matching torch impl
+    constexpr int DOWNSAMPLE_REPLICATION_PAD_LEFT = 5; // matching torch impl
+    constexpr int DOWNSAMPLE_REPLICATION_PAD_RIGHT = 6; // matching torch impl
+
+    // Fused forward kernel for the anti-aliased snake activation.
+    //
+    // Per thread, for BUFFER_SIZE consecutive samples of one (batch, channel) row:
+    //   1. load the samples plus neighbours into even slots of `elements`, scaled by 2,
+    //      with replication padding at the sequence ends;
+    //   2. convolve with up_ftr into `intermediates` (twice the sample rate);
+    //   3. apply x + 1/beta * sin^2(alpha * x), where alpha/beta are exponentiated first;
+    //   4. replication-pad `intermediates` and convolve with down_ftr at stride 2;
+    //   5. write the in-range results to dst.
+    //
+    // dst/src are indexed as contiguous [batch, channel, seq_len] rows; alpha and beta
+    // are indexed by channel (blockIdx.y). `batch_size`, `channels` and the `acc_t`
+    // template parameter are not read in the body (indexing uses blockIdx/gridDim and
+    // accumulation uses input_t).
+    template <typename input_t, typename output_t, typename acc_t>
+    __global__ void anti_alias_activation_forward(
+        output_t *dst,
+        const input_t *src,
+        const input_t *up_ftr,
+        const input_t *down_ftr,
+        const input_t *alpha,
+        const input_t *beta,
+        int batch_size,
+        int channels,
+        int seq_len)
+    {
+        // Up and downsample filters
+        input_t up_filter[FILTER_SIZE];
+        input_t down_filter[FILTER_SIZE];
+
+        // Load data from global memory including extra indices reserved for replication paddings
+        input_t elements[2 * FILTER_SIZE + 2 * BUFFER_SIZE + 2 * UPSAMPLE_REPLICATION_PAD] = {0};
+        input_t intermediates[2 * FILTER_SIZE + 2 * BUFFER_SIZE + DOWNSAMPLE_REPLICATION_PAD_LEFT + DOWNSAMPLE_REPLICATION_PAD_RIGHT] = {0};
+
+        // Output stores downsampled output before writing to dst
+        output_t output[BUFFER_SIZE];
+
+        // blockDim/threadIdx = (128, 1, 1)
+        // gridDim/blockIdx = (seq_blocks, channels, batches)
+        // block_offset: start of this block's samples within the flattened tensor
+        int block_offset = (blockIdx.x * 128 * BUFFER_SIZE + seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
+        // local_offset: start of this thread's samples within the block
+        int local_offset = threadIdx.x * BUFFER_SIZE;
+        // seq_offset: position of this thread's first sample within its row
+        int seq_offset = blockIdx.x * 128 * BUFFER_SIZE + local_offset;
+
+        // intermediate have double the seq_len
+        int intermediate_local_offset = threadIdx.x * BUFFER_SIZE * 2;
+        int intermediate_seq_offset = blockIdx.x * 128 * BUFFER_SIZE * 2 + intermediate_local_offset;
+
+        // Get values needed for replication padding before moving pointer
+        // (right_most_pntr points at the first element of this (batch, channel) row)
+        const input_t *right_most_pntr = src + (seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
+        input_t seq_left_most_value = right_most_pntr[0];
+        input_t seq_right_most_value = right_most_pntr[seq_len - 1];
+
+        // Move src and dst pointers
+        src += block_offset + local_offset;
+        dst += block_offset + local_offset;
+
+        // Alpha and beta values for snake activations. Applies exp by default
+        alpha = alpha + blockIdx.y;
+        input_t alpha_val = expf(alpha[0]);
+        beta = beta + blockIdx.y;
+        input_t beta_val = expf(beta[0]);
+
+        // Copy both filters into per-thread arrays
+        #pragma unroll
+        for (int it = 0; it < FILTER_SIZE; it += 1)
+        {
+            up_filter[it] = up_ftr[it];
+            down_filter[it] = down_ftr[it];
+        }
+
+        // Apply replication padding for upsampling, matching torch impl
+        // Only even slots of `elements` are written; odd slots keep their zero init.
+        // NOTE(review): the factor 2 presumably compensates for the 2x zero-stuffed
+        // upsampling gain - confirm against the torch UpSample1d implementation.
+        #pragma unroll
+        for (int it = -HALF_FILTER_SIZE; it < BUFFER_SIZE + HALF_FILTER_SIZE; it += 1)
+        {
+            int element_index = seq_offset + it; // index for element
+            if ((element_index < 0) && (element_index >= -UPSAMPLE_REPLICATION_PAD))
+            {
+                elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_left_most_value;
+            }
+            if ((element_index >= seq_len) && (element_index < seq_len + UPSAMPLE_REPLICATION_PAD))
+            {
+                elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_right_most_value;
+            }
+            if ((element_index >= 0) && (element_index < seq_len))
+            {
+                elements[2 * (HALF_FILTER_SIZE + it)] = 2 * src[it];
+            }
+        }
+
+        // Apply upsampling strided convolution and write to intermediates. It reserves DOWNSAMPLE_REPLICATION_PAD_LEFT for replication padding of the downsampling conv later
+        #pragma unroll
+        for (int it = 0; it < (2 * BUFFER_SIZE + 2 * FILTER_SIZE); it += 1)
+        {
+            input_t acc = 0.0;
+            int element_index = intermediate_seq_offset + it; // index for intermediate
+            #pragma unroll
+            for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
+            {
+                if ((element_index + f_idx) >= 0)
+                {
+                    acc += up_filter[f_idx] * elements[it + f_idx];
+                }
+            }
+            intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] = acc;
+        }
+
+        // Apply activation function. It reserves DOWNSAMPLE_REPLICATION_PAD_LEFT and DOWNSAMPLE_REPLICATION_PAD_RIGHT for replication padding of the downsampling conv later
+        // Same epsilon as no_div_by_zero in the Python Snake / SnakeBeta modules.
+        double no_div_by_zero = 0.000000001;
+        #pragma unroll
+        for (int it = 0; it < 2 * BUFFER_SIZE + 2 * FILTER_SIZE; it += 1)
+        {
+            intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] += (1.0 / (beta_val + no_div_by_zero)) * sinf(intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] * alpha_val) * sinf(intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] * alpha_val);
+        }
+
+        // Apply replication padding before downsampling conv from intermediates
+        // Left pad: repeat the first computed intermediate value.
+        #pragma unroll
+        for (int it = 0; it < DOWNSAMPLE_REPLICATION_PAD_LEFT; it += 1)
+        {
+            intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT];
+        }
+        // Right pad: repeat the last computed intermediate value.
+        #pragma unroll
+        for (int it = DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE; it < DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE + DOWNSAMPLE_REPLICATION_PAD_RIGHT; it += 1)
+        {
+            intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE - 1];
+        }
+
+        // Apply downsample strided convolution (assuming stride=2) from intermediates
+        #pragma unroll
+        for (int it = 0; it < BUFFER_SIZE; it += 1)
+        {
+            input_t acc = 0.0;
+            #pragma unroll
+            for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
+            {
+                // Add constant DOWNSAMPLE_REPLICATION_PAD_RIGHT to match torch implementation
+                acc += down_filter[f_idx] * intermediates[it * 2 + f_idx + DOWNSAMPLE_REPLICATION_PAD_RIGHT];
+            }
+            output[it] = acc;
+        }
+
+        // Write output to dst
+        // Positions at or beyond seq_len are skipped, so the last thread/block of a row
+        // does not write past the end of the row.
+        #pragma unroll
+        for (int it = 0;  it < BUFFER_SIZE;  it += ELEMENTS_PER_LDG_STG)
+        {
+            int element_index = seq_offset + it;
+            if (element_index < seq_len)
+            {
+                dst[it] = output[it];
+            }
+        }
+
+    }
+
+    // Host-side launcher: picks the grid/block shape and launches
+    // anti_alias_activation_forward on the current CUDA stream.
+    // Does nothing when seq_len is 0.
+    template <typename input_t, typename output_t, typename acc_t>
+    void dispatch_anti_alias_activation_forward(
+        output_t *dst,
+        const input_t *src,
+        const input_t *up_ftr,
+        const input_t *down_ftr,
+        const input_t *alpha,
+        const input_t *beta,
+        int batch_size,
+        int channels,
+        int seq_len)
+    {
+        // Nothing to launch for an empty sequence.
+        if (seq_len == 0)
+        {
+            return;
+        }
+
+        // 128 threads per block, each covering BUFFER_SIZE (32) samples,
+        // i.e. 4096 samples of one row per block.
+        constexpr int kThreadsPerBlock = 128;
+        constexpr int kSeqLenPerBlock = 4096;
+
+        // Ceiling division: number of blocks needed along the sequence axis.
+        const int seq_blocks = (seq_len + kSeqLenPerBlock - 1) / kSeqLenPerBlock;
+
+        // grid = (seq_blocks, channels, batches), block = (128, 1, 1)
+        const dim3 grid(seq_blocks, channels, batch_size);
+        const dim3 block(kThreadsPerBlock, 1, 1);
+
+        anti_alias_activation_forward<input_t, output_t, acc_t>
+            <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, up_ftr, down_ftr, alpha, beta, batch_size, channels, seq_len);
+    }
+}
+
+// Extension entry point: allocates the output tensor and dispatches the fused
+// forward kernel on the input's scalar type.
+// NOTE(review): every pointer is reinterpreted as the INPUT's scalar type and indexed
+// as contiguous memory; nothing here checks dtype, device or contiguity of the
+// arguments - callers presumably guarantee this, confirm.
+extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta)
+{
+    // Input is a 3d tensor with dimensions [batches, channels, seq_len]
+    const int batches = input.size(0);
+    const int channels = input.size(1);
+    const int seq_len = input.size(2);
+
+    // Output: same options as the input, without requires_grad
+    torch::Tensor result =
+        torch::empty({batches, channels, seq_len}, input.options().requires_grad(false));
+
+    // Raw, untyped data pointers; they are typed inside the dispatch below.
+    void *in_data = input.data_ptr();
+    void *up_data = up_filter.data_ptr();
+    void *down_data = down_filter.data_ptr();
+    void *alpha_data = alpha.data_ptr();
+    void *beta_data = beta.data_ptr();
+    void *out_data = result.data_ptr();
+
+    DISPATCH_FLOAT_HALF_AND_BFLOAT(
+        input.scalar_type(),
+        "dispatch anti alias activation_forward",
+        dispatch_anti_alias_activation_forward<scalar_t, scalar_t, float>(
+            reinterpret_cast<scalar_t *>(out_data),
+            reinterpret_cast<const scalar_t *>(in_data),
+            reinterpret_cast<const scalar_t *>(up_data),
+            reinterpret_cast<const scalar_t *>(down_data),
+            reinterpret_cast<const scalar_t *>(alpha_data),
+            reinterpret_cast<const scalar_t *>(beta_data),
+            batches,
+            channels,
+            seq_len););
+    return result;
+}
\ No newline at end of file
--- a/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/compat.h
+++ b/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/compat.h
+/* coding=utf-8
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*This code is copied from NVIDIA apex:
+ *     https://github.com/NVIDIA/apex
+ *     with minor changes. */
+
+/* Fall back to AT_CHECK when TORCH_CHECK has not been defined by the included
+ * torch headers. */
+#ifndef TORCH_CHECK
+#define TORCH_CHECK AT_CHECK
+#endif
+
+/* DATA_PTR expands to the tensor accessor name: `data_ptr` when
+ * VERSION_GE_1_3 is defined, `data` otherwise. */
+#ifdef VERSION_GE_1_3
+#define DATA_PTR data_ptr
+#else
+#define DATA_PTR data
+#endif
--- a/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/load.py
+++ b/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/load.py
+# Copyright (c) 2024 NVIDIA CORPORATION.
+#   Licensed under the MIT license.
+
+import os
+import pathlib
+import subprocess
+
+from torch.utils import cpp_extension
+
+"""
+Setting this param to a list has a problem of generating different compilation commands (with different order of architectures) and leading to recompilation of fused kernels.
+Set it to an empty string to avoid recompilation and assign arch flags explicitly in extra_cuda_cflags below
+"""
+os.environ["TORCH_CUDA_ARCH_LIST"] = ""
+
+
+def load():
+    # Check if cuda 11 is installed for compute capability 8.0
+    cc_flag = []
+    _, bare_metal_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
+    if int(bare_metal_major) >= 11:
+        cc_flag.append("-gencode")
+        cc_flag.append("arch=compute_80,code=sm_80")
+
+    # Build path
+    srcpath = pathlib.Path(__file__).parent.absolute()
+    buildpath = srcpath / "build"
+    _create_build_dir(buildpath)
+
+    # Helper function to build the kernels.
+    def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
+        return cpp_extension.load(
+            name=name,
+            sources=sources,
+            build_directory=buildpath,
+            extra_cflags=[
+                "-O3",
+            ],
+            extra_cuda_cflags=[
+                "-O3",
+                "-gencode",
+                "arch=compute_70,code=sm_70",
+                "--use_fast_math",
+            ]
+            + extra_cuda_flags
+            + cc_flag,
+            verbose=True,
+        )
+
+    extra_cuda_flags = [
+        "-U__CUDA_NO_HALF_OPERATORS__",
+        "-U__CUDA_NO_HALF_CONVERSIONS__",
+        "--expt-relaxed-constexpr",
+        "--expt-extended-lambda",
+    ]
+
+    sources = [
+        srcpath / "anti_alias_activation.cpp",
+        srcpath / "anti_alias_activation_cuda.cu",
+    ]
+    anti_alias_activation_cuda = _cpp_extention_load_helper(
+        "anti_alias_activation_cuda", sources, extra_cuda_flags
+    )
+
+    return anti_alias_activation_cuda
+
+
+def _get_cuda_bare_metal_version(cuda_dir):
+    raw_output = subprocess.check_output(
+        [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
+    )
+    output = raw_output.split()
+    release_idx = output.index("release") + 1
+    release = output[release_idx].split(".")
+    bare_metal_major = release[0]
+    bare_metal_minor = release[1][0]
+
+    return raw_output, bare_metal_major, bare_metal_minor
+
+
+def _create_build_dir(buildpath):
+    try:
+        os.mkdir(buildpath)
+    except OSError:
+        if not os.path.isdir(buildpath):
+            print(f"Creation of the build directory {buildpath} failed")
--- a/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/type_shim.h
+++ b/kimia_infer/models/detokenizer/vocoder/alias_free_activation/cuda/type_shim.h
+/* coding=utf-8
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include "compat.h"
+
+/* Dispatch helper: switches on an at::ScalarType value and expands the
+ * trailing argument(s) once per supported case with `scalar_t` aliased to
+ * float, at::Half or at::BFloat16 respectively. Any other type goes to
+ * AT_ERROR.
+ * Note: NAME is stringified with #NAME, so passing a string literal makes its
+ * quotes part of the error message. */
+#define DISPATCH_FLOAT_HALF_AND_BFLOAT(TYPE, NAME, ...)                 \
+	switch (TYPE)                                                       \
+	{                                                                   \
+	case at::ScalarType::Float:                                         \
+	{                                                                   \
+		using scalar_t = float;                                         \
+		__VA_ARGS__;                                                    \
+		break;                                                          \
+	}                                                                   \
+	case at::ScalarType::Half:                                          \
+	{                                                                   \
+		using scalar_t = at::Half;                                      \
+		__VA_ARGS__;                                                    \
+		break;                                                          \
+	}                                                                   \
+	case at::ScalarType::BFloat16:                                      \
+	{                                                                   \
+		using scalar_t = at::BFloat16;                                  \
+		__VA_ARGS__;                                                    \
+		break;                                                          \
+	}                                                                   \
+	default:                                                            \
+		AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
+	}
+
+/* Two-type dispatch helper: defines `scalar_t_in` and `scalar_t_out` before
+ * expanding the trailing argument(s).
+ *   - Float input: TYPEOUT selects the output alias (Float, Half or BFloat16);
+ *     any other TYPEOUT goes to AT_ERROR.
+ *   - Half / BFloat16 input: the output alias is set to the input type and
+ *     TYPEOUT is not examined.
+ *   - Any other TYPEIN goes to AT_ERROR.
+ * Note: NAME is stringified with #NAME, so a string-literal NAME appears with
+ * its quotes in the error message. */
+#define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
+	switch (TYPEIN)                                                            \
+	{                                                                          \
+	case at::ScalarType::Float:                                                \
+	{                                                                          \
+		using scalar_t_in = float;                                             \
+		switch (TYPEOUT)                                                       \
+		{                                                                      \
+		case at::ScalarType::Float:                                            \
+		{                                                                      \
+			using scalar_t_out = float;                                        \
+			__VA_ARGS__;                                                       \
+			break;                                                             \
+		}                                                                      \
+		case at::ScalarType::Half:                                             \
+		{                                                                      \
+			using scalar_t_out = at::Half;                                     \
+			__VA_ARGS__;                                                       \
+			break;                                                             \
+		}                                                                      \
+		case at::ScalarType::BFloat16:                                         \
+		{                                                                      \
+			using scalar_t_out = at::BFloat16;                                 \
+			__VA_ARGS__;                                                       \
+			break;                                                             \
+		}                                                                      \
+		default:                                                               \
+			AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
+		}                                                                      \
+		break;                                                                 \
+	}                                                                          \
+	case at::ScalarType::Half:                                                 \
+	{                                                                          \
+		using scalar_t_in = at::Half;                                          \
+		using scalar_t_out = at::Half;                                         \
+		__VA_ARGS__;                                                           \
+		break;                                                                 \
+	}                                                                          \
+	case at::ScalarType::BFloat16:                                             \
+	{                                                                          \
+		using scalar_t_in = at::BFloat16;                                      \
+		using scalar_t_out = at::BFloat16;                                     \
+		__VA_ARGS__;                                                           \
+		break;                                                                 \
+	}                                                                          \
+	default:                                                                   \
+		AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'");      \
+	}
--- a/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/__init__.py
+++ b/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/__init__.py
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+
+from .filter import *
+from .resample import *
+from .act import *
--- a/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/act.py
+++ b/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/act.py
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+
+import torch.nn as nn
+from .resample import UpSample1d, DownSample1d
+
+
+class Activation1d(nn.Module):
+    def __init__(
+        self,
+        activation,
+        up_ratio: int = 2,
+        down_ratio: int = 2,
+        up_kernel_size: int = 12,
+        down_kernel_size: int = 12,
+    ):
+        super().__init__()
+        self.up_ratio = up_ratio
+        self.down_ratio = down_ratio
+        self.act = activation
+        self.upsample = UpSample1d(up_ratio, up_kernel_size)
+        self.downsample = DownSample1d(down_ratio, down_kernel_size)
+
+    # x: [B,C,T]
+    def forward(self, x):
+        x = self.upsample(x)
+        x = self.act(x)
+        x = self.downsample(x)
+
+        return x
--- a/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/filter.py
+++ b/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/filter.py
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+
+if "sinc" in dir(torch):
+    sinc = torch.sinc
+else:
+    # This code is adopted from adefossez's julius.core.sinc under the MIT License
+    # https://adefossez.github.io/julius/julius/core.html
+    #   LICENSE is in incl_licenses directory.
+    def sinc(x: torch.Tensor):
+        """
+        Implementation of sinc, i.e. sin(pi * x) / (pi * x)
+        __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
+        """
+        return torch.where(
+            x == 0,
+            torch.tensor(1.0, device=x.device, dtype=x.dtype),
+            torch.sin(math.pi * x) / math.pi / x,
+        )
+
+
+# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
+# https://adefossez.github.io/julius/julius/lowpass.html
+#   LICENSE is in incl_licenses directory.
+def kaiser_sinc_filter1d(
+    cutoff, half_width, kernel_size
+):  # return filter [1,1,kernel_size]
+    even = kernel_size % 2 == 0
+    half_size = kernel_size // 2
+
+    # For kaiser window
+    delta_f = 4 * half_width
+    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
+    if A > 50.0:
+        beta = 0.1102 * (A - 8.7)
+    elif A >= 21.0:
+        beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0)
+    else:
+        beta = 0.0
+    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)
+
+    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
+    if even:
+        time = torch.arange(-half_size, half_size) + 0.5
+    else:
+        time = torch.arange(kernel_size) - half_size
+    if cutoff == 0:
+        filter_ = torch.zeros_like(time)
+    else:
+        filter_ = 2 * cutoff * window * sinc(2 * cutoff * time)
+        """
+        Normalize filter to have sum = 1, otherwise we will have a small leakage of the constant component in the input signal.
+        """
+        filter_ /= filter_.sum()
+        filter = filter_.view(1, 1, kernel_size)
+
+    return filter
+
+
+class LowPassFilter1d(nn.Module):
+    def __init__(
+        self,
+        cutoff=0.5,
+        half_width=0.6,
+        stride: int = 1,
+        padding: bool = True,
+        padding_mode: str = "replicate",
+        kernel_size: int = 12,
+    ):
+        """
+        kernel_size should be even number for stylegan3 setup, in this implementation, odd number is also possible.
+        """
+        super().__init__()
+        if cutoff < -0.0:
+            raise ValueError("Minimum cutoff must be larger than zero.")
+        if cutoff > 0.5:
+            raise ValueError("A cutoff above 0.5 does not make sense.")
+        self.kernel_size = kernel_size
+        self.even = kernel_size % 2 == 0
+        self.pad_left = kernel_size // 2 - int(self.even)
+        self.pad_right = kernel_size // 2
+        self.stride = stride
+        self.padding = padding
+        self.padding_mode = padding_mode
+        filter = kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
+        self.register_buffer("filter", filter)
+
+    # Input [B, C, T]
+    def forward(self, x):
+        _, C, _ = x.shape
+
+        if self.padding:
+            x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
+        out = F.conv1d(x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C)
+
+        return out
--- a/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/resample.py
+++ b/kimia_infer/models/detokenizer/vocoder/alias_free_activation/torch/resample.py
+# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
+#   LICENSE is in incl_licenses directory.
+
+import torch.nn as nn
+from torch.nn import functional as F
+from .filter import LowPassFilter1d
+from .filter import kaiser_sinc_filter1d
+
+
+class UpSample1d(nn.Module):
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = (
+            int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        )
+        self.stride = ratio
+        self.pad = self.kernel_size // ratio - 1
+        self.pad_left = self.pad * self.stride + (self.kernel_size - self.stride) // 2
+        self.pad_right = (
+            self.pad * self.stride + (self.kernel_size - self.stride + 1) // 2
+        )
+        filter = kaiser_sinc_filter1d(
+            cutoff=0.5 / ratio, half_width=0.6 / ratio, kernel_size=self.kernel_size
+        )
+        self.register_buffer("filter", filter)
+
+    # x: [B, C, T]
+    def forward(self, x):
+        _, C, _ = x.shape
+
+        x = F.pad(x, (self.pad, self.pad), mode="replicate")
+        x = self.ratio * F.conv_transpose1d(
+            x, self.filter.expand(C, -1, -1), stride=self.stride, groups=C
+        )
+        x = x[..., self.pad_left : -self.pad_right]
+
+        return x
+
+
+class DownSample1d(nn.Module):
+    def __init__(self, ratio=2, kernel_size=None):
+        super().__init__()
+        self.ratio = ratio
+        self.kernel_size = (
+            int(6 * ratio // 2) * 2 if kernel_size is None else kernel_size
+        )
+        self.lowpass = LowPassFilter1d(
+            cutoff=0.5 / ratio,
+            half_width=0.6 / ratio,
+            stride=ratio,
+            kernel_size=self.kernel_size,
+        )
+
+    def forward(self, x):
+        xx = self.lowpass(x)
+
+        return xx
--- a/kimia_infer/models/detokenizer/vocoder/bigvgan.py
+++ b/kimia_infer/models/detokenizer/vocoder/bigvgan.py
+# Copyright (c) 2024 NVIDIA CORPORATION.
+#   Licensed under the MIT license.
+
+# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
+#   LICENSE is in incl_licenses directory.
+
+import os
+import json
+from pathlib import Path
+from typing import Optional, Union, Dict
+
+import torch
+import torch.nn as nn
+from torch.nn import Conv1d, ConvTranspose1d
+from torch.nn.utils import weight_norm, remove_weight_norm
+
+from .activations import Snake, SnakeBeta
+from .utils import init_weights, get_padding
+from .alias_free_activation.torch.act import Activation1d as TorchActivation1d
+from .utils import AttrDict
+
+from huggingface_hub import PyTorchModelHubMixin, hf_hub_download
+
+
+def load_hparams_from_json(path) -> AttrDict:
+    with open(path) as f:
+        data = f.read()
+    return AttrDict(json.loads(data))
+
+
+class AMPBlock1(torch.nn.Module):
+    """
+    AMPBlock applies Snake / SnakeBeta activation functions with trainable parameters that control periodicity, defined for each layer.
+    AMPBlock1 has additional self.convs2 that contains additional Conv1d layers with a fixed dilation=1 followed by each layer in self.convs1
+
+    Args:
+        h (AttrDict): Hyperparameters.
+        channels (int): Number of convolution channels.
+        kernel_size (int): Size of the convolution kernel. Default is 3.
+        dilation (tuple): Dilation rates for the convolutions. Each dilation layer has two convolutions. Default is (1, 3, 5).
+        activation (str): Activation function type. Should be either 'snake' or 'snakebeta'. Default is None.
+    """
+
+    def __init__(
+        self,
+        h: AttrDict,
+        channels: int,
+        kernel_size: int = 3,
+        dilation: tuple = (1, 3, 5),
+        activation: str = None,
+    ):
+        super().__init__()
+
+        self.h = h
+
+        self.convs1 = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        stride=1,
+                        dilation=d,
+                        padding=get_padding(kernel_size, d),
+                    )
+                )
+                for d in dilation
+            ]
+        )
+        self.convs1.apply(init_weights)
+
+        self.convs2 = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        stride=1,
+                        dilation=1,
+                        padding=get_padding(kernel_size, 1),
+                    )
+                )
+                for _ in range(len(dilation))
+            ]
+        )
+        self.convs2.apply(init_weights)
+
+        self.num_layers = len(self.convs1) + len(
+            self.convs2
+        )  # Total number of conv layers
+
+        # Select which Activation1d, lazy-load cuda version to ensure backward compatibility
+        if self.h.get("use_cuda_kernel", False):
+            from .alias_free_activation.cuda.activation1d import (
+                Activation1d as CudaActivation1d,
+            )
+
+            Activation1d = CudaActivation1d
+        else:
+            Activation1d = TorchActivation1d
+
+        # Activation functions
+        if activation == "snake":
+            self.activations = nn.ModuleList(
+                [
+                    Activation1d(
+                        activation=Snake(channels, alpha_logscale=h.snake_logscale)
+                    )
+                    for _ in range(self.num_layers)
+                ]
+            )
+        elif activation == "snakebeta":
+            self.activations = nn.ModuleList(
+                [
+                    Activation1d(
+                        activation=SnakeBeta(channels, alpha_logscale=h.snake_logscale)
+                    )
+                    for _ in range(self.num_layers)
+                ]
+            )
+        else:
+            raise NotImplementedError(
+                "activation incorrectly specified. check the config file and look for 'activation'."
+            )
+
+    def forward(self, x):
+        acts1, acts2 = self.activations[::2], self.activations[1::2]
+        for c1, c2, a1, a2 in zip(self.convs1, self.convs2, acts1, acts2):
+            xt = a1(x)
+            xt = c1(xt)
+            xt = a2(xt)
+            xt = c2(xt)
+            x = xt + x
+
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs1:
+            remove_weight_norm(l)
+        for l in self.convs2:
+            remove_weight_norm(l)
+
+
+class AMPBlock2(torch.nn.Module):
+    """
+    AMPBlock applies Snake / SnakeBeta activation functions with trainable parameters that control periodicity, defined for each layer.
+    Unlike AMPBlock1, AMPBlock2 does not contain extra Conv1d layers with fixed dilation=1
+
+    Args:
+        h (AttrDict): Hyperparameters.
+        channels (int): Number of convolution channels.
+        kernel_size (int): Size of the convolution kernel. Default is 3.
+        dilation (tuple): Dilation rates for the convolutions. Each dilation layer has two convolutions. Default is (1, 3, 5).
+        activation (str): Activation function type. Should be either 'snake' or 'snakebeta'. Default is None.
+    """
+
+    def __init__(
+        self,
+        h: AttrDict,
+        channels: int,
+        kernel_size: int = 3,
+        dilation: tuple = (1, 3, 5),
+        activation: str = None,
+    ):
+        super().__init__()
+
+        self.h = h
+
+        self.convs = nn.ModuleList(
+            [
+                weight_norm(
+                    Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        stride=1,
+                        dilation=d,
+                        padding=get_padding(kernel_size, d),
+                    )
+                )
+                for d in dilation
+            ]
+        )
+        self.convs.apply(init_weights)
+
+        self.num_layers = len(self.convs)  # Total number of conv layers
+
+        # Select which Activation1d, lazy-load cuda version to ensure backward compatibility
+        if self.h.get("use_cuda_kernel", False):
+            from .alias_free_activation.cuda.activation1d import (
+                Activation1d as CudaActivation1d,
+            )
+
+            Activation1d = CudaActivation1d
+        else:
+            Activation1d = TorchActivation1d
+
+        # Activation functions
+        if activation == "snake":
+            self.activations = nn.ModuleList(
+                [
+                    Activation1d(
+                        activation=Snake(channels, alpha_logscale=h.snake_logscale)
+                    )
+                    for _ in range(self.num_layers)
+                ]
+            )
+        elif activation == "snakebeta":
+            self.activations = nn.ModuleList(
+                [
+                    Activation1d(
+                        activation=SnakeBeta(channels, alpha_logscale=h.snake_logscale)
+                    )
+                    for _ in range(self.num_layers)
+                ]
+            )
+        else:
+            raise NotImplementedError(
+                "activation incorrectly specified. check the config file and look for 'activation'."
+            )
+
+    def forward(self, x):
+        for c, a in zip(self.convs, self.activations):
+            xt = a(x)
+            xt = c(xt)
+            x = xt + x
+
+    def remove_weight_norm(self):
+        for l in self.convs:
+            remove_weight_norm(l)
+
+
+class BigVGAN(
+    torch.nn.Module,
+    PyTorchModelHubMixin,
+    library_name="bigvgan",
+    repo_url="https://github.com/NVIDIA/BigVGAN",
+    docs_url="https://github.com/NVIDIA/BigVGAN/blob/main/README.md",
+    pipeline_tag="audio-to-audio",
+    license="mit",
+    tags=["neural-vocoder", "audio-generation", "arxiv:2206.04658"],
+):
+    """
+    BigVGAN is a neural vocoder model that applies anti-aliased periodic activation for residual blocks (resblocks).
+    New in BigVGAN-v2: it can optionally use optimized CUDA kernels for AMP (anti-aliased multi-periodicity) blocks.
+
+    Args:
+        h (AttrDict): Hyperparameters.
+        use_cuda_kernel (bool): If set to True, loads optimized CUDA kernels for AMP. This should be used for inference only, as training is not supported with CUDA kernels.
+
+    Note:
+        - The `use_cuda_kernel` parameter should be used for inference only, as training with CUDA kernels is not supported.
+        - Ensure that the activation function is correctly specified in the hyperparameters (h.activation).
+    """
+
+    def __init__(self, h: AttrDict, use_cuda_kernel: bool = False):
+        """Build the generator layers from the hyperparameters in ``h``.
+
+        Creates, in order: the pre-conv, one transposed-conv upsampler per
+        entry of ``h.upsample_rates``, ``len(h.resblock_kernel_sizes)``
+        residual blocks per upsampling stage, the post activation and the
+        post-conv.
+
+        Note: ``h`` is modified in place -- its ``"use_cuda_kernel"`` entry is
+        overwritten with the ``use_cuda_kernel`` argument.
+
+        Raises:
+            ValueError: If ``h.resblock`` is neither ``"1"`` nor ``"2"``.
+            NotImplementedError: If ``h.activation`` is neither ``"snake"``
+                nor ``"snakebeta"``.
+        """
+        super().__init__()
+        self.h = h
+        self.h["use_cuda_kernel"] = use_cuda_kernel
+
+        # Select which Activation1d, lazy-load cuda version to ensure backward compatibility
+        if self.h.get("use_cuda_kernel", False):
+            from .alias_free_activation.cuda.activation1d import (
+                Activation1d as CudaActivation1d,
+            )
+
+            Activation1d = CudaActivation1d
+        else:
+            Activation1d = TorchActivation1d
+
+        self.num_kernels = len(h.resblock_kernel_sizes)
+        self.num_upsamples = len(h.upsample_rates)
+
+        # Pre-conv
+        self.conv_pre = weight_norm(
+            Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)
+        )
+
+        # Define which AMPBlock to use. BigVGAN uses AMPBlock1 as default
+        if h.resblock == "1":
+            resblock_class = AMPBlock1
+        elif h.resblock == "2":
+            resblock_class = AMPBlock2
+        else:
+            raise ValueError(
+                f"Incorrect resblock class specified in hyperparameters. Got {h.resblock}"
+            )
+
+        # Transposed conv-based upsamplers. does not apply anti-aliasing
+        # The channel count halves at every stage: initial // 2**i -> initial // 2**(i + 1).
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
+            self.ups.append(
+                nn.ModuleList(
+                    [
+                        weight_norm(
+                            ConvTranspose1d(
+                                h.upsample_initial_channel // (2**i),
+                                h.upsample_initial_channel // (2 ** (i + 1)),
+                                k,
+                                u,
+                                padding=(k - u) // 2,
+                            )
+                        )
+                    ]
+                )
+            )
+
+        # Residual blocks using anti-aliased multi-periodicity composition modules (AMP)
+        # Blocks are stored flat: stage i, kernel j lives at index i * num_kernels + j.
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = h.upsample_initial_channel // (2 ** (i + 1))
+            for j, (k, d) in enumerate(
+                zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)
+            ):
+                self.resblocks.append(
+                    resblock_class(h, ch, k, d, activation=h.activation)
+                )
+
+        # Post-conv
+        # `ch` still holds the channel count of the last upsampling stage here
+        # (it is unbound if there are no upsampling stages).
+        activation_post = (
+            Snake(ch, alpha_logscale=h.snake_logscale)
+            if h.activation == "snake"
+            else (
+                SnakeBeta(ch, alpha_logscale=h.snake_logscale)
+                if h.activation == "snakebeta"
+                else None
+            )
+        )
+        if activation_post is None:
+            raise NotImplementedError(
+                "activation incorrectly specified. check the config file and look for 'activation'."
+            )
+
+        self.activation_post = Activation1d(activation=activation_post)
+
+        # Whether to use bias for the final conv_post. Default to True for backward compatibility
+        self.use_bias_at_final = h.get("use_bias_at_final", True)
+        self.conv_post = weight_norm(
+            Conv1d(ch, 1, 7, 1, padding=3, bias=self.use_bias_at_final)
+        )
+
+        # Weight initialization
+        for i in range(len(self.ups)):
+            self.ups[i].apply(init_weights)
+        self.conv_post.apply(init_weights)
+
+        # Final tanh activation. Defaults to True for backward compatibility
+        self.use_tanh_at_final = h.get("use_tanh_at_final", True)
+
+    def forward(self, x):
+        # Pre-conv
+        x = self.conv_pre(x)
+
+        for i in range(self.num_upsamples):
+            # Upsampling
+            for i_up in range(len(self.ups[i])):
+                x = self.ups[i][i_up](x)
+            # AMP blocks
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+
+        # Post-conv
+        x = self.activation_post(x)
+        x = self.conv_post(x)
+        # Final tanh activation
+        if self.use_tanh_at_final:
+            x = torch.tanh(x)
+        else:
+            x = torch.clamp(x, min=-1.0, max=1.0)  # Bound the output to [-1, 1]
+
+        return x
+
+    def remove_weight_norm(self):
+        try:
+            print("Removing weight norm...")
+            for l in self.ups:
+                for l_i in l:
+                    remove_weight_norm(l_i)
+            for l in self.resblocks:
+                l.remove_weight_norm()
+            remove_weight_norm(self.conv_pre)
+            remove_weight_norm(self.conv_post)
+        except ValueError:
+            print("[INFO] Model already removed weight norm. Skipping!")
+            pass
+
+    # Additional methods for huggingface_hub support
+    def _save_pretrained(self, save_directory: Path) -> None:
+        """Save weights and config.json from a Pytorch model to a local directory."""
+
+        model_path = save_directory / "bigvgan_generator.pt"
+        torch.save({"generator": self.state_dict()}, model_path)
+
+        config_path = save_directory / "config.json"
+        with open(config_path, "w") as config_file:
+            json.dump(self.h, config_file, indent=4)
+
+    @classmethod
+    def _from_pretrained(
+        cls,
+        *,
+        model_id: str,
+        revision: str,
+        cache_dir: str,
+        force_download: bool,
+        proxies: Optional[Dict],
+        resume_download: bool,
+        local_files_only: bool,
+        token: Union[str, bool, None],
+        map_location: str = "cpu",  # Additional argument
+        strict: bool = False,  # Additional argument
+        use_cuda_kernel: bool = False,
+        **model_kwargs,
+    ):
+        """Load Pytorch pretrained weights and return the loaded model."""
+
+        # Download and load hyperparameters (h) used by BigVGAN
+        if os.path.isdir(model_id):
+            print("Loading config.json from local directory")
+            config_file = os.path.join(model_id, "config.json")
+        else:
+            config_file = hf_hub_download(
+                repo_id=model_id,
+                filename="config.json",
+                revision=revision,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                resume_download=resume_download,
+                token=token,
+                local_files_only=local_files_only,
+            )
+        h = load_hparams_from_json(config_file)
+
+        # instantiate BigVGAN using h
+        if use_cuda_kernel:
+            print(
+                f"[WARNING] You have specified use_cuda_kernel=True during BigVGAN.from_pretrained(). Only inference is supported (training is not implemented)!"
+            )
+            print(
+                f"[WARNING] You need nvcc and ninja installed in your system that matches your PyTorch build is using to build the kernel. If not, the model will fail to initialize or generate incorrect waveform!"
+            )
+            print(
+                f"[WARNING] For detail, see the official GitHub repository: https://github.com/NVIDIA/BigVGAN?tab=readme-ov-file#using-custom-cuda-kernel-for-synthesis"
+            )
+        model = cls(h, use_cuda_kernel=use_cuda_kernel)
+
+        # Download and load pretrained generator weight
+        if os.path.isdir(model_id):
+            print("Loading weights from local directory")
+            model_file = os.path.join(model_id, "bigvgan_generator.pt")
+        else:
+            print(f"Loading weights from {model_id}")
+            model_file = hf_hub_download(
+                repo_id=model_id,
+                filename="bigvgan_generator.pt",
+                revision=revision,
+                cache_dir=cache_dir,
+                force_download=force_download,
+                proxies=proxies,
+                resume_download=resume_download,
+                token=token,
+                local_files_only=local_files_only,
+            )
+
+        checkpoint_dict = torch.load(
+            model_file, map_location=map_location, weights_only=True
+        )
+
+        try:
+            model.load_state_dict(checkpoint_dict["generator"])
+        except RuntimeError:
+            print(
+                f"[INFO] the pretrained checkpoint does not contain weight norm. Loading the checkpoint after removing weight norm!"
+            )
+            model.remove_weight_norm()
+            model.load_state_dict(checkpoint_dict["generator"])
+
+        return model
--- a/kimia_infer/models/detokenizer/vocoder/utils.py
+++ b/kimia_infer/models/detokenizer/vocoder/utils.py
+from librosa.filters import mel as librosa_mel_fn
+import torch
+import os
+
+# Module-level caches filled by get_melspec. Both use the same string key,
+# built from the STFT/mel parameters and the input tensor's device.
+mel_basis_cache = {}
+hann_window_cache = {}
+
+
+def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+    return torch.log(torch.clamp(x, min=clip_val) * C)
+
+
+def spectral_normalize_torch(magnitudes):
+    return dynamic_range_compression_torch(magnitudes)
+
+
+def get_melspec(
+    y: torch.Tensor,
+    n_fft: int,
+    num_mels: int,
+    sampling_rate: int,
+    hop_size: int,
+    win_size: int,
+    fmin: int,
+    fmax: int = None,
+    center: bool = False,
+) -> torch.Tensor:
+    """
+    Calculate the mel spectrogram of an input signal.
+    This function uses slaney norm for the librosa mel filterbank (using librosa.filters.mel) and uses Hann window for STFT (using torch.stft).
+
+    Args:
+        y (torch.Tensor): Input signal.
+        n_fft (int): FFT size.
+        num_mels (int): Number of mel bins.
+        sampling_rate (int): Sampling rate of the input signal.
+        hop_size (int): Hop size for STFT.
+        win_size (int): Window size for STFT.
+        fmin (int): Minimum frequency for mel filterbank.
+        fmax (int): Maximum frequency for mel filterbank. If None, defaults to half the sampling rate (fmax = sr / 2.0) inside librosa_mel_fn
+        center (bool): Whether to pad the input to center the frames. Default is False.
+
+    Returns:
+        torch.Tensor: Mel spectrogram.
+    """
+    if torch.min(y) < -1.0:
+        print(f"[WARNING] Min value of input waveform signal is {torch.min(y)}")
+    if torch.max(y) > 1.0:
+        print(f"[WARNING] Max value of input waveform signal is {torch.max(y)}")
+
+    device = y.device
+    key = f"{n_fft}_{num_mels}_{sampling_rate}_{hop_size}_{win_size}_{fmin}_{fmax}_{device}"
+
+    if key not in mel_basis_cache:
+        mel = librosa_mel_fn(
+            sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
+        )
+        mel_basis_cache[key] = torch.from_numpy(mel).float().to(device)
+        hann_window_cache[key] = torch.hann_window(win_size).to(device)
+
+    mel_basis = mel_basis_cache[key]
+    hann_window = hann_window_cache[key]
+
+    padding = (n_fft - hop_size) // 2
+    y = torch.nn.functional.pad(
+        y.unsqueeze(1), (padding, padding), mode="reflect"
+    ).squeeze(1)
+
+    spec = torch.stft(
+        y,
+        n_fft,
+        hop_length=hop_size,
+        win_length=win_size,
+        window=hann_window,
+        center=center,
+        pad_mode="reflect",
+        normalized=False,
+        onesided=True,
+        return_complex=True,
+    )
+    spec = torch.sqrt(torch.view_as_real(spec).pow(2).sum(-1) + 1e-9)
+
+    mel_spec = torch.matmul(mel_basis, spec)
+    mel_spec = spectral_normalize_torch(mel_spec)
+
+    return mel_spec
+
+
+class AttrDict(dict):
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        self.__dict__ = self
+
+
+def load_checkpoint(filepath, device):
+    assert os.path.isfile(filepath)
+    print(f"Loading '{filepath}'")
+    checkpoint_dict = torch.load(filepath, map_location=device, weights_only=True)
+    print("Complete.")
+    return checkpoint_dict
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    classname = m.__class__.__name__
+    if classname.find("Conv") != -1:
+        m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+    return int((kernel_size * dilation - dilation) / 2)
--- a/kimia_infer/models/tokenizer/__init__.py
+++ b/kimia_infer/models/tokenizer/__init__.py
--- a/glm4 @ eb00ce91
+++ b/glm4 @ eb00ce91
+Subproject commit eb00ce9142e8d98b0ed7c57cd47e0d6d5dce9a1a
--- a/kimia_infer/models/tokenizer/glm4_tokenizer.py
+++ b/kimia_infer/models/tokenizer/glm4_tokenizer.py
+import torch
+import librosa
+import os
+
+from transformers import WhisperFeatureExtractor
+from .glm4.speech_tokenizer.modeling_whisper import WhisperVQEncoder
+from .glm4.speech_tokenizer.utils import extract_speech_token
+from torch import nn
+
+
+class Glm4Tokenizer(nn.Module):
+    def __init__(self, tokenizer_path):
+        super().__init__()
+        self.whisper_model = WhisperVQEncoder.from_pretrained(tokenizer_path).eval()
+        self.feature_extractor = WhisperFeatureExtractor.from_pretrained(tokenizer_path)
+
+    def tokenize(self, speech=None, audio_path=None, sr=16000):
+        if audio_path:
+            audio, sr = librosa.load(audio_path, sr=16000)
+            audio = torch.tensor(audio).unsqueeze(0)
+            audio_info = (audio, sr)
+        else:
+            assert speech is not None
+            assert sr
+            if isinstance(speech, list):
+                speech = torch.tensor(speech).unsqueeze(0)
+            if len(speech.shape) == 1:
+                speech = speech.unsqueeze(0)
+            audio_info = (speech, sr)
+
+        audio_tokens = extract_speech_token(
+            self.whisper_model, self.feature_extractor, [audio_info]
+        )[0]
+        audio_tokens = torch.tensor(audio_tokens).unsqueeze(0)
+        return audio_tokens
--- a/kimia_infer/models/tokenizer/whisper_Lv3/mel_filters.npz
+++ b/kimia_infer/models/tokenizer/whisper_Lv3/mel_filters.npz