"...git@developer.sourcefind.cn:modelzoo/stylegan2_mmcv.git" did not exist on "1401de15d079af4d9d9f995f2d57ddb6d930d7f0"
Commit ecd710b6 authored by wangzhengtao's avatar wangzhengtao
Browse files

init push

parents
import yaml
import logging
import time
import os
import torch
from .flow_matching.ode_wrapper import StreamingODEWrapperForPrefix
from .flow_matching.model import DiTPrefix
from .flow_matching.scheduler import StreamingFlowMatchingScheduler
logger = logging.getLogger(__name__)
class StreamingSemanticFMWrapper:
    """Streaming flow-matching sampler that turns semantic tokens into mel frames.

    The wrapper owns a ``DiTPrefix`` network (cast to bfloat16, eval mode), a
    ``StreamingFlowMatchingScheduler`` and a ``StreamingODEWrapperForPrefix``.
    It carries two pieces of state across chunks: ``start_position_id`` (the
    position id of the next frame) and ``condition_cache`` (the dict returned
    by ``ode_wrapper.set_conditions``).
    """

    def __init__(
        self,
        speech_model: "DiTPrefix",
        max_kv_cache_tokens=900,
        max_prompt_chunk=2,
        use_cfg=True,
        use_cfg_rescale=True,
        cfg_init=1.5,
        cfg_scale=7.5,
        cfg_schedule="linear",
        cfg_token_id=0,
        normalize_mel=False,
        mel_mean=None,
        mel_std=None,
        device: torch.device = torch.device("cpu"),
    ) -> None:
        """Move the model to ``device``/bfloat16 and build scheduler and ODE wrapper.

        Args:
            speech_model: the DiT network to sample with.
            max_kv_cache_tokens: forwarded to ``ode_wrapper.update_incremental_state``.
            max_prompt_chunk: maximum number of full-size chunks used by ``prefill``.
            use_cfg, use_cfg_rescale, cfg_init, cfg_scale, cfg_schedule, cfg_token_id:
                classifier-free-guidance settings, forwarded to the ODE wrapper.
            normalize_mel: if True, ``prefill_chunk`` normalizes mels with
                ``(mel - mel_mean) / mel_std`` and ``infer_chunk`` undoes it.
            mel_mean, mel_std: normalization statistics (only used when
                ``normalize_mel`` is True).
            device: device the model and all intermediate tensors live on.
        """
        self.dtype = torch.bfloat16
        self.speech_model = speech_model.to(device).to(self.dtype)
        self.speech_model = self.speech_model.eval()
        self.device = device
        self.normalize_mel = normalize_mel
        self.mel_mean = mel_mean
        self.mel_std = mel_std
        self.use_cfg = use_cfg
        self.use_cfg_rescale = use_cfg_rescale
        self.cfg_init = cfg_init
        self.cfg_scale = cfg_scale
        self.cfg_schedule = cfg_schedule
        self.incremental_state = {}
        self.condition_cache = {"previous_seqlen": 0}
        # Initialise the running position so infer_mel / prefill / state_dict work
        # on a fresh instance without requiring clear_all_states() first.
        self.start_position_id = 0
        logger.info(
            f">>> SemanticFMWrapper initialized with use_cfg={use_cfg}, use_cfg_rescale={use_cfg_rescale}, cfg_init={cfg_init}, cfg_scale={cfg_scale}, cfg_schedule={cfg_schedule}"
        )
        self.scheduler = StreamingFlowMatchingScheduler()
        self.ode_wrapper = StreamingODEWrapperForPrefix(
            net=self.speech_model,
            x_mask=None,
            x_cond=None,
            use_cfg=use_cfg,
            use_cfg_rescale=use_cfg_rescale,
            cfg_init=cfg_init,
            cfg_scale=cfg_scale,
            cfg_schedule=cfg_schedule,
            cfg_token_id=cfg_token_id,
        )
        self.max_kv_cache_tokens = max_kv_cache_tokens
        self.max_prompt_chunk = max_prompt_chunk
        self.reserve_kv_cache_tokens = 0

    @torch.inference_mode()
    def infer_chunk(
        self,
        xt_chunk,
        semantic_tokens_chunk,
        start_position_id,
        cache=None,
        look_ahead_tokens=0,
        ode_steps=15,
        verbose=False,
        ode_solver="neural_ode_euler",
    ):
        """Sample the mel frames for one chunk.

        Args:
            xt_chunk: [T, 80] torch.Tensor, initial noise for the chunk. DO NOT
                normalize it outside.
            semantic_tokens_chunk: [T] torch.LongTensor, conditioning tokens.
            start_position_id: position id of the first frame of the chunk.
            cache: optional dict. When ``look_ahead_tokens > 0`` the trailing
                look-ahead tokens are stored in it under ``"semantic_token"``.
                May be None, in which case nothing is stored.
            look_ahead_tokens: number of trailing frames that are sampled but
                not returned.
            ode_steps: int, number of ODE steps, default 15.
            verbose: bool, log the time spent in this chunk.
            ode_solver: str, expected in ("neural_ode_euler", "naive_euler").

        Returns:
            [T - look_ahead_tokens, 80] tensor; de-normalized with
            ``mel_std``/``mel_mean`` when ``normalize_mel`` is set.

        Raises:
            NotImplementedError: if ``ode_solver`` is not a supported name.
        """
        bs = 1
        self.scheduler.set_timesteps(ode_steps)
        semantic_tokens_chunk = semantic_tokens_chunk.unsqueeze(0).to(self.device)
        xt_chunk = xt_chunk.unsqueeze(0).to(self.device).to(self.dtype)
        t_span = torch.linspace(0, 1, self.scheduler.timesteps)
        x_mask = torch.zeros(bs, xt_chunk.shape[1], device=self.device).bool()
        cache_ret = self.ode_wrapper.set_conditions(
            x_mask=x_mask,
            x_cond=semantic_tokens_chunk,
            start_position_id=start_position_id,
            cache=self.condition_cache,
        )
        if verbose:
            t_start = time.time()
        if ode_solver == "neural_ode_euler":
            x_t = self.scheduler.sample_by_neuralode(
                self.ode_wrapper, time_steps=t_span, xt=xt_chunk, verbose=False
            )
        elif ode_solver == "naive_euler":
            x_t = self.scheduler.sample(
                ode_wrapper=self.ode_wrapper,
                time_steps=t_span,
                xt=xt_chunk,
                verbose=False,
            )
        else:
            raise NotImplementedError(
                "ode_solver should be in ('neural_ode_euler', 'naive_euler')"
            )
        if look_ahead_tokens > 0:
            semantic_tokens_left = semantic_tokens_chunk.view(-1)[-look_ahead_tokens:]
            # `cache` defaults to None; only hand the look-ahead tokens back when the
            # caller actually supplied a dict to receive them.
            if cache is not None:
                cache["semantic_token"] = semantic_tokens_left
            x_t_ret = x_t[:, :-look_ahead_tokens, :]
            # Re-set the conditions on the kept prefix only and run the network once
            # on the kept frames at t=0.999.
            # NOTE(review): presumably this refreshes the wrapper's cached state so it
            # excludes the look-ahead frames — confirm against the ODE wrapper.
            x_mask = torch.zeros(
                bs, xt_chunk.shape[1] - look_ahead_tokens, device=self.device
            ).bool()
            self.condition_cache = self.ode_wrapper.set_conditions(
                x_mask=x_mask,
                x_cond=semantic_tokens_chunk[:, :-look_ahead_tokens],
                start_position_id=start_position_id,
                cache=self.condition_cache,
            )
            self.ode_wrapper(torch.Tensor([0.999]).to(x_t_ret.device), x_t_ret)
        else:
            x_t_ret = x_t
            self.condition_cache = cache_ret
        if verbose:
            t_end = time.time()
            logger.info(f"[ODE Chunk] Time cost: {t_end - t_start}")
        if self.normalize_mel:
            x_t_ret = x_t_ret * self.mel_std + self.mel_mean
        return x_t_ret.squeeze(0)

    @torch.inference_mode()
    def infer_mel(
        self,
        semantic_tokens,
        ode_steps=15,
        chunk_size=150,
        verbose=False,
        ode_solver="neural_ode_euler",
    ):
        """Sample mel frames for a whole token sequence, ``chunk_size`` frames at a time.

        Args:
            semantic_tokens: [T] torch.LongTensor.
            ode_steps: int, number of ODE steps, default 15.
            chunk_size: int, number of frames sampled per ``infer_chunk`` call.
            verbose: bool, log timing information.
            ode_solver: str, expected in ("neural_ode_euler", "naive_euler").

        Returns:
            [T, 80] tensor with the chunk outputs concatenated in order. An
            empty input yields an empty [0, 80] tensor.
        """
        assert semantic_tokens.dim() == 1
        x_t = torch.randn(semantic_tokens.shape[0], 80).to(self.device).to(self.dtype)
        seq_len = semantic_tokens.shape[0]
        if seq_len == 0:
            # Nothing to sample; torch.cat on an empty list would raise.
            return x_t
        num_chunks = (seq_len + chunk_size - 1) // chunk_size  # ceil division
        x_pred_collect = []
        if verbose:
            t_start = time.time()
        for chunk_id in range(num_chunks):
            start = chunk_id * chunk_size
            end = min(start + chunk_size, seq_len)
            semantic_tokens_chunk = semantic_tokens[start:end]
            x_t_chunk = x_t[start:end, :]
            x_pred = self.infer_chunk(
                xt_chunk=x_t_chunk,
                semantic_tokens_chunk=semantic_tokens_chunk,
                start_position_id=self.start_position_id,
                ode_steps=ode_steps,
                verbose=verbose,
                ode_solver=ode_solver,
            )
            # Advance the running position and let the ODE wrapper update its
            # incremental state before the next chunk.
            self.start_position_id += end - start
            self.update_incremental_state()
            x_pred_collect.append(x_pred)
        if verbose:
            t_end = time.time()
            logger.info(f"[ODE] Time cost: {t_end - t_start}")
        x_pred = torch.cat(x_pred_collect, dim=0)
        return x_pred

    def clear_all_states(self):
        """Reset the position counter, the condition cache and the ODE wrapper state."""
        self.start_position_id = 0
        self.condition_cache = {"previous_seqlen": 0}
        self.ode_wrapper.clear_all_states()

    def state_dict(self):
        """Return the streaming state (position, ODE wrapper state, condition cache)."""
        return {
            "start_position_id": self.start_position_id,
            "ode_wrapper": self.ode_wrapper.state_dict(),
            "condition_cache": self.condition_cache,
        }

    def load_state_dict(self, state_dict):
        """Restore a state produced by ``state_dict``; ``None`` is a no-op."""
        if state_dict is not None:
            self.start_position_id = state_dict["start_position_id"]
            self.ode_wrapper.load_state_dict(state_dict["ode_wrapper"])
            self.condition_cache = state_dict["condition_cache"]

    def update_incremental_state(self):
        """Ask the ODE wrapper to update its incremental state for the current cache."""
        self.ode_wrapper.update_incremental_state(
            reserve_kv_cache_tokens=0,
            max_kv_cache_tokens=self.max_kv_cache_tokens,
            condition_cache=self.condition_cache,
        )

    @torch.inference_mode()
    def prefill(self, mel, semantic_token, chunk_size=150, verbose=False):
        """Feed a prompt (mel + aligned semantic tokens) through the model.

        The prompt is split into a leading remainder followed by at most
        ``max_prompt_chunk`` chunks of exactly ``chunk_size`` frames; the
        remainder is prefilled first, then each full chunk in order.

        Args:
            mel: [T, 80] torch.Tensor.
            semantic_token: [T] torch.LongTensor.
            chunk_size: int, default 150.
            verbose: bool, log chunk count and elapsed time.
        """
        assert mel.dim() == 2
        assert semantic_token.dim() == 1
        assert (
            semantic_token.shape[0] == mel.shape[0]
        ), "Semantic token and mel shape mismatch"
        seq_len = mel.shape[0]
        num_chunks = min(seq_len // chunk_size, self.max_prompt_chunk)
        # Everything that does not fit into the trailing full-size chunks.
        start_pos = seq_len - num_chunks * chunk_size
        res_mel = mel[:start_pos, :]
        res_semantic_token = semantic_token[:start_pos]
        self.prefill_chunk(
            res_mel, res_semantic_token, start_position_id=self.start_position_id
        )
        self.start_position_id += start_pos
        self.update_incremental_state()
        self.reserve_kv_cache_tokens += self.ode_wrapper.kv_cache_tokens
        if verbose:
            logger.info("Prefilling prompt with {} chunks".format(num_chunks))
            start_time = time.time()
        for chunk_id in range(num_chunks):
            start = start_pos + chunk_id * chunk_size
            end = start + chunk_size
            mel_chunk = mel[start:end, :]
            semantic_token_chunk = semantic_token[start:end]
            self.prefill_chunk(
                mel_chunk,
                semantic_token_chunk,
                start_position_id=self.start_position_id,
            )
            self.start_position_id += end - start
            self.update_incremental_state()
            self.reserve_kv_cache_tokens += self.ode_wrapper.kv_cache_tokens
        if verbose:
            logger.info(
                "Prefilling done in {:.2f} seconds".format(time.time() - start_time)
            )

    def prefill_chunk(self, mel_chunk, semantic_tokens_chunk, start_position_id=0):
        """Run the network once on a known mel chunk at t=0.999.

        Args:
            mel_chunk: [T, 80] torch.Tensor, T is the chunk size.
            semantic_tokens_chunk: [T] torch.LongTensor.
            start_position_id: int, default 0.
        """
        bs = 1
        semantic_tokens_chunk = semantic_tokens_chunk.unsqueeze(0).to(self.device)
        mel_chunk = mel_chunk.unsqueeze(0).to(self.device).to(self.dtype)
        if self.normalize_mel:
            mel_chunk = (mel_chunk - self.mel_mean) / self.mel_std
        x_mask = torch.zeros(bs, mel_chunk.shape[1], device=self.device).bool()
        self.condition_cache = self.ode_wrapper.set_conditions(
            x_mask=x_mask,
            x_cond=semantic_tokens_chunk,
            start_position_id=start_position_id,
            cache=self.condition_cache,
        )
        x_t = torch.Tensor([0.999]).to(self.device)
        self.ode_wrapper(x_t, mel_chunk)

    @classmethod
    def from_pretrained(
        cls,
        model_config,
        ckpt_path,
        device,
        max_prompt_chunk=2,
        max_kv_cache_tokens=900,
        use_cfg=True,
        use_cfg_rescale=True,
        cfg_init=1.5,
        cfg_scale=7.5,
        cfg_schedule="linear",
    ):
        """Build the wrapper from a YAML config file and a checkpoint.

        Args:
            model_config: path to a YAML file; the DiT settings are read from
                ``config["model"]["dit"]`` and the mel statistics from the
                top-level ``normalize_mel`` / ``mel_mean`` / ``mel_std`` keys.
            ckpt_path: checkpoint path; its ``"state_dict"`` entry is filtered
                to keys containing ``"speech_model"``.
            device: device to place the model on.
            max_prompt_chunk, max_kv_cache_tokens, use_cfg, use_cfg_rescale,
                cfg_init, cfg_scale, cfg_schedule: forwarded to ``__init__``.

        Returns:
            A ready-to-use instance of ``cls``.
        """
        # open yaml file
        with open(model_config, "r") as f:
            config = yaml.safe_load(f)
        model_config = config["model"]["dit"]
        dit = DiTPrefix(
            input_size=model_config["input_size"],
            # one extra vocabulary slot is reserved for the CFG token id
            semantic_vocab_size=model_config["semantic_vocab_size"] + 1,
            hidden_size=model_config["hidden_size"],
            depth=model_config["depth"],
            num_heads=model_config["num_heads"],
            mlp_ratio=model_config["mlp_ratio"],
            ffn_type=model_config.get("ffn_type", "conv1d_conv1d"),
            ffn_gated_glu=model_config.get("ffn_gated_glu", True),
            ffn_act_layer=model_config.get("ffn_act_layer", "gelu"),
            ffn_conv_kernel_size=model_config.get("ffn_conv_kernel_size", 5),
            use_rope=model_config.get("use_rope", False),
            rope_params=model_config.get(
                "rope_params",
                {
                    "max_position_embeddings": 4096,
                    "rope_base": 10000,
                    "rope_interpolation_factor": 1,
                },
            ),
            position_embedding_type=model_config["position_embedding_type"],
            max_seq_len=model_config["max_seq_len"],
            output_size=model_config["input_size"],
            prompt_cfg_dropout=0,
        )
        cfg_semantic_token_id = model_config["semantic_vocab_size"]
        # load state_dict
        state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)[
            "state_dict"
        ]
        speech_model_params = {
            k.replace("speech_model.", ""): v
            for k, v in state_dict.items()
            if "speech_model" in k
        }
        dit.load_state_dict(speech_model_params, strict=True)
        logger.info(f">>> Loaded checkpoint from {ckpt_path}")
        return cls(
            speech_model=dit,
            device=device,
            normalize_mel=config["normalize_mel"],
            mel_mean=config["mel_mean"],
            mel_std=config["mel_std"],
            max_prompt_chunk=max_prompt_chunk,
            max_kv_cache_tokens=max_kv_cache_tokens,
            use_cfg=use_cfg,
            use_cfg_rescale=use_cfg_rescale,
            cfg_init=cfg_init,
            cfg_scale=cfg_scale,
            cfg_schedule=cfg_schedule,
            cfg_token_id=cfg_semantic_token_id,
        )
import torch
from torch import nn, sin, pow
from torch.nn import Parameter
class Snake(nn.Module):
    """
    Sine-based periodic activation, applied elementwise:
    ``Snake(x) = x + (1 / alpha) * sin^2(alpha * x)``.
    Shape:
        - Input: (B, C, T)
        - Output: (B, C, T), same shape as the input
    Parameters:
        - alpha - trainable per-channel parameter
    References:
        - This activation function is from this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
        https://arxiv.org/abs/2006.08195
    Examples:
        >>> a1 = Snake(256)
        >>> x = torch.randn(1, 256, 100)
        >>> x = a1(x)
    """

    def __init__(
        self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
    ):
        """
        INPUT:
            - in_features: shape of the input (number of channels)
            - alpha: initial scale of the trainable parameter; higher values =
              higher-frequency. In log-scale mode the parameter starts at zero.
            - alpha_trainable: whether alpha receives gradients
            - alpha_logscale: store alpha in log scale (exp is applied in forward)
        """
        super().__init__()
        self.in_features = in_features
        self.alpha_logscale = alpha_logscale
        # Log-scale parameters start from zeros, linear-scale ones from ones;
        # both are multiplied by `alpha`.
        if alpha_logscale:
            initial = torch.zeros(in_features)
        else:
            initial = torch.ones(in_features)
        self.alpha = Parameter(initial * alpha)
        self.alpha.requires_grad = alpha_trainable
        # small constant added to the denominator in forward
        self.no_div_by_zero = 0.000000001

    def forward(self, x):
        """
        Apply Snake elementwise: x + 1/a * sin^2(x * a).
        """
        # Reshape alpha to [1, C, 1] so it lines up with x of shape [B, C, T].
        freq = self.alpha.unsqueeze(0).unsqueeze(-1)
        if self.alpha_logscale:
            freq = torch.exp(freq)
        periodic = torch.sin(x * freq) ** 2
        return x + (1.0 / (freq + self.no_div_by_zero)) * periodic
class SnakeBeta(nn.Module):
    """
    Snake variant with a separate magnitude parameter, applied elementwise:
    ``SnakeBeta(x) = x + (1 / beta) * sin^2(alpha * x)``.
    Shape:
        - Input: (B, C, T)
        - Output: (B, C, T), same shape as the input
    Parameters:
        - alpha - trainable parameter that controls frequency
        - beta - trainable parameter that controls magnitude
    References:
        - This activation function is a modified version based on this paper by Liu Ziyin, Tilman Hartwig, Masahito Ueda:
        https://arxiv.org/abs/2006.08195
    Examples:
        >>> a1 = SnakeBeta(256)
        >>> x = torch.randn(1, 256, 100)
        >>> x = a1(x)
    """

    def __init__(
        self, in_features, alpha=1.0, alpha_trainable=True, alpha_logscale=False
    ):
        """
        INPUT:
            - in_features: shape of the input (number of channels)
            - alpha: initial scale used for BOTH alpha (frequency) and beta
              (magnitude). In log-scale mode both start at zero.
            - alpha_trainable: whether alpha and beta receive gradients
            - alpha_logscale: store both parameters in log scale
        """
        super().__init__()
        self.in_features = in_features
        self.alpha_logscale = alpha_logscale
        # zeros for log scale, ones for linear scale; alpha and beta get
        # independent tensors built from the same recipe.
        make_base = torch.zeros if alpha_logscale else torch.ones
        self.alpha = Parameter(make_base(in_features) * alpha)
        self.beta = Parameter(make_base(in_features) * alpha)
        self.alpha.requires_grad = alpha_trainable
        self.beta.requires_grad = alpha_trainable
        # small constant added to the denominator in forward
        self.no_div_by_zero = 0.000000001

    def forward(self, x):
        """
        Apply SnakeBeta elementwise: x + 1/b * sin^2(x * a).
        """
        # Reshape both parameters to [1, C, 1] so they line up with x of shape [B, C, T].
        freq = self.alpha.unsqueeze(0).unsqueeze(-1)
        magnitude = self.beta.unsqueeze(0).unsqueeze(-1)
        if self.alpha_logscale:
            freq = torch.exp(freq)
            magnitude = torch.exp(magnitude)
        periodic = torch.sin(x * freq) ** 2
        return x + (1.0 / (magnitude + self.no_div_by_zero)) * periodic
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.
import torch
import torch.nn as nn
from ..torch.resample import UpSample1d, DownSample1d
# load fused CUDA kernel: this enables importing anti_alias_activation_cuda
from . import load
# Runs at import time: load.load() returns the extension module whose `forward`
# is called by FusedAntiAliasActivation below.
anti_alias_activation_cuda = load.load()
class FusedAntiAliasActivation(torch.autograd.Function):
    """
    Autograd wrapper around the fused CUDA kernel (forward only).

    Assumes filter size 12, replication padding on upsampling/downsampling, and logscale alpha/beta parameters as inputs.
    The hyperparameters are hard-coded in the kernel to maximize speed.
    NOTE: The fused kernel is incorrect for Activation1d with different hyperparameters.
    """

    @staticmethod
    def forward(ctx, inputs, up_ftr, down_ftr, alpha, beta):
        """Run the fused upsample -> snake -> downsample kernel on ``inputs``.

        ``alpha`` and ``beta`` must already be in log scale; ``up_ftr`` and
        ``down_ftr`` are the filter tensors of the up/down-sampling modules.
        """
        activation_results = anti_alias_activation_cuda.forward(
            inputs, up_ftr, down_ftr, alpha, beta
        )
        return activation_results

    @staticmethod
    def backward(ctx, output_grads):
        """The fused kernel has no backward pass; always raises NotImplementedError."""
        # The original had an unreachable `return output_grads, None, None` after the
        # raise (which also had the wrong arity for five forward inputs); removed.
        raise NotImplementedError
class Activation1d(nn.Module):
    """Anti-aliased activation: upsample -> activation -> downsample.

    With ``fused=True`` the three steps are executed by the fused CUDA kernel
    (``FusedAntiAliasActivation``); otherwise they are run as separate modules.
    """

    def __init__(
        self,
        activation,
        up_ratio: int = 2,
        down_ratio: int = 2,
        up_kernel_size: int = 12,
        down_kernel_size: int = 12,
        fused: bool = True,
    ):
        super().__init__()
        self.up_ratio = up_ratio
        self.down_ratio = down_ratio
        self.act = activation
        self.upsample = UpSample1d(up_ratio, up_kernel_size)
        self.downsample = DownSample1d(down_ratio, down_kernel_size)
        # Whether to use fused CUDA kernel or not
        self.fused = fused

    def forward(self, x):
        # Unfused path: plain module composition.
        if not self.fused:
            return self.downsample(self.act(self.upsample(x)))

        activation = self.act
        alpha = activation.alpha.data
        if activation.__class__.__name__ == "Snake":
            # Snake uses same params for alpha and beta
            beta = activation.alpha.data
        else:
            # Snakebeta uses different params for alpha and beta
            beta = activation.beta.data
        if not activation.alpha_logscale:
            # Exp baked into cuda kernel, cancel it out with a log
            alpha = torch.log(alpha)
            beta = torch.log(beta)
        return FusedAntiAliasActivation.apply(
            x, self.upsample.filter, self.downsample.lowpass.filter, alpha, beta
        )
/* coding=utf-8
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <torch/extension.h>
// Declaration of the entry point that launches the CUDA kernel; the definition
// with this signature lives in the accompanying .cu translation unit.
extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta);
// Python binding: exposes fwd_cuda as `forward` on the extension module.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &fwd_cuda, "Anti-Alias Activation forward (CUDA)");
}
\ No newline at end of file
/* coding=utf-8
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <ATen/cuda/CUDAContext.h>
#include <torch/extension.h>
#include "type_shim.h"
#include <assert.h>
#include <cfloat>
#include <limits>
#include <stdint.h>
#include <c10/macros/Macros.h>
namespace
{
// Hard-coded hyperparameters
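// NOTE: a truncated leftover comment stood here ("WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and ..."); neither name is defined or used in this file.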
constexpr int ELEMENTS_PER_LDG_STG = 1; //(WARP_ITERATIONS < 4) ? 1 : 4;
constexpr int BUFFER_SIZE = 32;
constexpr int FILTER_SIZE = 12;
constexpr int HALF_FILTER_SIZE = 6;
constexpr int UPSAMPLE_REPLICATION_PAD = 5; // 5 on each side, matching torch impl
constexpr int DOWNSAMPLE_REPLICATION_PAD_LEFT = 5; // matching torch impl
constexpr int DOWNSAMPLE_REPLICATION_PAD_RIGHT = 6; // matching torch impl
// Fused anti-aliased snake activation: 2x upsampling FIR -> snake -> 2x downsampling FIR.
//
// Each thread produces BUFFER_SIZE consecutive output samples of one (batch, channel) row;
// the row is selected by blockIdx.y (channel) and blockIdx.z (batch), and the position
// inside the row by blockIdx.x and threadIdx.x.
//
//   dst, src          output / input, addressed as rows of seq_len samples per (batch, channel)
//   up_ftr, down_ftr  FILTER_SIZE filter taps each
//   alpha, beta       one value per channel (indexed by blockIdx.y); expf() is applied here
//   batch_size, channels  not read in the body (gridDim / blockIdx are used instead)
//   acc_t             not used in the body; the accumulators are declared as input_t
//
// The literal 128 used in the offset computations is the number of threads per block and
// has to agree with the launch configuration.
template <typename input_t, typename output_t, typename acc_t>
__global__ void anti_alias_activation_forward(
output_t *dst,
const input_t *src,
const input_t *up_ftr,
const input_t *down_ftr,
const input_t *alpha,
const input_t *beta,
int batch_size,
int channels,
int seq_len)
{
// Up and downsample filters
input_t up_filter[FILTER_SIZE];
input_t down_filter[FILTER_SIZE];
// Load data from global memory including extra indices reserved for replication paddings
input_t elements[2 * FILTER_SIZE + 2 * BUFFER_SIZE + 2 * UPSAMPLE_REPLICATION_PAD] = {0};
input_t intermediates[2 * FILTER_SIZE + 2 * BUFFER_SIZE + DOWNSAMPLE_REPLICATION_PAD_LEFT + DOWNSAMPLE_REPLICATION_PAD_RIGHT] = {0};
// Output stores downsampled output before writing to dst
output_t output[BUFFER_SIZE];
// blockDim/threadIdx = (128, 1, 1)
// gridDim/blockIdx = (seq_blocks, channels, batches)
// block_offset: start of this block's segment = row start + blockIdx.x * (128 * BUFFER_SIZE)
int block_offset = (blockIdx.x * 128 * BUFFER_SIZE + seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
int local_offset = threadIdx.x * BUFFER_SIZE;
// seq_offset: index of this thread's first sample within its row
int seq_offset = blockIdx.x * 128 * BUFFER_SIZE + local_offset;
// intermediate have double the seq_len
int intermediate_local_offset = threadIdx.x * BUFFER_SIZE * 2;
int intermediate_seq_offset = blockIdx.x * 128 * BUFFER_SIZE * 2 + intermediate_local_offset;
// Get values needed for replication padding before moving pointer
// (despite its name, right_most_pntr points at the START of the current row)
const input_t *right_most_pntr = src + (seq_len * (blockIdx.y + gridDim.y * blockIdx.z));
input_t seq_left_most_value = right_most_pntr[0];
input_t seq_right_most_value = right_most_pntr[seq_len - 1];
// Move src and dst pointers
src += block_offset + local_offset;
dst += block_offset + local_offset;
// Alpha and beta values for snake activations. Applies exp by default
alpha = alpha + blockIdx.y;
input_t alpha_val = expf(alpha[0]);
beta = beta + blockIdx.y;
input_t beta_val = expf(beta[0]);
// Copy the filter taps into per-thread arrays
#pragma unroll
for (int it = 0; it < FILTER_SIZE; it += 1)
{
up_filter[it] = up_ftr[it];
down_filter[it] = down_ftr[it];
}
// Apply replication padding for upsampling, matching torch impl
// Samples are written to even slots only (odd slots keep their zero initialisation) and
// are multiplied by 2.
// NOTE(review): the factor 2 presumably compensates the gain lost by zero insertion at
// an upsampling ratio of 2 — confirm against the torch UpSample1d implementation.
#pragma unroll
for (int it = -HALF_FILTER_SIZE; it < BUFFER_SIZE + HALF_FILTER_SIZE; it += 1)
{
int element_index = seq_offset + it; // index for element
if ((element_index < 0) && (element_index >= -UPSAMPLE_REPLICATION_PAD))
{
elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_left_most_value;
}
if ((element_index >= seq_len) && (element_index < seq_len + UPSAMPLE_REPLICATION_PAD))
{
elements[2 * (HALF_FILTER_SIZE + it)] = 2 * seq_right_most_value;
}
if ((element_index >= 0) && (element_index < seq_len))
{
elements[2 * (HALF_FILTER_SIZE + it)] = 2 * src[it];
}
}
// Apply upsampling strided convolution and write to intermediates. It reserves DOWNSAMPLE_REPLICATION_PAD_LEFT for replication padding of the downsampling conv later
#pragma unroll
for (int it = 0; it < (2 * BUFFER_SIZE + 2 * FILTER_SIZE); it += 1)
{
input_t acc = 0.0;
int element_index = intermediate_seq_offset + it; // index for intermediate
#pragma unroll
for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
{
if ((element_index + f_idx) >= 0)
{
acc += up_filter[f_idx] * elements[it + f_idx];
}
}
intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] = acc;
}
// Apply activation function. It reserves DOWNSAMPLE_REPLICATION_PAD_LEFT and DOWNSAMPLE_REPLICATION_PAD_RIGHT for replication padding of the downsampling conv later
// In place: v += sin(v * alpha)^2 / (beta + no_div_by_zero)
double no_div_by_zero = 0.000000001;
#pragma unroll
for (int it = 0; it < 2 * BUFFER_SIZE + 2 * FILTER_SIZE; it += 1)
{
intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] += (1.0 / (beta_val + no_div_by_zero)) * sinf(intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] * alpha_val) * sinf(intermediates[it + DOWNSAMPLE_REPLICATION_PAD_LEFT] * alpha_val);
}
// Apply replication padding before downsampling conv from intermediates
// Left pad: repeat the first computed intermediate value
#pragma unroll
for (int it = 0; it < DOWNSAMPLE_REPLICATION_PAD_LEFT; it += 1)
{
intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT];
}
// Right pad: repeat the last computed intermediate value
#pragma unroll
for (int it = DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE; it < DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE + DOWNSAMPLE_REPLICATION_PAD_RIGHT; it += 1)
{
intermediates[it] = intermediates[DOWNSAMPLE_REPLICATION_PAD_LEFT + 2 * BUFFER_SIZE + 2 * FILTER_SIZE - 1];
}
// Apply downsample strided convolution (assuming stride=2) from intermediates
#pragma unroll
for (int it = 0; it < BUFFER_SIZE; it += 1)
{
input_t acc = 0.0;
#pragma unroll
for (int f_idx = 0; f_idx < FILTER_SIZE; f_idx += 1)
{
// Add constant DOWNSAMPLE_REPLICATION_PAD_RIGHT to match torch implementation
acc += down_filter[f_idx] * intermediates[it * 2 + f_idx + DOWNSAMPLE_REPLICATION_PAD_RIGHT];
}
output[it] = acc;
}
// Write output to dst
// Only positions that fall inside the row (element_index < seq_len) are stored
#pragma unroll
for (int it = 0; it < BUFFER_SIZE; it += ELEMENTS_PER_LDG_STG)
{
int element_index = seq_offset + it;
if (element_index < seq_len)
{
dst[it] = output[it];
}
}
}
// Host-side launcher for anti_alias_activation_forward on the current CUDA stream.
// The grid is (ceil(seq_len / 4096), channels, batch_size) with 128 threads per block;
// an empty sequence launches nothing.
template <typename input_t, typename output_t, typename acc_t>
void dispatch_anti_alias_activation_forward(
    output_t *dst,
    const input_t *src,
    const input_t *up_ftr,
    const input_t *down_ftr,
    const input_t *alpha,
    const input_t *beta,
    int batch_size,
    int channels,
    int seq_len)
{
    // Guard clause: nothing to compute for zero-length sequences.
    if (seq_len == 0)
    {
        return;
    }

    // Use 128 threads per block to maximize gpu utilization
    constexpr int threads_per_block = 128;
    // Samples of one row covered by a single block
    constexpr int seq_len_per_block = 4096;
    const int blocks_per_seq_len = (seq_len + seq_len_per_block - 1) / seq_len_per_block;

    const dim3 grid(blocks_per_seq_len, channels, batch_size);
    const dim3 block(threads_per_block, 1, 1);
    anti_alias_activation_forward<input_t, output_t, acc_t>
        <<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(dst, src, up_ftr, down_ftr, alpha, beta, batch_size, channels, seq_len);
}
}
// Entry point called from Python: runs the fused anti-alias activation on `input`.
//
//   input        CUDA tensor of shape [batches, channels, seq_len]
//   up_filter    FILTER_SIZE upsampling taps
//   down_filter  FILTER_SIZE downsampling taps
//   alpha, beta  log-scale snake parameters, at least one value per channel
//
// Returns a new tensor with the shape and dtype of `input`.
//
// The kernel reads every buffer through raw pointers typed as the input's scalar type and
// assumes densely packed rows, so the arguments are validated and, where needed, converted
// to the input's device/dtype and made contiguous before launch.
extern "C" torch::Tensor fwd_cuda(torch::Tensor const &input, torch::Tensor const &up_filter, torch::Tensor const &down_filter, torch::Tensor const &alpha, torch::Tensor const &beta)
{
    TORCH_CHECK(input.is_cuda(), "anti_alias_activation: input must be a CUDA tensor");
    TORCH_CHECK(input.dim() == 3,
                "anti_alias_activation: expected a 3-D input [batches, channels, seq_len], got ",
                input.dim(), " dimension(s)");

    // Input is a 3d tensor with dimensions [batches, channels, seq_len]
    const int batches = input.size(0);
    const int channels = input.size(1);
    const int seq_len = input.size(2);

    // The kernel unconditionally reads FILTER_SIZE taps and one alpha/beta per channel.
    TORCH_CHECK(up_filter.numel() == FILTER_SIZE && down_filter.numel() == FILTER_SIZE,
                "anti_alias_activation: up/down filters must have exactly ", FILTER_SIZE, " taps");
    TORCH_CHECK(alpha.numel() >= channels && beta.numel() >= channels,
                "anti_alias_activation: alpha and beta need one value per channel (", channels, ")");

    const auto dtype = input.scalar_type();
    const auto device = input.device();
    const torch::Tensor input_c = input.contiguous();
    const torch::Tensor up_filter_c = up_filter.to(device, dtype).contiguous();
    const torch::Tensor down_filter_c = down_filter.to(device, dtype).contiguous();
    const torch::Tensor alpha_c = alpha.to(device, dtype).contiguous();
    const torch::Tensor beta_c = beta.to(device, dtype).contiguous();

    // Output
    auto act_options = input_c.options().requires_grad(false);
    torch::Tensor anti_alias_activation_results =
        torch::empty({batches, channels, seq_len}, act_options);

    // A zero-sized batch/channel/sequence dimension would yield an invalid launch grid.
    if (input_c.numel() == 0)
    {
        return anti_alias_activation_results;
    }

    void *input_ptr = input_c.data_ptr();
    void *up_filter_ptr = up_filter_c.data_ptr();
    void *down_filter_ptr = down_filter_c.data_ptr();
    void *alpha_ptr = alpha_c.data_ptr();
    void *beta_ptr = beta_c.data_ptr();
    void *anti_alias_activation_results_ptr = anti_alias_activation_results.data_ptr();

    DISPATCH_FLOAT_HALF_AND_BFLOAT(
        dtype,
        "dispatch anti alias activation_forward",
        dispatch_anti_alias_activation_forward<scalar_t, scalar_t, float>(
            reinterpret_cast<scalar_t *>(anti_alias_activation_results_ptr),
            reinterpret_cast<const scalar_t *>(input_ptr),
            reinterpret_cast<const scalar_t *>(up_filter_ptr),
            reinterpret_cast<const scalar_t *>(down_filter_ptr),
            reinterpret_cast<const scalar_t *>(alpha_ptr),
            reinterpret_cast<const scalar_t *>(beta_ptr),
            batches,
            channels,
            seq_len););
    return anti_alias_activation_results;
}
\ No newline at end of file
/* coding=utf-8
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*This code is copied from NVIDIA apex:
* https://github.com/NVIDIA/apex
* with minor changes. */
// Fall back to AT_CHECK when the included headers do not define TORCH_CHECK.
#ifndef TORCH_CHECK
#define TORCH_CHECK AT_CHECK
#endif
// DATA_PTR expands to the tensor accessor name: `data_ptr` when the build defines
// VERSION_GE_1_3, `data` otherwise.
#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.
import os
import pathlib
import subprocess
from torch.utils import cpp_extension
"""
Setting this param to a list has a problem of generating different compilation commands (with different order of architectures) and leading to recompilation of fused kernels.
Set it to empty string to avoid recompilation and assign arch flags explicitly in extra_cuda_cflags below
"""
# Module-level side effect: blank the arch list for this process; the target
# architectures are passed explicitly through extra_cuda_cflags in load().
os.environ["TORCH_CUDA_ARCH_LIST"] = ""
def load():
    """Build (or reuse the build of) the fused anti-alias activation extension.

    The extension is compiled with ``torch.utils.cpp_extension.load`` from the
    ``.cpp``/``.cu`` sources next to this file into a ``build`` sub-directory.
    An sm_70 target is always requested; an sm_80 target is added when the
    installed CUDA toolkit is version 11 or newer.

    Returns:
        The loaded ``anti_alias_activation_cuda`` extension module.
    """
    # Check if cuda 11 is installed for compute capability 8.0
    _, cuda_major, _ = _get_cuda_bare_metal_version(cpp_extension.CUDA_HOME)
    if int(cuda_major) >= 11:
        sm80_flags = ["-gencode", "arch=compute_80,code=sm_80"]
    else:
        sm80_flags = []

    # Build path
    source_dir = pathlib.Path(__file__).parent.absolute()
    build_dir = source_dir / "build"
    _create_build_dir(build_dir)

    base_cuda_flags = [
        "-O3",
        "-gencode",
        "arch=compute_70,code=sm_70",
        "--use_fast_math",
    ]
    half_and_lambda_flags = [
        "-U__CUDA_NO_HALF_OPERATORS__",
        "-U__CUDA_NO_HALF_CONVERSIONS__",
        "--expt-relaxed-constexpr",
        "--expt-extended-lambda",
    ]

    return cpp_extension.load(
        name="anti_alias_activation_cuda",
        sources=[
            source_dir / "anti_alias_activation.cpp",
            source_dir / "anti_alias_activation_cuda.cu",
        ],
        build_directory=build_dir,
        extra_cflags=["-O3"],
        extra_cuda_cflags=base_cuda_flags + half_and_lambda_flags + sm80_flags,
        verbose=True,
    )
def _get_cuda_bare_metal_version(cuda_dir):
raw_output = subprocess.check_output(
[cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
)
output = raw_output.split()
release_idx = output.index("release") + 1
release = output[release_idx].split(".")
bare_metal_major = release[0]
bare_metal_minor = release[1][0]
return raw_output, bare_metal_major, bare_metal_minor
def _create_build_dir(buildpath):
try:
os.mkdir(buildpath)
except OSError:
if not os.path.isdir(buildpath):
print(f"Creation of the build directory {buildpath} failed")
/* coding=utf-8
* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <ATen/ATen.h>
#include "compat.h"
// Runtime-to-compile-time dtype dispatch.
// Switches on TYPE (an at::ScalarType); for Float, Half and BFloat16 it defines the alias
// `scalar_t` (float, at::Half, at::BFloat16 respectively) and then executes __VA_ARGS__
// inside that scope. Any other type reports an error through AT_ERROR, using NAME in the
// message.
#define DISPATCH_FLOAT_HALF_AND_BFLOAT(TYPE, NAME, ...) \
switch (TYPE) \
{ \
case at::ScalarType::Float: \
{ \
using scalar_t = float; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Half: \
{ \
using scalar_t = at::Half; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: \
{ \
using scalar_t = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
}
// Two-type variant of the dispatch macro: defines `scalar_t_in` and `scalar_t_out` and
// then executes __VA_ARGS__.
//  - TYPEIN == Float: the output type is chosen by a nested switch on TYPEOUT
//    (Float, Half or BFloat16); other TYPEOUT values report an error through AT_ERROR.
//  - TYPEIN == Half or BFloat16: scalar_t_out is set to the same type as scalar_t_in and
//    TYPEOUT is not consulted.
//  - Any other TYPEIN reports an error through AT_ERROR.
#define DISPATCH_FLOAT_HALF_AND_BFLOAT_INOUT_TYPES(TYPEIN, TYPEOUT, NAME, ...) \
switch (TYPEIN) \
{ \
case at::ScalarType::Float: \
{ \
using scalar_t_in = float; \
switch (TYPEOUT) \
{ \
case at::ScalarType::Float: \
{ \
using scalar_t_out = float; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::Half: \
{ \
using scalar_t_out = at::Half; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: \
{ \
using scalar_t_out = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPEOUT), "'"); \
} \
break; \
} \
case at::ScalarType::Half: \
{ \
using scalar_t_in = at::Half; \
using scalar_t_out = at::Half; \
__VA_ARGS__; \
break; \
} \
case at::ScalarType::BFloat16: \
{ \
using scalar_t_in = at::BFloat16; \
using scalar_t_out = at::BFloat16; \
__VA_ARGS__; \
break; \
} \
default: \
AT_ERROR(#NAME, " not implemented for '", toString(TYPEIN), "'"); \
}
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
# LICENSE is in incl_licenses directory.
from .filter import *
from .resample import *
from .act import *
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
# LICENSE is in incl_licenses directory.
import torch.nn as nn
from .resample import UpSample1d, DownSample1d
class Activation1d(nn.Module):
    """Anti-aliased activation: upsample, apply ``activation``, downsample again."""

    def __init__(
        self,
        activation,
        up_ratio: int = 2,
        down_ratio: int = 2,
        up_kernel_size: int = 12,
        down_kernel_size: int = 12,
    ):
        super().__init__()
        self.up_ratio = up_ratio
        self.down_ratio = down_ratio
        self.act = activation
        self.upsample = UpSample1d(up_ratio, up_kernel_size)
        self.downsample = DownSample1d(down_ratio, down_kernel_size)

    def forward(self, x):
        """Apply upsample -> activation -> downsample to x of shape [B,C,T]."""
        upsampled = self.upsample(x)
        activated = self.act(upsampled)
        return self.downsample(activated)
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
# LICENSE is in incl_licenses directory.
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
# Prefer the native implementation when this torch build provides one.
if hasattr(torch, "sinc"):
    sinc = torch.sinc
else:
    # This code is adopted from adefossez's julius.core.sinc under the MIT License
    # https://adefossez.github.io/julius/julius/core.html
    # LICENSE is in incl_licenses directory.
    def sinc(x: torch.Tensor):
        """
        Normalized sinc, i.e. sin(pi * x) / (pi * x), with sinc(0) defined as 1.
        __Warning__: Different to julius.sinc, the input is multiplied by `pi`!
        """
        one = torch.tensor(1.0, device=x.device, dtype=x.dtype)
        ratio = torch.sin(math.pi * x) / math.pi / x
        return torch.where(x == 0, one, ratio)
# This code is adopted from adefossez's julius.lowpass.LowPassFilters under the MIT License
# https://adefossez.github.io/julius/julius/lowpass.html
# LICENSE is in incl_licenses directory.
def kaiser_sinc_filter1d(
    cutoff, half_width, kernel_size
):  # return filter [1,1,kernel_size]
    """Build a Kaiser-windowed sinc low-pass kernel.

    Args:
        cutoff: Cutoff frequency; the taps are ``2 * cutoff * sinc(2 * cutoff * t)``
            multiplied by the window. A cutoff of 0 yields an all-zero kernel.
        half_width: Transition half-width; it only influences the Kaiser
            window's ``beta`` parameter.
        kernel_size: Number of taps (even or odd).

    Returns:
        A tensor of shape ``[1, 1, kernel_size]``. For a non-zero cutoff the
        taps are normalized so that they sum to 1.
    """
    is_even = kernel_size % 2 == 0
    half_size = kernel_size // 2

    # For kaiser window
    delta_f = 4 * half_width
    A = 2.285 * (half_size - 1) * math.pi * delta_f + 7.95
    if A > 50.0:
        beta = 0.1102 * (A - 8.7)
    elif A >= 21.0:
        beta = 0.5842 * (A - 21) ** 0.4 + 0.07886 * (A - 21.0)
    else:
        beta = 0.0
    window = torch.kaiser_window(kernel_size, beta=beta, periodic=False)

    if cutoff == 0:
        # Nothing passes. Return the zero kernel directly: normalizing it
        # would divide by a zero sum, and it must still have the documented
        # shape and a floating dtype.
        return torch.zeros_like(window).view(1, 1, kernel_size)

    # ratio = 0.5/cutoff -> 2 * cutoff = 1 / ratio
    if is_even:
        # Even length: taps sit on half-integer positions around the centre.
        taps = torch.arange(-half_size, half_size) + 0.5
    else:
        taps = torch.arange(kernel_size) - half_size
    kernel = 2 * cutoff * window * sinc(2 * cutoff * taps)

    # Normalize filter to have sum = 1, otherwise we will have a small leakage
    # of the constant component in the input signal.
    kernel /= kernel.sum()
    return kernel.view(1, 1, kernel_size)
class LowPassFilter1d(nn.Module):
    """Depthwise 1-D low-pass filter using a Kaiser-windowed sinc kernel."""

    def __init__(
        self,
        cutoff=0.5,
        half_width=0.6,
        stride: int = 1,
        padding: bool = True,
        padding_mode: str = "replicate",
        kernel_size: int = 12,
    ):
        """
        kernel_size should be even number for stylegan3 setup, in this implementation, odd number is also possible.
        """
        super().__init__()
        if cutoff < -0.0:
            raise ValueError("Minimum cutoff must be larger than zero.")
        if cutoff > 0.5:
            raise ValueError("A cutoff above 0.5 does not make sense.")

        half = kernel_size // 2
        self.kernel_size = kernel_size
        self.even = kernel_size % 2 == 0
        # An even kernel pads one sample less on the left than on the right.
        self.pad_left = half - int(self.even)
        self.pad_right = half
        self.stride = stride
        self.padding = padding
        self.padding_mode = padding_mode
        self.register_buffer(
            "filter", kaiser_sinc_filter1d(cutoff, half_width, kernel_size)
        )

    # Input [B, C, T]
    def forward(self, x):
        """Optionally pad ``x``, then convolve every channel with the shared kernel."""
        _, channels, _ = x.shape
        if self.padding:
            x = F.pad(x, (self.pad_left, self.pad_right), mode=self.padding_mode)
        # One copy of the kernel per channel; groups=channels keeps channels independent.
        per_channel_kernel = self.filter.expand(channels, -1, -1)
        return F.conv1d(x, per_channel_kernel, stride=self.stride, groups=channels)
# Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
# LICENSE is in incl_licenses directory.
import torch.nn as nn
from torch.nn import functional as F
from .filter import LowPassFilter1d
from .filter import kaiser_sinc_filter1d
class UpSample1d(nn.Module):
    """Upsample a [B, C, T] signal by ``ratio`` with a low-pass transposed convolution."""

    def __init__(self, ratio=2, kernel_size=None):
        """Derive the padding/cropping amounts and register the low-pass kernel."""
        super().__init__()
        self.ratio = ratio
        if kernel_size is None:
            self.kernel_size = int(6 * ratio // 2) * 2
        else:
            self.kernel_size = kernel_size
        self.stride = ratio
        self.pad = self.kernel_size // ratio - 1

        # Number of output samples cropped from each side in forward().
        overlap = self.kernel_size - self.stride
        padded_span = self.pad * self.stride
        self.pad_left = padded_span + overlap // 2
        self.pad_right = padded_span + (overlap + 1) // 2

        self.register_buffer(
            "filter",
            kaiser_sinc_filter1d(
                cutoff=0.5 / ratio,
                half_width=0.6 / ratio,
                kernel_size=self.kernel_size,
            ),
        )

    # x: [B, C, T]
    def forward(self, x):
        """Pad, apply the per-channel transposed convolution, rescale, and crop."""
        _, channels, _ = x.shape
        padded = F.pad(x, (self.pad, self.pad), mode="replicate")
        per_channel_kernel = self.filter.expand(channels, -1, -1)
        upsampled = self.ratio * F.conv_transpose1d(
            padded, per_channel_kernel, stride=self.stride, groups=channels
        )
        return upsampled[..., self.pad_left : -self.pad_right]
class DownSample1d(nn.Module):
    """Downsample a signal by ``ratio`` using a strided ``LowPassFilter1d``."""

    def __init__(self, ratio=2, kernel_size=None):
        """Pick the kernel size (derived from ``ratio`` when omitted) and build the filter."""
        super().__init__()
        self.ratio = ratio
        if kernel_size is None:
            self.kernel_size = int(6 * ratio // 2) * 2
        else:
            self.kernel_size = kernel_size
        self.lowpass = LowPassFilter1d(
            cutoff=0.5 / ratio,
            half_width=0.6 / ratio,
            stride=ratio,
            kernel_size=self.kernel_size,
        )

    def forward(self, x):
        """Return ``x`` after the strided low-pass filter."""
        return self.lowpass(x)
# Copyright (c) 2024 NVIDIA CORPORATION.
# Licensed under the MIT license.
# Adapted from https://github.com/jik876/hifi-gan under the MIT license.
# LICENSE is in incl_licenses directory.
import os
import json
from pathlib import Path
from typing import Optional, Union, Dict
import torch
import torch.nn as nn
from torch.nn import Conv1d, ConvTranspose1d
from torch.nn.utils import weight_norm, remove_weight_norm
from .activations import Snake, SnakeBeta
from .utils import init_weights, get_padding
from .alias_free_activation.torch.act import Activation1d as TorchActivation1d
from .utils import AttrDict
from huggingface_hub import PyTorchModelHubMixin, hf_hub_download
def load_hparams_from_json(path) -> AttrDict:
    """Read the JSON file at ``path`` and return its contents wrapped in an ``AttrDict``."""
    with open(path) as handle:
        return AttrDict(json.load(handle))
class AMPBlock1(torch.nn.Module):
    """
    Residual block built from Snake / SnakeBeta activations (trainable, per layer) and weight-normalized convolutions.
    Every dilated convolution in self.convs1 is followed by an extra convolution in self.convs2 with a fixed dilation=1.

    Args:
        h (AttrDict): Hyperparameters.
        channels (int): Number of convolution channels.
        kernel_size (int): Size of the convolution kernel. Default is 3.
        dilation (tuple): Dilation rates for the convolutions. Each dilation layer has two convolutions. Default is (1, 3, 5).
        activation (str): Activation function type. Should be either 'snake' or 'snakebeta'. Default is None.
    """

    def __init__(
        self,
        h: AttrDict,
        channels: int,
        kernel_size: int = 3,
        dilation: tuple = (1, 3, 5),
        activation: str = None,
    ):
        super().__init__()
        self.h = h

        # First convolution of each pair: one per dilation rate.
        dilated_convs = []
        for rate in dilation:
            conv = Conv1d(
                channels,
                channels,
                kernel_size,
                stride=1,
                dilation=rate,
                padding=get_padding(kernel_size, rate),
            )
            dilated_convs.append(weight_norm(conv))
        self.convs1 = nn.ModuleList(dilated_convs)
        self.convs1.apply(init_weights)

        # Second convolution of each pair: always dilation=1.
        plain_convs = []
        for _ in range(len(dilation)):
            conv = Conv1d(
                channels,
                channels,
                kernel_size,
                stride=1,
                dilation=1,
                padding=get_padding(kernel_size, 1),
            )
            plain_convs.append(weight_norm(conv))
        self.convs2 = nn.ModuleList(plain_convs)
        self.convs2.apply(init_weights)

        # Total number of conv layers
        self.num_layers = len(self.convs1) + len(self.convs2)

        # Select which Activation1d, lazy-load cuda version to ensure backward compatibility
        if self.h.get("use_cuda_kernel", False):
            from .alias_free_activation.cuda.activation1d import (
                Activation1d as CudaActivation1d,
            )

            Activation1d = CudaActivation1d
        else:
            Activation1d = TorchActivation1d

        # Activation functions: one anti-aliased activation per conv layer.
        if activation == "snake":
            periodic_cls = Snake
        elif activation == "snakebeta":
            periodic_cls = SnakeBeta
        else:
            raise NotImplementedError(
                "activation incorrectly specified. check the config file and look for 'activation'."
            )
        wrapped = []
        for _ in range(self.num_layers):
            periodic = periodic_cls(channels, alpha_logscale=h.snake_logscale)
            wrapped.append(Activation1d(activation=periodic))
        self.activations = nn.ModuleList(wrapped)

    def forward(self, x):
        """Run each (activation, conv, activation, conv) stage and add it to its input."""
        for idx, (conv_a, conv_b) in enumerate(zip(self.convs1, self.convs2)):
            # Even-indexed activations precede convs1, odd-indexed precede convs2.
            act_a = self.activations[2 * idx]
            act_b = self.activations[2 * idx + 1]
            branch = conv_a(act_a(x))
            branch = conv_b(act_b(branch))
            x = branch + x
        return x

    def remove_weight_norm(self):
        """Strip weight normalization from every convolution in the block."""
        for conv in self.convs1:
            remove_weight_norm(conv)
        for conv in self.convs2:
            remove_weight_norm(conv)
class AMPBlock2(torch.nn.Module):
    """
    AMPBlock applies Snake / SnakeBeta activation functions with trainable parameters that control periodicity, defined for each layer.
    Unlike AMPBlock1, AMPBlock2 does not contain extra Conv1d layers with fixed dilation=1

    Args:
        h (AttrDict): Hyperparameters.
        channels (int): Number of convolution channels.
        kernel_size (int): Size of the convolution kernel. Default is 3.
        dilation (tuple): Dilation rates for the convolutions. One convolution is created per dilation rate. Default is (1, 3, 5).
        activation (str): Activation function type. Should be either 'snake' or 'snakebeta'. Default is None.

    Raises:
        NotImplementedError: If ``activation`` is neither 'snake' nor 'snakebeta'.
    """

    def __init__(
        self,
        h: AttrDict,
        channels: int,
        kernel_size: int = 3,
        dilation: tuple = (1, 3, 5),
        activation: str = None,
    ):
        super().__init__()
        self.h = h

        self.convs = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        stride=1,
                        dilation=d,
                        padding=get_padding(kernel_size, d),
                    )
                )
                for d in dilation
            ]
        )
        self.convs.apply(init_weights)

        self.num_layers = len(self.convs)  # Total number of conv layers

        # Select which Activation1d, lazy-load cuda version to ensure backward compatibility
        if self.h.get("use_cuda_kernel", False):
            from .alias_free_activation.cuda.activation1d import (
                Activation1d as CudaActivation1d,
            )

            Activation1d = CudaActivation1d
        else:
            Activation1d = TorchActivation1d

        # Activation functions
        if activation == "snake":
            self.activations = nn.ModuleList(
                [
                    Activation1d(
                        activation=Snake(channels, alpha_logscale=h.snake_logscale)
                    )
                    for _ in range(self.num_layers)
                ]
            )
        elif activation == "snakebeta":
            self.activations = nn.ModuleList(
                [
                    Activation1d(
                        activation=SnakeBeta(channels, alpha_logscale=h.snake_logscale)
                    )
                    for _ in range(self.num_layers)
                ]
            )
        else:
            raise NotImplementedError(
                "activation incorrectly specified. check the config file and look for 'activation'."
            )

    def forward(self, x):
        """Apply each (activation, conv) pair as a residual update and return the result."""
        for c, a in zip(self.convs, self.activations):
            xt = a(x)
            xt = c(xt)
            x = xt + x
        # The result must be returned: without it the block yields None and
        # any caller that sums resblock outputs fails.
        return x

    def remove_weight_norm(self):
        """Strip weight normalization from every convolution in the block."""
        for l in self.convs:
            remove_weight_norm(l)
class BigVGAN(
    torch.nn.Module,
    PyTorchModelHubMixin,
    library_name="bigvgan",
    repo_url="https://github.com/NVIDIA/BigVGAN",
    docs_url="https://github.com/NVIDIA/BigVGAN/blob/main/README.md",
    pipeline_tag="audio-to-audio",
    license="mit",
    tags=["neural-vocoder", "audio-generation", "arxiv:2206.04658"],
):
    """
    BigVGAN is a neural vocoder model that applies anti-aliased periodic activation for residual blocks (resblocks).
    New in BigVGAN-v2: it can optionally use optimized CUDA kernels for AMP (anti-aliased multi-periodicity) blocks.

    Args:
        h (AttrDict): Hyperparameters.
        use_cuda_kernel (bool): If set to True, loads optimized CUDA kernels for AMP. This should be used for inference only, as training is not supported with CUDA kernels.

    Note:
        - The `use_cuda_kernel` parameter should be used for inference only, as training with CUDA kernels is not supported.
        - Ensure that the activation function is correctly specified in the hyperparameters (h.activation).
    """

    def __init__(self, h: AttrDict, use_cuda_kernel: bool = False):
        super().__init__()
        self.h = h
        self.h["use_cuda_kernel"] = use_cuda_kernel

        # Select which Activation1d, lazy-load cuda version to ensure backward compatibility
        if self.h.get("use_cuda_kernel", False):
            from .alias_free_activation.cuda.activation1d import (
                Activation1d as CudaActivation1d,
            )

            Activation1d = CudaActivation1d
        else:
            Activation1d = TorchActivation1d

        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)

        # Pre-conv
        pre_conv = Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3)
        self.conv_pre = weight_norm(pre_conv)

        # Define which AMPBlock to use. BigVGAN uses AMPBlock1 as default
        if h.resblock == "1":
            resblock_class = AMPBlock1
        elif h.resblock == "2":
            resblock_class = AMPBlock2
        else:
            raise ValueError(
                f"Incorrect resblock class specified in hyperparameters. Got {h.resblock}"
            )

        # Transposed conv-based upsamplers. does not apply anti-aliasing
        self.ups = nn.ModuleList()
        stage_specs = zip(h.upsample_rates, h.upsample_kernel_sizes)
        for stage, (rate, kernel) in enumerate(stage_specs):
            # The channel count halves at every upsampling stage.
            transposed = ConvTranspose1d(
                h.upsample_initial_channel // (2**stage),
                h.upsample_initial_channel // (2 ** (stage + 1)),
                kernel,
                rate,
                padding=(kernel - rate) // 2,
            )
            self.ups.append(nn.ModuleList([weight_norm(transposed)]))

        # Residual blocks using anti-aliased multi-periodicity composition modules (AMP)
        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = h.upsample_initial_channel // (2 ** (stage + 1))
            for kernel, dilations in zip(
                h.resblock_kernel_sizes, h.resblock_dilation_sizes
            ):
                self.resblocks.append(
                    resblock_class(h, ch, kernel, dilations, activation=h.activation)
                )

        # Post-conv; `ch` is the channel count of the last upsampling stage.
        if h.activation == "snake":
            activation_post = Snake(ch, alpha_logscale=h.snake_logscale)
        elif h.activation == "snakebeta":
            activation_post = SnakeBeta(ch, alpha_logscale=h.snake_logscale)
        else:
            raise NotImplementedError(
                "activation incorrectly specified. check the config file and look for 'activation'."
            )
        self.activation_post = Activation1d(activation=activation_post)

        # Whether to use bias for the final conv_post. Default to True for backward compatibility
        self.use_bias_at_final = h.get("use_bias_at_final", True)
        post_conv = Conv1d(ch, 1, 7, 1, padding=3, bias=self.use_bias_at_final)
        self.conv_post = weight_norm(post_conv)

        # Weight initialization
        for up_stage in self.ups:
            up_stage.apply(init_weights)
        self.conv_post.apply(init_weights)

        # Final tanh activation. Defaults to True for backward compatibility
        self.use_tanh_at_final = h.get("use_tanh_at_final", True)

    def forward(self, x):
        """Run pre-conv, the upsampling/AMP stages, post-conv, and the final bounding step."""
        # Pre-conv
        x = self.conv_pre(x)

        for stage in range(self.num_upsamples):
            # Upsampling
            for up_layer in self.ups[stage]:
                x = up_layer(x)

            # AMP blocks: average the outputs of this stage's resblocks.
            first_block = stage * self.num_kernels
            xs = None
            for offset in range(self.num_kernels):
                block_out = self.resblocks[first_block + offset](x)
                if xs is None:
                    xs = block_out
                else:
                    xs += block_out
            x = xs / self.num_kernels

        # Post-conv
        x = self.activation_post(x)
        x = self.conv_post(x)

        # Final tanh activation
        if self.use_tanh_at_final:
            return torch.tanh(x)
        return torch.clamp(x, min=-1.0, max=1.0)  # Bound the output to [-1, 1]

    def remove_weight_norm(self):
        """Remove weight normalization from all layers; a ValueError is reported and ignored."""
        try:
            print("Removing weight norm...")
            for up_stage in self.ups:
                for up_layer in up_stage:
                    remove_weight_norm(up_layer)
            for block in self.resblocks:
                block.remove_weight_norm()
            remove_weight_norm(self.conv_pre)
            remove_weight_norm(self.conv_post)
        except ValueError:
            print("[INFO] Model already removed weight norm. Skipping!")

    # Additional methods for huggingface_hub support
    def _save_pretrained(self, save_directory: Path) -> None:
        """Save weights and config.json from a Pytorch model to a local directory."""
        weights_path = save_directory / "bigvgan_generator.pt"
        torch.save({"generator": self.state_dict()}, weights_path)

        hparams_path = save_directory / "config.json"
        with open(hparams_path, "w") as config_file:
            json.dump(self.h, config_file, indent=4)

    @classmethod
    def _from_pretrained(
        cls,
        *,
        model_id: str,
        revision: str,
        cache_dir: str,
        force_download: bool,
        proxies: Optional[Dict],
        resume_download: bool,
        local_files_only: bool,
        token: Union[str, bool, None],
        map_location: str = "cpu",  # Additional argument
        strict: bool = False,  # Additional argument
        use_cuda_kernel: bool = False,
        **model_kwargs,
    ):
        """Load Pytorch pretrained weights and return the loaded model."""
        # Options shared by both hub downloads (config and weights).
        hub_options = dict(
            revision=revision,
            cache_dir=cache_dir,
            force_download=force_download,
            proxies=proxies,
            resume_download=resume_download,
            token=token,
            local_files_only=local_files_only,
        )

        # Download and load hyperparameters (h) used by BigVGAN
        if os.path.isdir(model_id):
            print("Loading config.json from local directory")
            config_file = os.path.join(model_id, "config.json")
        else:
            config_file = hf_hub_download(
                repo_id=model_id, filename="config.json", **hub_options
            )
        h = load_hparams_from_json(config_file)

        # instantiate BigVGAN using h
        if use_cuda_kernel:
            print(
                "[WARNING] You have specified use_cuda_kernel=True during BigVGAN.from_pretrained(). Only inference is supported (training is not implemented)!"
            )
            print(
                "[WARNING] You need nvcc and ninja installed in your system that matches your PyTorch build is using to build the kernel. If not, the model will fail to initialize or generate incorrect waveform!"
            )
            print(
                "[WARNING] For detail, see the official GitHub repository: https://github.com/NVIDIA/BigVGAN?tab=readme-ov-file#using-custom-cuda-kernel-for-synthesis"
            )
        model = cls(h, use_cuda_kernel=use_cuda_kernel)

        # Download and load pretrained generator weight
        if os.path.isdir(model_id):
            print("Loading weights from local directory")
            model_file = os.path.join(model_id, "bigvgan_generator.pt")
        else:
            print(f"Loading weights from {model_id}")
            model_file = hf_hub_download(
                repo_id=model_id, filename="bigvgan_generator.pt", **hub_options
            )

        checkpoint_dict = torch.load(
            model_file, map_location=map_location, weights_only=True
        )
        generator_state = checkpoint_dict["generator"]

        try:
            model.load_state_dict(generator_state)
        except RuntimeError:
            # Retry once with weight norm stripped from the model.
            print(
                "[INFO] the pretrained checkpoint does not contain weight norm. Loading the checkpoint after removing weight norm!"
            )
            model.remove_weight_norm()
            model.load_state_dict(generator_state)

        return model
from librosa.filters import mel as librosa_mel_fn
import torch
import os
# Mel filterbank tensors created by get_melspec, keyed by a string built from
# its STFT/mel parameters and the input tensor's device.
mel_basis_cache = {}
# Hann window tensors created by get_melspec, stored under the same keys as
# mel_basis_cache.
hann_window_cache = {}
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Return ``log(clamp(x, min=clip_val) * C)`` element-wise."""
    floored = torch.clamp(x, min=clip_val)
    return torch.log(floored * C)
def spectral_normalize_torch(magnitudes):
    """Apply ``dynamic_range_compression_torch`` with its default settings."""
    compressed = dynamic_range_compression_torch(magnitudes)
    return compressed
def get_melspec(
    y: torch.Tensor,
    n_fft: int,
    num_mels: int,
    sampling_rate: int,
    hop_size: int,
    win_size: int,
    fmin: int,
    fmax: int = None,
    center: bool = False,
) -> torch.Tensor:
    """
    Compute the log-compressed mel spectrogram of a waveform.
    The mel filterbank comes from librosa.filters.mel (slaney norm) and the STFT uses a Hann window (torch.stft).
    Filterbanks and windows are cached per parameter set and device.

    Args:
        y (torch.Tensor): Input signal.
        n_fft (int): FFT size.
        num_mels (int): Number of mel bins.
        sampling_rate (int): Sampling rate of the input signal.
        hop_size (int): Hop size for STFT.
        win_size (int): Window size for STFT.
        fmin (int): Minimum frequency for mel filterbank.
        fmax (int): Maximum frequency for mel filterbank. If None, defaults to half the sampling rate (fmax = sr / 2.0) inside librosa_mel_fn
        center (bool): Whether to pad the input to center the frames. Default is False.

    Returns:
        torch.Tensor: Mel spectrogram.
    """
    # Only warn about out-of-range samples; the signal itself is left untouched.
    lowest = torch.min(y)
    if lowest < -1.0:
        print(f"[WARNING] Min value of input waveform signal is {lowest}")
    highest = torch.max(y)
    if highest > 1.0:
        print(f"[WARNING] Max value of input waveform signal is {highest}")

    device = y.device
    cache_key = f"{n_fft}_{num_mels}_{sampling_rate}_{hop_size}_{win_size}_{fmin}_{fmax}_{device}"

    if cache_key not in mel_basis_cache:
        filterbank = librosa_mel_fn(
            sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
        )
        mel_basis_cache[cache_key] = torch.from_numpy(filterbank).float().to(device)
        hann_window_cache[cache_key] = torch.hann_window(win_size).to(device)

    mel_basis = mel_basis_cache[cache_key]
    hann_window = hann_window_cache[cache_key]

    # Reflect-pad both ends by (n_fft - hop_size) // 2 samples before the STFT.
    reflect_pad = (n_fft - hop_size) // 2
    padded = torch.nn.functional.pad(
        y.unsqueeze(1), (reflect_pad, reflect_pad), mode="reflect"
    )
    y = padded.squeeze(1)

    stft_out = torch.stft(
        y,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window,
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )
    # Magnitude, with a small epsilon added under the square root.
    power = torch.view_as_real(stft_out).pow(2).sum(-1)
    magnitude = torch.sqrt(power + 1e-9)

    mel_spec = torch.matmul(mel_basis, magnitude)
    return spectral_normalize_torch(mel_spec)
class AttrDict(dict):
    """Dictionary whose items are also reachable as attributes."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Point the instance namespace at the dict itself so that attribute
        # access and item access share one storage.
        self.__dict__ = self
def load_checkpoint(filepath, device):
    """Load the checkpoint at ``filepath`` onto ``device`` (weights only) and return it."""
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    loaded = torch.load(filepath, map_location=device, weights_only=True)
    print("Complete.")
    return loaded
def init_weights(m, mean=0.0, std=0.01):
    """Fill ``m.weight`` from N(mean, std) when the module's class name contains "Conv"."""
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)
def get_padding(kernel_size, dilation=1):
    """Return ``int((kernel_size * dilation - dilation) / 2)``, the padding for a dilated kernel."""
    dilated_extent = kernel_size * dilation - dilation
    return int(dilated_extent / 2)
Subproject commit eb00ce9142e8d98b0ed7c57cd47e0d6d5dce9a1a
import torch
import librosa
import os
from transformers import WhisperFeatureExtractor
from .glm4.speech_tokenizer.modeling_whisper import WhisperVQEncoder
from .glm4.speech_tokenizer.utils import extract_speech_token
from torch import nn
class Glm4Tokenizer(nn.Module):
    """Speech tokenizer built on a WhisperVQEncoder and its WhisperFeatureExtractor."""

    def __init__(self, tokenizer_path):
        """Load the encoder (in eval mode) and the feature extractor from ``tokenizer_path``."""
        super().__init__()
        self.whisper_model = WhisperVQEncoder.from_pretrained(tokenizer_path).eval()
        self.feature_extractor = WhisperFeatureExtractor.from_pretrained(tokenizer_path)

    def tokenize(self, speech=None, audio_path=None, sr=16000):
        """Tokenize an audio file or an in-memory waveform.

        When ``audio_path`` is truthy the file is loaded with librosa at
        16000 Hz and ``speech`` / ``sr`` are ignored. Otherwise ``speech``
        (a list or an object with ``shape`` and ``unsqueeze``) is used
        together with ``sr``. Returns the extracted tokens as a tensor with
        a leading dimension of size 1.
        """
        if audio_path:
            waveform, sr = librosa.load(audio_path, sr=16000)
            audio_info = (torch.tensor(waveform).unsqueeze(0), sr)
        else:
            assert speech is not None
            assert sr
            if isinstance(speech, list):
                speech = torch.tensor(speech).unsqueeze(0)
            # Give 1-D input a leading dimension as well.
            if len(speech.shape) == 1:
                speech = speech.unsqueeze(0)
            audio_info = (speech, sr)

        token_batches = extract_speech_token(
            self.whisper_model, self.feature_extractor, [audio_info]
        )
        return torch.tensor(token_batches[0]).unsqueeze(0)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment