Commit aa0c8efc authored by chenzk

v1.0

import argparse
import codecs
import os
import re
from importlib.resources import files
from pathlib import Path
import numpy as np
import soundfile as sf
import tomli
from cached_path import cached_path
from f5_tts.infer.utils_infer import (
infer_process,
load_model,
load_vocoder,
preprocess_ref_audio_text,
remove_silence_for_generated_wav,
)
from f5_tts.model import DiT, UNetT
parser = argparse.ArgumentParser(
prog="python3 infer-cli.py",
description="Commandline interface for E2/F5 TTS with Advanced Batch Processing.",
epilog="Specify options above to override one or more settings from config.",
)
parser.add_argument(
"-c",
"--config",
help="Configuration file. Default=infer/examples/basic/basic.toml",
default=os.path.join(files("f5_tts").joinpath("infer/examples/basic"), "basic.toml"),
)
parser.add_argument(
"-m",
"--model",
help="F5-TTS | E2-TTS",
)
parser.add_argument(
"-p",
"--ckpt_file",
help="The Checkpoint .pt",
)
parser.add_argument(
"-v",
"--vocab_file",
help="The vocab .txt",
)
parser.add_argument("-r", "--ref_audio", type=str, help="Reference audio file < 15 seconds.")
parser.add_argument("-s", "--ref_text", type=str, default="666", help="Subtitle for the reference audio.")
parser.add_argument(
"-t",
"--gen_text",
type=str,
help="Text to generate.",
)
parser.add_argument(
"-f",
"--gen_file",
type=str,
help="File with text to generate. Ignores --text",
)
parser.add_argument(
"-o",
"--output_dir",
type=str,
help="Path to output folder..",
)
parser.add_argument(
"--remove_silence",
action="store_true",
help="Remove silence.",
)
parser.add_argument("--vocoder_name", type=str, default="vocos", choices=["vocos", "bigvgan"], help="vocoder name")
parser.add_argument(
"--load_vocoder_from_local",
action="store_true",
help="load vocoder from local. Default: ../checkpoints/charactr/vocos-mel-24khz",
)
parser.add_argument(
"--speed",
type=float,
default=1.0,
help="Adjust the speed of the audio generation (default: 1.0)",
)
args = parser.parse_args()
config = tomli.load(open(args.config, "rb"))
ref_audio = args.ref_audio if args.ref_audio else config["ref_audio"]
ref_text = args.ref_text if args.ref_text != "666" else config["ref_text"]
gen_text = args.gen_text if args.gen_text else config["gen_text"]
gen_file = args.gen_file if args.gen_file else config["gen_file"]
# patches for pip pkg user
if "infer/examples/" in ref_audio:
ref_audio = str(files("f5_tts").joinpath(f"{ref_audio}"))
if "infer/examples/" in gen_file:
gen_file = str(files("f5_tts").joinpath(f"{gen_file}"))
if "voices" in config:
for voice in config["voices"]:
voice_ref_audio = config["voices"][voice]["ref_audio"]
if "infer/examples/" in voice_ref_audio:
config["voices"][voice]["ref_audio"] = str(files("f5_tts").joinpath(f"{voice_ref_audio}"))
if gen_file:
gen_text = codecs.open(gen_file, "r", "utf-8").read()
output_dir = args.output_dir if args.output_dir else config["output_dir"]
model = args.model if args.model else config["model"]
ckpt_file = args.ckpt_file if args.ckpt_file else ""
vocab_file = args.vocab_file if args.vocab_file else ""
remove_silence = args.remove_silence if args.remove_silence else config["remove_silence"]
speed = args.speed
wave_path = Path(output_dir) / "infer_cli_out.wav"
# spectrogram_path = Path(output_dir) / "infer_cli_out.png"
if args.vocoder_name == "vocos":
# vocoder_local_path = "../checkpoints/vocos-mel-24khz"
vocoder_local_path = "charactr/vocos-mel-24khz"
elif args.vocoder_name == "bigvgan":
vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
mel_spec_type = args.vocoder_name
vocoder = load_vocoder(vocoder_name=mel_spec_type, is_local=args.load_vocoder_from_local, local_path=vocoder_local_path)
# load models
if model == "F5-TTS":
model_cls = DiT
model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
if ckpt_file == "":
if args.vocoder_name == "vocos":
repo_name = "F5-TTS"
exp_name = "F5TTS_Base"
ckpt_step = 1200000
# ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
ckpt_file = str(cached_path(f"SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors"))
# ckpt_file = f"ckpts/{exp_name}/model_{ckpt_step}.pt" # .pt | .safetensors; local path
elif args.vocoder_name == "bigvgan":
repo_name = "F5-TTS"
exp_name = "F5TTS_Base_bigvgan"
ckpt_step = 1250000
ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.pt"))
elif model == "E2-TTS":
model_cls = UNetT
model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
if ckpt_file == "":
repo_name = "E2-TTS"
exp_name = "E2TTS_Base"
ckpt_step = 1200000
# ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
ckpt_file = str(cached_path(f"SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors"))
# ckpt_file = f"ckpts/{exp_name}/model_{ckpt_step}.pt" # .pt | .safetensors; local path
elif args.vocoder_name == "bigvgan": # TODO: need to test
repo_name = "F5-TTS"
exp_name = "F5TTS_Base_bigvgan"
ckpt_step = 1250000
ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.pt"))
print(f"Using {model}...")
ema_model = load_model(model_cls, model_cfg, ckpt_file, mel_spec_type=args.vocoder_name, vocab_file=vocab_file)
def main_process(ref_audio, ref_text, text_gen, model_obj, mel_spec_type, remove_silence, speed):
main_voice = {"ref_audio": ref_audio, "ref_text": ref_text}
if "voices" not in config:
voices = {"main": main_voice}
else:
voices = config["voices"]
voices["main"] = main_voice
for voice in voices:
voices[voice]["ref_audio"], voices[voice]["ref_text"] = preprocess_ref_audio_text(
voices[voice]["ref_audio"], voices[voice]["ref_text"]
)
print("Voice:", voice)
print("Ref_audio:", voices[voice]["ref_audio"])
print("Ref_text:", voices[voice]["ref_text"])
generated_audio_segments = []
reg1 = r"(?=\[\w+\])"
chunks = re.split(reg1, text_gen)
reg2 = r"\[(\w+)\]"
for text in chunks:
if not text.strip():
continue
match = re.match(reg2, text)
if match:
voice = match[1]
else:
print("No voice tag found, using main.")
voice = "main"
if voice not in voices:
print(f"Voice {voice} not found, using main.")
voice = "main"
text = re.sub(reg2, "", text)
gen_text = text.strip()
ref_audio = voices[voice]["ref_audio"]
ref_text = voices[voice]["ref_text"]
print(f"Voice: {voice}")
audio, final_sample_rate, spectrogram = infer_process(
ref_audio, ref_text, gen_text, model_obj, vocoder, mel_spec_type=mel_spec_type, speed=speed
)
generated_audio_segments.append(audio)
if generated_audio_segments:
final_wave = np.concatenate(generated_audio_segments)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
with open(wave_path, "wb") as f:
sf.write(f.name, final_wave, final_sample_rate)
# Remove silence
if remove_silence:
remove_silence_for_generated_wav(f.name)
print(f.name)
def main():
main_process(ref_audio, ref_text, gen_text, ema_model, mel_spec_type, remove_silence, speed)
if __name__ == "__main__":
main()
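# Example invocation (illustrative; the file names and output folder below are placeholders, not part of the repo):
#   python3 infer-cli.py -m F5-TTS -r ref.wav -s "transcript of the reference audio" \
#       -t "Text to synthesize." -o out_dir
# Any option not given on the command line falls back to the value in --config (basic.toml by default).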
import os
import torch
import torch.nn.functional as F
import torchaudio
from f5_tts.infer.utils_infer import load_checkpoint, load_vocoder, save_spectrogram
from f5_tts.model import CFM, DiT, UNetT
from f5_tts.model.utils import convert_char_to_pinyin, get_tokenizer
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
# --------------------- Dataset Settings -------------------- #
target_sample_rate = 24000
n_mel_channels = 100
hop_length = 256
win_length = 1024
n_fft = 1024
mel_spec_type = "vocos" # 'vocos' or 'bigvgan'
target_rms = 0.1
tokenizer = "pinyin"
dataset_name = "Emilia_ZH_EN"
# ---------------------- infer setting ---------------------- #
seed = None # int | None
exp_name = "F5TTS_Base" # F5TTS_Base | E2TTS_Base
ckpt_step = 1200000
nfe_step = 32 # 16, 32
cfg_strength = 2.0
ode_method = "euler" # euler | midpoint
sway_sampling_coef = -1.0
speed = 1.0
if exp_name == "F5TTS_Base":
model_cls = DiT
model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
elif exp_name == "E2TTS_Base":
model_cls = UNetT
model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.safetensors"
output_dir = "tests"
# [leverage https://github.com/MahmoudAshraf97/ctc-forced-aligner to get char level alignment]
# pip install git+https://github.com/MahmoudAshraf97/ctc-forced-aligner.git
# [write the origin_text into a file, e.g. tests/test_edit.txt]
# ctc-forced-aligner --audio_path "src/f5_tts/infer/examples/basic/basic_ref_en.wav" --text_path "tests/test_edit.txt" --language "zho" --romanize --split_size "char"
# [result will be saved at same path of audio file]
# [--language "zho" for Chinese, "eng" for English]
# [if local ckpt, set --alignment_model "../checkpoints/mms-300m-1130-forced-aligner"]
audio_to_edit = "src/f5_tts/infer/examples/basic/basic_ref_en.wav"
origin_text = "Some call me nature, others call me mother nature."
target_text = "Some call me optimist, others call me realist."
parts_to_edit = [
[1.42, 2.44],
[4.04, 4.9],
] # start & end times of "nature" & "mother nature", in seconds
fix_duration = [
1.2,
1,
] # fix duration for "optimist" & "realist", in seconds
# audio_to_edit = "src/f5_tts/infer/examples/basic/basic_ref_zh.wav"
# origin_text = "对,这就是我,万人敬仰的太乙真人。"
# target_text = "对,那就是你,万人敬仰的太白金星。"
# parts_to_edit = [[0.84, 1.4], [1.92, 2.4], [4.26, 6.26], ]
# fix_duration = None # use origin text duration
# -------------------------------------------------#
use_ema = True
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# Vocoder model
local = False
if mel_spec_type == "vocos":
vocoder_local_path = "../checkpoints/charactr/vocos-mel-24khz"
elif mel_spec_type == "bigvgan":
vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
vocoder = load_vocoder(vocoder_name=mel_spec_type, is_local=local, local_path=vocoder_local_path)
# Tokenizer
vocab_char_map, vocab_size = get_tokenizer(dataset_name, tokenizer)
# Model
model = CFM(
transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
mel_spec_kwargs=dict(
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
n_mel_channels=n_mel_channels,
target_sample_rate=target_sample_rate,
mel_spec_type=mel_spec_type,
),
odeint_kwargs=dict(
method=ode_method,
),
vocab_char_map=vocab_char_map,
).to(device)
dtype = torch.float32 if mel_spec_type == "bigvgan" else None
model = load_checkpoint(model, ckpt_path, device, dtype=dtype, use_ema=use_ema)
# Audio
audio, sr = torchaudio.load(audio_to_edit)
if audio.shape[0] > 1:
audio = torch.mean(audio, dim=0, keepdim=True)
rms = torch.sqrt(torch.mean(torch.square(audio)))
if rms < target_rms:
audio = audio * target_rms / rms
if sr != target_sample_rate:
resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
audio = resampler(audio)
offset = 0
audio_ = torch.zeros(1, 0)
edit_mask = torch.zeros(1, 0, dtype=torch.bool)
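# For each span to edit: append the original audio up to the span start plus silence (zeros) of the
# target duration to audio_, and extend edit_mask with True for mel frames to keep and False for
# frames to regenerate (one mask entry per hop_length samples).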
for part in parts_to_edit:
start, end = part
part_dur = end - start if fix_duration is None else fix_duration.pop(0)
part_dur = part_dur * target_sample_rate
start = start * target_sample_rate
audio_ = torch.cat((audio_, audio[:, round(offset) : round(start)], torch.zeros(1, round(part_dur))), dim=-1)
edit_mask = torch.cat(
(
edit_mask,
torch.ones(1, round((start - offset) / hop_length), dtype=torch.bool),
torch.zeros(1, round(part_dur / hop_length), dtype=torch.bool),
),
dim=-1,
)
offset = end * target_sample_rate
# audio = torch.cat((audio_, audio[:, round(offset):]), dim = -1)
edit_mask = F.pad(edit_mask, (0, audio.shape[-1] // hop_length - edit_mask.shape[-1] + 1), value=True)
audio = audio.to(device)
edit_mask = edit_mask.to(device)
# Text
text_list = [target_text]
if tokenizer == "pinyin":
final_text_list = convert_char_to_pinyin(text_list)
else:
final_text_list = [text_list]
print(f"text : {text_list}")
print(f"pinyin: {final_text_list}")
# Duration
ref_audio_len = 0
duration = audio.shape[-1] // hop_length
# Inference
with torch.inference_mode():
generated, trajectory = model.sample(
cond=audio,
text=final_text_list,
duration=duration,
steps=nfe_step,
cfg_strength=cfg_strength,
sway_sampling_coef=sway_sampling_coef,
seed=seed,
edit_mask=edit_mask,
)
print(f"Generated mel: {generated.shape}")
# Final result
generated = generated.to(torch.float32)
generated = generated[:, ref_audio_len:, :]
gen_mel_spec = generated.permute(0, 2, 1)
if mel_spec_type == "vocos":
generated_wave = vocoder.decode(gen_mel_spec)
elif mel_spec_type == "bigvgan":
generated_wave = vocoder(gen_mel_spec)
if rms < target_rms:
generated_wave = generated_wave * rms / target_rms
save_spectrogram(gen_mel_spec[0].cpu().numpy(), f"{output_dir}/speech_edit_out.png")
torchaudio.save(f"{output_dir}/speech_edit_out.wav", generated_wave.squeeze(0).cpu(), target_sample_rate)
print(f"Generated wav: {generated_wave.shape}")
# A unified script for inference process
# Make adjustments inside functions, and consider both gradio and cli scripts if need to change func output format
import os
import sys
sys.path.append(f"../../{os.path.dirname(os.path.abspath(__file__))}/third_party/BigVGAN/")
import hashlib
import re
import tempfile
from importlib.resources import files
import matplotlib
matplotlib.use("Agg")
import matplotlib.pylab as plt
import numpy as np
import torch
import torchaudio
import tqdm
from pydub import AudioSegment, silence
from transformers import pipeline
from vocos import Vocos
from f5_tts.model import CFM
from f5_tts.model.utils import (
get_tokenizer,
convert_char_to_pinyin,
)
_ref_audio_cache = {}
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
# -----------------------------------------
target_sample_rate = 24000
n_mel_channels = 100
hop_length = 256
win_length = 1024
n_fft = 1024
mel_spec_type = "vocos"
target_rms = 0.1
cross_fade_duration = 0.15
ode_method = "euler"
nfe_step = 32 # 16, 32
cfg_strength = 2.0
sway_sampling_coef = -1.0
speed = 1.0
fix_duration = None
# -----------------------------------------
# chunk text into smaller pieces
def chunk_text(text, max_chars=135):
"""
Splits the input text into chunks, each with a maximum number of characters.
Args:
text (str): The text to be split.
max_chars (int): The maximum number of characters per chunk.
Returns:
List[str]: A list of text chunks.
"""
chunks = []
current_chunk = ""
# Split the text into sentences based on punctuation followed by whitespace
sentences = re.split(r"(?<=[;:,.!?])\s+|(?<=[;:,。!?])", text)
for sentence in sentences:
if len(current_chunk.encode("utf-8")) + len(sentence.encode("utf-8")) <= max_chars:
current_chunk += sentence + " " if sentence and len(sentence[-1].encode("utf-8")) == 1 else sentence
else:
if current_chunk:
chunks.append(current_chunk.strip())
current_chunk = sentence + " " if sentence and len(sentence[-1].encode("utf-8")) == 1 else sentence
if current_chunk:
chunks.append(current_chunk.strip())
return chunks
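# Illustrative example (input text made up for this comment): with max_chars=20,
#   chunk_text("Hello world. This is a longer sentence to split.", max_chars=20)
# returns ["Hello world.", "This is a longer sentence to split."] -- sentences are grouped
# greedily until the byte budget is reached, and a single sentence longer than max_chars
# still ends up in its own chunk rather than being cut mid-sentence.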
# load vocoder
def load_vocoder(vocoder_name="vocos", is_local=False, local_path="", device=device):
if vocoder_name == "vocos":
if is_local:
print(f"Load vocos from local path {local_path}")
vocoder = Vocos.from_hparams(f"{local_path}/config.yaml")
state_dict = torch.load(f"{local_path}/pytorch_model.bin", map_location="cpu")
vocoder.load_state_dict(state_dict)
vocoder = vocoder.eval().to(device)
else:
print("Download Vocos from huggingface charactr/vocos-mel-24khz")
vocoder = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(device)
elif vocoder_name == "bigvgan":
try:
from third_party.BigVGAN import bigvgan
except ImportError:
print("You need to follow the README to init submodule and change the BigVGAN source code.")
if is_local:
"""download from https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x/tree/main"""
vocoder = bigvgan.BigVGAN.from_pretrained(local_path, use_cuda_kernel=False)
else:
vocoder = bigvgan.BigVGAN.from_pretrained("nvidia/bigvgan_v2_24khz_100band_256x", use_cuda_kernel=False)
vocoder.remove_weight_norm()
vocoder = vocoder.eval().to(device)
return vocoder
# load asr pipeline
asr_pipe = None
def initialize_asr_pipeline(device=device, dtype=None):
if dtype is None:
dtype = (
torch.float16 if device == "cuda" and torch.cuda.get_device_properties(device).major >= 6 else torch.float32
)
global asr_pipe
asr_pipe = pipeline(
"automatic-speech-recognition",
model="openai/whisper-large-v3-turbo",
torch_dtype=dtype,
device=device,
)
# load model checkpoint for inference
def load_checkpoint(model, ckpt_path, device, dtype=None, use_ema=True):
if dtype is None:
dtype = (
torch.float16 if device == "cuda" and torch.cuda.get_device_properties(device).major >= 6 else torch.float32
)
model = model.to(dtype)
ckpt_type = ckpt_path.split(".")[-1]
if ckpt_type == "safetensors":
from safetensors.torch import load_file
checkpoint = load_file(ckpt_path)
else:
checkpoint = torch.load(ckpt_path, weights_only=True)
if use_ema:
if ckpt_type == "safetensors":
checkpoint = {"ema_model_state_dict": checkpoint}
checkpoint["model_state_dict"] = {
k.replace("ema_model.", ""): v
for k, v in checkpoint["ema_model_state_dict"].items()
if k not in ["initted", "step"]
}
# patch for backward compatibility, 305e3ea
for key in ["mel_spec.mel_stft.mel_scale.fb", "mel_spec.mel_stft.spectrogram.window"]:
if key in checkpoint["model_state_dict"]:
del checkpoint["model_state_dict"][key]
model.load_state_dict(checkpoint["model_state_dict"])
else:
if ckpt_type == "safetensors":
checkpoint = {"model_state_dict": checkpoint}
model.load_state_dict(checkpoint["model_state_dict"])
return model.to(device)
# load model for inference
def load_model(
model_cls,
model_cfg,
ckpt_path,
mel_spec_type=mel_spec_type,
vocab_file="",
ode_method=ode_method,
use_ema=True,
device=device,
):
if vocab_file == "":
vocab_file = str(files("f5_tts").joinpath("infer/examples/vocab.txt"))
tokenizer = "custom"
print("\nvocab : ", vocab_file)
print("tokenizer : ", tokenizer)
print("model : ", ckpt_path, "\n")
vocab_char_map, vocab_size = get_tokenizer(vocab_file, tokenizer)
model = CFM(
transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
mel_spec_kwargs=dict(
n_fft=n_fft,
hop_length=hop_length,
win_length=win_length,
n_mel_channels=n_mel_channels,
target_sample_rate=target_sample_rate,
mel_spec_type=mel_spec_type,
),
odeint_kwargs=dict(
method=ode_method,
),
vocab_char_map=vocab_char_map,
).to(device)
dtype = torch.float32 if mel_spec_type == "bigvgan" else None
model = load_checkpoint(model, ckpt_path, device, dtype=dtype, use_ema=use_ema)
return model
def remove_silence_edges(audio, silence_threshold=-42):
# Remove silence from the start
non_silent_start_idx = silence.detect_leading_silence(audio, silence_threshold=silence_threshold)
audio = audio[non_silent_start_idx:]
# Remove silence from the end
non_silent_end_duration = audio.duration_seconds
for ms in reversed(audio):
if ms.dBFS > silence_threshold:
break
non_silent_end_duration -= 0.001
trimmed_audio = audio[: int(non_silent_end_duration * 1000)]
return trimmed_audio
# preprocess reference audio and text
def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_info=print, device=device):
show_info("Converting audio...")
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
aseg = AudioSegment.from_file(ref_audio_orig)
if clip_short:
# 1. try to find long silence for clipping
non_silent_segs = silence.split_on_silence(
aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
)
non_silent_wave = AudioSegment.silent(duration=0)
for non_silent_seg in non_silent_segs:
if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 15000:
show_info("Audio is over 15s, clipping short. (1)")
break
non_silent_wave += non_silent_seg
# 2. try to find short silence for clipping if 1. failed
if len(non_silent_wave) > 15000:
non_silent_segs = silence.split_on_silence(
aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000, seek_step=10
)
non_silent_wave = AudioSegment.silent(duration=0)
for non_silent_seg in non_silent_segs:
if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 15000:
show_info("Audio is over 15s, clipping short. (2)")
break
non_silent_wave += non_silent_seg
aseg = non_silent_wave
# 3. if no proper silence found for clipping
if len(aseg) > 15000:
aseg = aseg[:15000]
show_info("Audio is over 15s, clipping short. (3)")
aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
aseg.export(f.name, format="wav")
ref_audio = f.name
# Compute a hash of the reference audio file
with open(ref_audio, "rb") as audio_file:
audio_data = audio_file.read()
audio_hash = hashlib.md5(audio_data).hexdigest()
global _ref_audio_cache
if audio_hash in _ref_audio_cache:
# Use cached reference text
show_info("Using cached reference text...")
ref_text = _ref_audio_cache[audio_hash]
else:
if not ref_text.strip():
global asr_pipe
if asr_pipe is None:
initialize_asr_pipeline(device=device)
show_info("No reference text provided, transcribing reference audio...")
ref_text = asr_pipe(
ref_audio,
chunk_length_s=30,
batch_size=128,
generate_kwargs={"task": "transcribe"},
return_timestamps=False,
)["text"].strip()
show_info("Finished transcription")
else:
show_info("Using custom reference text...")
# Cache the transcribed text
_ref_audio_cache[audio_hash] = ref_text
# Ensure ref_text ends with a proper sentence-ending punctuation
if not ref_text.endswith(". ") and not ref_text.endswith("。"):
if ref_text.endswith("."):
ref_text += " "
else:
ref_text += ". "
return ref_audio, ref_text
# infer process: chunk text -> infer batches [i.e. infer_batch_process()]
def infer_process(
ref_audio,
ref_text,
gen_text,
model_obj,
vocoder,
mel_spec_type=mel_spec_type,
show_info=print,
progress=tqdm,
target_rms=target_rms,
cross_fade_duration=cross_fade_duration,
nfe_step=nfe_step,
cfg_strength=cfg_strength,
sway_sampling_coef=sway_sampling_coef,
speed=speed,
fix_duration=fix_duration,
device=device,
):
# Split the input text into batches
audio, sr = torchaudio.load(ref_audio)
max_chars = int(len(ref_text.encode("utf-8")) / (audio.shape[-1] / sr) * (25 - audio.shape[-1] / sr))
gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
for i, gen_text in enumerate(gen_text_batches):
print(f"gen_text {i}", gen_text)
show_info(f"Generating audio in {len(gen_text_batches)} batches...")
return infer_batch_process(
(audio, sr),
ref_text,
gen_text_batches,
model_obj,
vocoder,
mel_spec_type=mel_spec_type,
progress=progress,
target_rms=target_rms,
cross_fade_duration=cross_fade_duration,
nfe_step=nfe_step,
cfg_strength=cfg_strength,
sway_sampling_coef=sway_sampling_coef,
speed=speed,
fix_duration=fix_duration,
device=device,
)
# infer batches
def infer_batch_process(
ref_audio,
ref_text,
gen_text_batches,
model_obj,
vocoder,
mel_spec_type="vocos",
progress=tqdm,
target_rms=0.1,
cross_fade_duration=0.15,
nfe_step=32,
cfg_strength=2.0,
sway_sampling_coef=-1,
speed=1,
fix_duration=None,
device=None,
):
audio, sr = ref_audio
if audio.shape[0] > 1:
audio = torch.mean(audio, dim=0, keepdim=True)
rms = torch.sqrt(torch.mean(torch.square(audio)))
if rms < target_rms:
audio = audio * target_rms / rms
if sr != target_sample_rate:
resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
audio = resampler(audio)
audio = audio.to(device)
generated_waves = []
spectrograms = []
if len(ref_text[-1].encode("utf-8")) == 1:
ref_text = ref_text + " "
for i, gen_text in enumerate(progress.tqdm(gen_text_batches)):
# Prepare the text
text_list = [ref_text + gen_text]
final_text_list = convert_char_to_pinyin(text_list)
ref_audio_len = audio.shape[-1] // hop_length
if fix_duration is not None:
duration = int(fix_duration * target_sample_rate / hop_length)
else:
# Calculate duration
ref_text_len = len(ref_text.encode("utf-8"))
gen_text_len = len(gen_text.encode("utf-8"))
duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)
# inference
with torch.inference_mode():
generated, _ = model_obj.sample(
cond=audio,
text=final_text_list,
duration=duration,
steps=nfe_step,
cfg_strength=cfg_strength,
sway_sampling_coef=sway_sampling_coef,
)
generated = generated.to(torch.float32)
generated = generated[:, ref_audio_len:, :]
generated_mel_spec = generated.permute(0, 2, 1)
if mel_spec_type == "vocos":
generated_wave = vocoder.decode(generated_mel_spec)
elif mel_spec_type == "bigvgan":
generated_wave = vocoder(generated_mel_spec)
if rms < target_rms:
generated_wave = generated_wave * rms / target_rms
# wav -> numpy
generated_wave = generated_wave.squeeze().cpu().numpy()
generated_waves.append(generated_wave)
spectrograms.append(generated_mel_spec[0].cpu().numpy())
# Combine all generated waves with cross-fading
if cross_fade_duration <= 0:
# Simply concatenate
final_wave = np.concatenate(generated_waves)
else:
final_wave = generated_waves[0]
for i in range(1, len(generated_waves)):
prev_wave = final_wave
next_wave = generated_waves[i]
# Calculate cross-fade samples, ensuring it does not exceed wave lengths
cross_fade_samples = int(cross_fade_duration * target_sample_rate)
cross_fade_samples = min(cross_fade_samples, len(prev_wave), len(next_wave))
if cross_fade_samples <= 0:
# No overlap possible, concatenate
final_wave = np.concatenate([prev_wave, next_wave])
continue
# Overlapping parts
prev_overlap = prev_wave[-cross_fade_samples:]
next_overlap = next_wave[:cross_fade_samples]
# Fade out and fade in
fade_out = np.linspace(1, 0, cross_fade_samples)
fade_in = np.linspace(0, 1, cross_fade_samples)
# Cross-faded overlap
cross_faded_overlap = prev_overlap * fade_out + next_overlap * fade_in
# Combine
new_wave = np.concatenate(
[prev_wave[:-cross_fade_samples], cross_faded_overlap, next_wave[cross_fade_samples:]]
)
final_wave = new_wave
# Create a combined spectrogram
combined_spectrogram = np.concatenate(spectrograms, axis=1)
return final_wave, target_sample_rate, combined_spectrogram
# remove silence from generated wav
def remove_silence_for_generated_wav(filename):
aseg = AudioSegment.from_file(filename)
non_silent_segs = silence.split_on_silence(
aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500, seek_step=10
)
non_silent_wave = AudioSegment.silent(duration=0)
for non_silent_seg in non_silent_segs:
non_silent_wave += non_silent_seg
aseg = non_silent_wave
aseg.export(filename, format="wav")
# save spectrogram
def save_spectrogram(spectrogram, path):
plt.figure(figsize=(12, 4))
plt.imshow(spectrogram, origin="lower", aspect="auto")
plt.colorbar()
plt.savefig(path)
plt.close()
from f5_tts.model.cfm import CFM
from f5_tts.model.backbones.unett import UNetT
from f5_tts.model.backbones.dit import DiT
from f5_tts.model.backbones.mmdit import MMDiT
from f5_tts.model.trainer import Trainer
__all__ = ["CFM", "UNetT", "DiT", "MMDiT", "Trainer"]
## Backbones quick introduction

### unett.py
- flat unet transformer
- same structure as in the e2-tts & voicebox papers, except using rotary pos emb
- update: allow optional abs pos emb & convnextv2 blocks for the embedded text before concat

### dit.py
- adaln-zero dit
- embedded timestep as condition
- noised_input + masked_cond + embedded_text concatenated, then a linear proj in
- optional abs pos emb & convnextv2 blocks for the embedded text before concat
- optional long skip connection (first layer to last layer)

### mmdit.py
- sd3 structure
- timestep as condition
- left stream: embedded text with an abs pos emb applied
- right stream: masked_cond & noised_input concatenated, with the same conv pos emb as unett
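A minimal sketch of how these backbones are instantiated and wrapped in `CFM` (configs copied from `infer_cli.py` and `count_params_gflops.py` in this commit; `text_num_embeds=256` and `mel_dim=100` are placeholder values here, real runs take the vocab size from `get_tokenizer` and the mel dim from the mel settings):

```python
from f5_tts.model import CFM, DiT, UNetT

# F5TTS_Base backbone config
dit = DiT(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4,
          text_num_embeds=256, mel_dim=100)

# E2TTS_Base backbone config
unett = UNetT(dim=1024, depth=24, heads=16, ff_mult=4, text_num_embeds=256, mel_dim=100)

# wrap a backbone in the conditional flow matching model (as count_params_gflops.py does)
model = CFM(transformer=dit)
```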
"""
ein notation:
b - batch
n - sequence
nt - text sequence
nw - raw wave length
d - dimension
"""
from __future__ import annotations
import torch
from torch import nn
import torch.nn.functional as F
from x_transformers.x_transformers import RotaryEmbedding
from f5_tts.model.modules import (
TimestepEmbedding,
ConvNeXtV2Block,
ConvPositionEmbedding,
DiTBlock,
AdaLayerNormZero_Final,
precompute_freqs_cis,
get_pos_embed_indices,
)
# Text embedding
class TextEmbedding(nn.Module):
def __init__(self, text_num_embeds, text_dim, conv_layers=0, conv_mult=2):
super().__init__()
self.text_embed = nn.Embedding(text_num_embeds + 1, text_dim) # use 0 as filler token
if conv_layers > 0:
self.extra_modeling = True
self.precompute_max_pos = 4096 # ~44s of 24khz audio
self.register_buffer("freqs_cis", precompute_freqs_cis(text_dim, self.precompute_max_pos), persistent=False)
self.text_blocks = nn.Sequential(
*[ConvNeXtV2Block(text_dim, text_dim * conv_mult) for _ in range(conv_layers)]
)
else:
self.extra_modeling = False
def forward(self, text: int["b nt"], seq_len, drop_text=False): # noqa: F722
text = text + 1 # use 0 as filler token. preprocess of batch pad -1, see list_str_to_idx()
text = text[:, :seq_len] # curtail if character tokens are more than the mel spec tokens
batch, text_len = text.shape[0], text.shape[1]
text = F.pad(text, (0, seq_len - text_len), value=0)
if drop_text: # cfg for text
text = torch.zeros_like(text)
text = self.text_embed(text) # b n -> b n d
# possible extra modeling
if self.extra_modeling:
# sinus pos emb
batch_start = torch.zeros((batch,), dtype=torch.long)
pos_idx = get_pos_embed_indices(batch_start, seq_len, max_pos=self.precompute_max_pos)
text_pos_embed = self.freqs_cis[pos_idx]
text = text + text_pos_embed
# convnextv2 blocks
text = self.text_blocks(text)
return text
# noised input audio and context mixing embedding
class InputEmbedding(nn.Module):
def __init__(self, mel_dim, text_dim, out_dim):
super().__init__()
self.proj = nn.Linear(mel_dim * 2 + text_dim, out_dim)
self.conv_pos_embed = ConvPositionEmbedding(dim=out_dim)
def forward(self, x: float["b n d"], cond: float["b n d"], text_embed: float["b n d"], drop_audio_cond=False): # noqa: F722
if drop_audio_cond: # cfg for cond audio
cond = torch.zeros_like(cond)
x = self.proj(torch.cat((x, cond, text_embed), dim=-1))
x = self.conv_pos_embed(x) + x
return x
# Transformer backbone using DiT blocks
class DiT(nn.Module):
def __init__(
self,
*,
dim,
depth=8,
heads=8,
dim_head=64,
dropout=0.1,
ff_mult=4,
mel_dim=100,
text_num_embeds=256,
text_dim=None,
conv_layers=0,
long_skip_connection=False,
):
super().__init__()
self.time_embed = TimestepEmbedding(dim)
if text_dim is None:
text_dim = mel_dim
self.text_embed = TextEmbedding(text_num_embeds, text_dim, conv_layers=conv_layers)
self.input_embed = InputEmbedding(mel_dim, text_dim, dim)
self.rotary_embed = RotaryEmbedding(dim_head)
self.dim = dim
self.depth = depth
self.transformer_blocks = nn.ModuleList(
[DiTBlock(dim=dim, heads=heads, dim_head=dim_head, ff_mult=ff_mult, dropout=dropout) for _ in range(depth)]
)
self.long_skip_connection = nn.Linear(dim * 2, dim, bias=False) if long_skip_connection else None
self.norm_out = AdaLayerNormZero_Final(dim) # final modulation
self.proj_out = nn.Linear(dim, mel_dim)
def forward(
self,
x: float["b n d"], # nosied input audio # noqa: F722
cond: float["b n d"], # masked cond audio # noqa: F722
text: int["b nt"], # text # noqa: F722
time: float["b"] | float[""], # time step # noqa: F821 F722
drop_audio_cond, # cfg for cond audio
drop_text, # cfg for text
mask: bool["b n"] | None = None, # noqa: F722
):
batch, seq_len = x.shape[0], x.shape[1]
if time.ndim == 0:
time = time.repeat(batch)
# t: conditioning time, c: context (text + masked cond audio), x: noised input audio
t = self.time_embed(time)
text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
x = self.input_embed(x, cond, text_embed, drop_audio_cond=drop_audio_cond)
rope = self.rotary_embed.forward_from_seq_len(seq_len)
if self.long_skip_connection is not None:
residual = x
for block in self.transformer_blocks:
x = block(x, t, mask=mask, rope=rope)
if self.long_skip_connection is not None:
x = self.long_skip_connection(torch.cat((x, residual), dim=-1))
x = self.norm_out(x, t)
output = self.proj_out(x)
return output
"""
ein notation:
b - batch
n - sequence
nt - text sequence
nw - raw wave length
d - dimension
"""
from __future__ import annotations
import torch
from torch import nn
from x_transformers.x_transformers import RotaryEmbedding
from f5_tts.model.modules import (
TimestepEmbedding,
ConvPositionEmbedding,
MMDiTBlock,
AdaLayerNormZero_Final,
precompute_freqs_cis,
get_pos_embed_indices,
)
# text embedding
class TextEmbedding(nn.Module):
def __init__(self, out_dim, text_num_embeds):
super().__init__()
self.text_embed = nn.Embedding(text_num_embeds + 1, out_dim) # will use 0 as filler token
self.precompute_max_pos = 1024
self.register_buffer("freqs_cis", precompute_freqs_cis(out_dim, self.precompute_max_pos), persistent=False)
def forward(self, text: int["b nt"], drop_text=False) -> int["b nt d"]: # noqa: F722
text = text + 1
if drop_text:
text = torch.zeros_like(text)
text = self.text_embed(text)
# sinus pos emb
batch_start = torch.zeros((text.shape[0],), dtype=torch.long)
batch_text_len = text.shape[1]
pos_idx = get_pos_embed_indices(batch_start, batch_text_len, max_pos=self.precompute_max_pos)
text_pos_embed = self.freqs_cis[pos_idx]
text = text + text_pos_embed
return text
# noised input & masked cond audio embedding
class AudioEmbedding(nn.Module):
def __init__(self, in_dim, out_dim):
super().__init__()
self.linear = nn.Linear(2 * in_dim, out_dim)
self.conv_pos_embed = ConvPositionEmbedding(out_dim)
def forward(self, x: float["b n d"], cond: float["b n d"], drop_audio_cond=False): # noqa: F722
if drop_audio_cond:
cond = torch.zeros_like(cond)
x = torch.cat((x, cond), dim=-1)
x = self.linear(x)
x = self.conv_pos_embed(x) + x
return x
# Transformer backbone using MM-DiT blocks
class MMDiT(nn.Module):
def __init__(
self,
*,
dim,
depth=8,
heads=8,
dim_head=64,
dropout=0.1,
ff_mult=4,
text_num_embeds=256,
mel_dim=100,
):
super().__init__()
self.time_embed = TimestepEmbedding(dim)
self.text_embed = TextEmbedding(dim, text_num_embeds)
self.audio_embed = AudioEmbedding(mel_dim, dim)
self.rotary_embed = RotaryEmbedding(dim_head)
self.dim = dim
self.depth = depth
self.transformer_blocks = nn.ModuleList(
[
MMDiTBlock(
dim=dim,
heads=heads,
dim_head=dim_head,
dropout=dropout,
ff_mult=ff_mult,
context_pre_only=i == depth - 1,
)
for i in range(depth)
]
)
self.norm_out = AdaLayerNormZero_Final(dim) # final modulation
self.proj_out = nn.Linear(dim, mel_dim)
def forward(
self,
x: float["b n d"], # nosied input audio # noqa: F722
cond: float["b n d"], # masked cond audio # noqa: F722
text: int["b nt"], # text # noqa: F722
time: float["b"] | float[""], # time step # noqa: F821 F722
drop_audio_cond, # cfg for cond audio
drop_text, # cfg for text
mask: bool["b n"] | None = None, # noqa: F722
):
batch = x.shape[0]
if time.ndim == 0:
time = time.repeat(batch)
# t: conditioning (time), c: context (text + masked cond audio), x: noised input audio
t = self.time_embed(time)
c = self.text_embed(text, drop_text=drop_text)
x = self.audio_embed(x, cond, drop_audio_cond=drop_audio_cond)
seq_len = x.shape[1]
text_len = text.shape[1]
rope_audio = self.rotary_embed.forward_from_seq_len(seq_len)
rope_text = self.rotary_embed.forward_from_seq_len(text_len)
for block in self.transformer_blocks:
c, x = block(x, c, t, mask=mask, rope=rope_audio, c_rope=rope_text)
x = self.norm_out(x, t)
output = self.proj_out(x)
return output
"""ADAPTIVE BATCH SIZE"""
print("Adaptive batch size: using grouping batch sampler, frames_per_gpu fixed fed in")
print(" -> least padding, gather wavs with accumulated frames in a batch\n")
# data
total_hours = 95282
mel_hop_length = 256
mel_sampling_rate = 24000
# target
wanted_max_updates = 1000000
# train params
gpus = 8
frames_per_gpu = 38400 # 8 * 38400 = 307200
grad_accum = 1
# intermediate
mini_batch_frames = frames_per_gpu * grad_accum * gpus
mini_batch_hours = mini_batch_frames * mel_hop_length / mel_sampling_rate / 3600
updates_per_epoch = total_hours / mini_batch_hours
steps_per_epoch = updates_per_epoch * grad_accum
# result
epochs = wanted_max_updates / updates_per_epoch
print(f"epochs should be set to: {epochs:.0f} ({epochs/grad_accum:.1f} x gd_acum {grad_accum})")
print(f"progress_bar should show approx. 0/{updates_per_epoch:.0f} updates")
print(f" or approx. 0/{steps_per_epoch:.0f} steps")
# others
print(f"total {total_hours:.0f} hours")
print(f"mini-batch of {mini_batch_frames:.0f} frames, {mini_batch_hours:.2f} hours per mini-batch")
import sys
import os
sys.path.append(os.getcwd())
from f5_tts.model import CFM, DiT
import torch
import thop
""" ~155M """
# transformer = UNetT(dim = 768, depth = 20, heads = 12, ff_mult = 4)
# transformer = UNetT(dim = 768, depth = 20, heads = 12, ff_mult = 4, text_dim = 512, conv_layers = 4)
# transformer = DiT(dim = 768, depth = 18, heads = 12, ff_mult = 2)
# transformer = DiT(dim = 768, depth = 18, heads = 12, ff_mult = 2, text_dim = 512, conv_layers = 4)
# transformer = DiT(dim = 768, depth = 18, heads = 12, ff_mult = 2, text_dim = 512, conv_layers = 4, long_skip_connection = True)
# transformer = MMDiT(dim = 512, depth = 16, heads = 16, ff_mult = 2)
""" ~335M """
# FLOPs: 622.1 G, Params: 333.2 M
# transformer = UNetT(dim = 1024, depth = 24, heads = 16, ff_mult = 4)
# FLOPs: 363.4 G, Params: 335.8 M
transformer = DiT(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
model = CFM(transformer=transformer)
target_sample_rate = 24000
n_mel_channels = 100
hop_length = 256
duration = 20
frame_length = int(duration * target_sample_rate / hop_length)
text_length = 150
flops, params = thop.profile(
model, inputs=(torch.randn(1, frame_length, n_mel_channels), torch.zeros(1, text_length, dtype=torch.long))
)
print(f"FLOPs: {flops / 1e9} G")
print(f"Params: {params / 1e6} M")