Commit b75857fb authored by chenzk (v1.0)
_target_: fish_speech.models.vqgan.modules.firefly.FireflyArchitecture
spec_transform:
_target_: fish_speech.utils.spectrogram.LogMelSpectrogram
sample_rate: 44100
n_mels: 160
n_fft: 2048
hop_length: 512
win_length: 2048
backbone:
_target_: fish_speech.models.vqgan.modules.firefly.ConvNeXtEncoder
input_channels: 160
depths: [3, 3, 9, 3]
dims: [128, 256, 384, 512]
drop_path_rate: 0.2
kernel_size: 7
head:
_target_: fish_speech.models.vqgan.modules.firefly.HiFiGANGenerator
hop_length: 512
upsample_rates: [8, 8, 2, 2, 2] # aka. strides
upsample_kernel_sizes: [16, 16, 4, 4, 4]
resblock_kernel_sizes: [3, 7, 11]
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
num_mels: 512
upsample_initial_channel: 512
pre_conv_kernel_size: 13
post_conv_kernel_size: 13
quantizer:
_target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
input_dim: 512
n_groups: 8
n_codebooks: 1
levels: [8, 5, 5, 5]
downsample_factor: [2, 2]
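The block above is a Hydra-style config (every component carries a `_target_` key). A minimal sketch of instantiating it is shown below, assuming the config is saved as `fish_speech/configs/firefly_gan_vq.yaml` and that the `fish_speech` package is importable; the file path is illustrative only.

```python
# Hedged sketch: build the FireflyArchitecture from the YAML above.
# Assumption: the config lives at "fish_speech/configs/firefly_gan_vq.yaml";
# hydra.utils.instantiate resolves the nested _target_ entries recursively.
from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.load("fish_speech/configs/firefly_gan_vq.yaml")
model = instantiate(cfg)
print(type(model).__name__)  # FireflyArchitecture
```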
_target_: fish_speech.models.text2semantic.lora.LoraConfig
r: 8
lora_alpha: 16
lora_dropout: 0.01
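In the standard LoRA formulation the low-rank update is scaled by `lora_alpha / r` before being added to the frozen weight, so `r: 8` with `lora_alpha: 16` gives an effective scale of 2. The quick check below assumes the common LoRA convention; it is not read from `fish_speech.models.text2semantic.lora` itself.

```python
# Hedged arithmetic check, assuming the usual LoRA scaling W + (alpha / r) * B @ A.
r, lora_alpha = 8, 16
scaling = lora_alpha / r
print(scaling)  # 2.0
```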
defaults:
- base
- _self_
project: text2semantic_finetune_dual_ar
max_length: 4096
pretrained_ckpt_path: checkpoints/fish-speech-1.5
# Lightning Trainer
trainer:
accumulate_grad_batches: 1
gradient_clip_val: 1.0
gradient_clip_algorithm: "norm"
max_steps: 10000
precision: bf16-true
limit_val_batches: 10
val_check_interval: 100
# strategy:
# find_unused_parameters: true
# static_graph: true
# Tokenizer Configuration
tokenizer:
_target_: fish_speech.tokenizer.FishTokenizer
model_path: ${pretrained_ckpt_path}/tokenizer.tiktoken
# Dataset Configuration
train_dataset:
_target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionIterableDataset
proto_files:
- data/protos
tokenizer: ${tokenizer}
causal: true
max_length: ${max_length}
use_speaker: false
interactive_prob: 0.7
val_dataset:
_target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionIterableDataset
proto_files:
- data/protos
tokenizer: ${tokenizer}
causal: true
max_length: ${max_length}
use_speaker: false
interactive_prob: 0.7
data:
_target_: fish_speech.datasets.semantic.SemanticDataModule
train_dataset: ${train_dataset}
val_dataset: ${val_dataset}
num_workers: 4
batch_size: 4
tokenizer: ${tokenizer}
max_length: ${max_length}
# Model Configuration
model:
_target_: fish_speech.models.text2semantic.lit_module.TextToSemantic
model:
_target_: fish_speech.models.text2semantic.llama.BaseTransformer.from_pretrained
path: ${pretrained_ckpt_path}
load_weights: true
max_length: ${max_length}
lora_config: null
optimizer:
_target_: torch.optim.AdamW
_partial_: true
lr: 1e-4
weight_decay: 0
betas: [0.9, 0.95]
eps: 1e-5
lr_scheduler:
_target_: torch.optim.lr_scheduler.LambdaLR
_partial_: true
lr_lambda:
_target_: fish_speech.scheduler.get_constant_schedule_with_warmup_lr_lambda
_partial_: true
num_warmup_steps: 10
# Callbacks
callbacks:
model_checkpoint:
every_n_train_steps: ${trainer.val_check_interval}
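This finetuning config relies on Hydra `defaults` and `${...}` interpolation, so it is normally composed rather than loaded directly. Below is a minimal sketch using Hydra's Compose API; the config directory name and the presence of the referenced checkpoint on disk are assumptions.

```python
# Hedged sketch: compose the finetune config and instantiate a couple of nodes.
# Assumptions: the YAML above is stored as "text2semantic_finetune.yaml" under
# "fish_speech/configs" and ${pretrained_ckpt_path} points at a real checkpoint.
from hydra import compose, initialize
from hydra.utils import instantiate

with initialize(version_base=None, config_path="fish_speech/configs"):
    cfg = compose(
        config_name="text2semantic_finetune",
        overrides=["trainer.max_steps=100", "data.num_workers=0"],
    )
    tokenizer = instantiate(cfg.tokenizer)  # FishTokenizer from the checkpoint
    datamodule = instantiate(cfg.data)      # SemanticDataModule with both datasets
```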
from dataclasses import dataclass, field
from typing import Literal
import torch
from .tokenizer import MODALITY_TOKENS, FishTokenizer
CODEBOOK_PAD_TOKEN_ID = 0
@dataclass(kw_only=True)
class BasePart:
pass
@dataclass(kw_only=True)
class VQPart(BasePart):
codes: torch.Tensor
@dataclass(kw_only=True)
class TextPart(BasePart):
text: str
@dataclass(kw_only=True)
class EncodedMessage:
tokens: torch.Tensor
labels: torch.Tensor
vq_mask_tokens: torch.Tensor | None = None
vq_mask_labels: torch.Tensor | None = None
vq_parts: list[torch.Tensor]
vq_require_losses: torch.Tensor | None = None
@dataclass(kw_only=True)
class Message:
role: Literal["system", "user", "assistant"]
parts: list[VQPart | TextPart] = field(default_factory=list)
add_im_start: bool = True
add_im_end: bool = True
cal_loss: bool = False
modality: Literal["text", "voice", "interleave"] | None = None
# By default, ignore the loss of the auto-generated im_start token
ignore_im_start_loss: bool = True
def encode(
self: "Message",
tokenizer: FishTokenizer,
) -> EncodedMessage:
all_tokens = []
all_labels = []
# Multi-modal tokens
vq_parts = []
vq_masks = []
parts = self.parts.copy()
if self.add_im_start:
modality_token = MODALITY_TOKENS[self.modality] if self.modality else ""
parts.insert(0, TextPart(text=f"<|im_start|>{self.role}\n{modality_token}"))
if self.add_im_end:
parts.append(TextPart(text="<|im_end|>"))
for part in parts:
if isinstance(part, TextPart):
tokens = torch.tensor(
tokenizer.encode(part.text),
dtype=torch.int,
)
elif isinstance(part, VQPart):
curr_codes = part.codes.clone()
tokens = torch.tensor(
[
tokenizer.semantic_id_to_token_id[i.item()]
for i in curr_codes[0].int()
],
dtype=torch.int,
)
vq_parts.append(curr_codes)
else:
raise ValueError(f"Unsupported part type: {type(part)}")
all_tokens.append(tokens)
if isinstance(part, VQPart):
vq_masks.append(torch.ones_like(tokens, dtype=torch.bool))
else:
vq_masks.append(torch.zeros_like(tokens, dtype=torch.bool))
if self.cal_loss:
all_labels.append(tokens.clone())
else:
all_labels.append(torch.full_like(tokens, -100))
tokens = torch.cat(all_tokens, dim=0)
labels = torch.cat(all_labels, dim=0)
vq_masks = torch.cat(vq_masks, dim=0)
assert tokens.shape == labels.shape == vq_masks.shape
if self.ignore_im_start_loss and self.add_im_start:
labels[: len(all_tokens[0])] = -100
return EncodedMessage(
tokens=tokens,
labels=labels,
vq_parts=vq_parts,
vq_mask_tokens=vq_masks,
vq_mask_labels=vq_masks,
)
@dataclass
class Conversation:
messages: list[Message]
def __init__(self: "Conversation", messages: list[Message] | None = None):
self.messages = messages or []
def encode(
self: "Conversation",
tokenizer: FishTokenizer,
add_shift: bool = True,
ignore_loss_tokens: list[str] = [],
) -> EncodedMessage:
# Build the input_ids and labels
tokens = []
labels = []
vq_parts = []
vq_mask_tokens = []
vq_mask_labels = []
vq_require_losses = []
ignore_loss_token_ids = [tokenizer.get_token_id(i) for i in ignore_loss_tokens]
for message in self.messages:
encoded = message.encode(
tokenizer,
)
tokens.append(encoded.tokens)
labels.append(encoded.labels)
vq_parts.extend(encoded.vq_parts)
vq_mask_tokens.append(encoded.vq_mask_tokens)
vq_mask_labels.append(encoded.vq_mask_labels)
vq_require_losses.extend([message.cal_loss] * len(encoded.vq_parts))
tokens = torch.cat(tokens, dim=0)
labels = torch.cat(labels, dim=0)
vq_mask_tokens = torch.cat(vq_mask_tokens, dim=0)
vq_mask_labels = torch.cat(vq_mask_labels, dim=0)
vq_require_losses = torch.tensor(vq_require_losses, dtype=torch.bool)
if add_shift:
tokens = tokens[:-1]
labels = labels[1:]
vq_mask_tokens = vq_mask_tokens[:-1]
vq_mask_labels = vq_mask_labels[1:]
for i in ignore_loss_token_ids:
assert i != -100 and i is not None
labels[labels == i] = -100
assert tokens.dtype in [
torch.int,
torch.long,
], f"Invalid dtype: {tokens.dtype}, conv: {conversation}"
return EncodedMessage(
tokens=tokens,
labels=labels,
vq_parts=vq_parts,
vq_mask_tokens=vq_mask_tokens,
vq_mask_labels=vq_mask_labels,
vq_require_losses=vq_require_losses,
)
def encode_for_inference(
self: "Conversation",
tokenizer: FishTokenizer,
num_codebooks: int,
) -> torch.Tensor:
# self.visualize(tokenizer)
encoded = self.encode(tokenizer, add_shift=False)
tokens = encoded.tokens
values = torch.zeros((num_codebooks + 1, len(tokens)), dtype=torch.int)
values[0] = tokens
if encoded.vq_parts is None or len(encoded.vq_parts) == 0:
return values
vq_parts = encoded.vq_parts
vq_parts = [part.to(values.device) for part in vq_parts]
vq_parts = torch.cat(vq_parts, dim=1)
values[0, encoded.vq_mask_tokens] = vq_parts[0] + tokenizer.semantic_begin_id
values[1:, encoded.vq_mask_tokens] = vq_parts
return values
def visualize(
self: "Conversation",
tokenizer: FishTokenizer,
ignore_loss_tokens: list[str] = [],
):
encoded = self.encode(
tokenizer, add_shift=False, ignore_loss_tokens=ignore_loss_tokens
)
colors = {
"purple": "\033[95m",
"yellow": "\033[93m",
"red": "\033[91m",
"cyan": "\033[96m",
}
first_idx = 0
second_idx = 0
def print_first_group(x):
nonlocal first_idx
color = colors["purple"] if first_idx % 2 == 0 else colors["yellow"]
print(f"{color}{x}\033[0m", end="")
first_idx += 1
def print_second_group(x):
nonlocal second_idx
color = colors["red"] if second_idx % 2 == 0 else colors["cyan"]
print(f"{color}{x}\033[0m", end="")
second_idx += 1
for tok, lab in zip(encoded.tokens, encoded.labels):
val = tokenizer.decode([tok])
if lab == -100:
print_second_group(val)
else:
print_first_group(val)
print()
def append(self: "Conversation", message: Message):
self.messages.append(message)
if __name__ == "__main__":
message0 = Message(
role="user",
parts=[
TextPart(text="Hello, how are you?"),
VQPart(codes=torch.zeros((4, 10))),
],
cal_loss=False,
)
message1 = Message(
role="assistant",
parts=[TextPart(text="I'm fine, thank you.")],
cal_loss=True,
)
conversation = Conversation([message0, message1])
tokenizer = FishTokenizer.from_pretrained("checkpoints/Qwen2-1.5B-Instruct")
conversation.visualize(tokenizer)
encoded = conversation.encode(tokenizer)
print(encoded)
print(tokenizer.batch_decode(encoded.tokens))
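The `__main__` block above exercises `Conversation.encode` but not `encode_for_inference`. Below is a short, hedged continuation of that example, reusing `conversation` and `tokenizer`; the codebook count matches the `(4, 10)` VQPart built above.

```python
# Hedged sketch: encode_for_inference packs the prompt into a
# (num_codebooks + 1, T) int tensor: row 0 holds text/semantic token ids,
# rows 1.. hold the raw VQ codes at the positions marked by vq_mask_tokens.
prompt = conversation.encode_for_inference(tokenizer, num_codebooks=4)
print(prompt.shape)   # torch.Size([5, T])
print(prompt[0, :8])  # first few token ids of the flattened conversation
```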
import bisect
import random
from typing import Iterable
from torch.utils.data import Dataset, IterableDataset
class ConcatRepeatDataset(Dataset):
datasets: list[Dataset]
cumulative_sizes: list[int]
repeats: list[int]
@staticmethod
def cumsum(sequence, repeats):
r, s = [], 0
for dataset, repeat in zip(sequence, repeats):
l = len(dataset) * repeat
r.append(l + s)
s += l
return r
def __init__(self, datasets: Iterable[Dataset], repeats: list[int]):
super().__init__()
self.datasets = list(datasets)
self.repeats = repeats
assert len(self.datasets) > 0, "datasets should not be an empty iterable"
assert len(self.datasets) == len(
repeats
), "datasets and repeats should have the same length"
for d in self.datasets:
assert not isinstance(
d, IterableDataset
), "ConcatRepeatDataset does not support IterableDataset"
self.cumulative_sizes = self.cumsum(self.datasets, self.repeats)
def __len__(self):
return self.cumulative_sizes[-1]
def __getitem__(self, idx):
dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
if dataset_idx == 0:
sample_idx = idx
else:
sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
dataset = self.datasets[dataset_idx]
return dataset[sample_idx % len(dataset)]
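A small, self-contained sketch of how `ConcatRepeatDataset` balances datasets of different sizes; the toy datasets and their contents are illustrative only.

```python
# Hedged example: a 2-item dataset repeated 3x sits next to a 4-item dataset
# repeated 1x, giving 10 samples in total.
from torch.utils.data import Dataset


class ToyDataset(Dataset):
    def __init__(self, items):
        self.items = items

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        return self.items[idx]


small = ToyDataset(["a", "b"])
large = ToyDataset([1, 2, 3, 4])
ds = ConcatRepeatDataset([small, large], repeats=[3, 1])
assert len(ds) == 10
print([ds[i] for i in range(len(ds))])  # ['a', 'b', 'a', 'b', 'a', 'b', 1, 2, 3, 4]
```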
syntax = "proto3";
package text_data;
message Semantics {
repeated uint32 values = 1;
}
message Sentence {
repeated string texts = 1;
repeated Semantics semantics = 3;
}
message TextData {
string source = 1;
string name = 2;
repeated Sentence sentences = 4;
}
message SampledData {
string source = 1;
string name = 2;
repeated Sentence samples = 3;
}
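A minimal sketch of building and round-tripping the messages defined above through the generated Python bindings; field names follow the proto definition and the import path matches how the rest of the code imports `text_data_pb2`.

```python
# Hedged example: construct a TextData protobuf and round-trip it through bytes.
from fish_speech.datasets.protos.text_data_pb2 import Semantics, Sentence, TextData

sentence = Sentence(texts=["hello world"])
sentence.semantics.append(Semantics(values=[1, 2, 3, 4]))

data = TextData(source="demo", name="speaker_0")
data.sentences.append(sentence)

restored = TextData.FromString(data.SerializeToString())
assert list(restored.sentences[0].semantics[0].values) == [1, 2, 3, 4]
```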
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: text-data.proto
# Protobuf Python Version: 4.25.1
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
b'\n\x0ftext-data.proto\x12\ttext_data"\x1b\n\tSemantics\x12\x0e\n\x06values\x18\x01 \x03(\r"B\n\x08Sentence\x12\r\n\x05texts\x18\x01 \x03(\t\x12\'\n\tsemantics\x18\x03 \x03(\x0b\x32\x14.text_data.Semantics"P\n\x08TextData\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12&\n\tsentences\x18\x04 \x03(\x0b\x32\x13.text_data.Sentence"Q\n\x0bSampledData\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12$\n\x07samples\x18\x03 \x03(\x0b\x32\x13.text_data.Sentenceb\x06proto3'
)
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "text_data_pb2", _globals)
if _descriptor._USE_C_DESCRIPTORS == False:
DESCRIPTOR._options = None
_globals["_SEMANTICS"]._serialized_start = 30
_globals["_SEMANTICS"]._serialized_end = 57
_globals["_SENTENCE"]._serialized_start = 59
_globals["_SENTENCE"]._serialized_end = 125
_globals["_TEXTDATA"]._serialized_start = 127
_globals["_TEXTDATA"]._serialized_end = 207
_globals["_SAMPLEDDATA"]._serialized_start = 209
_globals["_SAMPLEDDATA"]._serialized_end = 290
# @@protoc_insertion_point(module_scope)
import struct
from .text_data_pb2 import TextData
def read_pb_stream(f):
while True:
buf = f.read(4)
if len(buf) == 0:
break
size = struct.unpack("I", buf)[0]
buf = f.read(size)
text_data = TextData()
text_data.ParseFromString(buf)
yield text_data
def write_pb_stream(f, text_data):
buf = text_data.SerializeToString()
f.write(struct.pack("I", len(buf)))
f.write(buf)
def pack_pb_stream(text_data):
buf = text_data.SerializeToString()
return struct.pack("I", len(buf)) + buf
def split_pb_stream(f):
while True:
head = f.read(4)
if len(head) == 0:
break
size = struct.unpack("I", head)[0]
buf = f.read(size)
yield head + buf
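A round-trip sketch for the length-prefixed stream helpers above, using an in-memory buffer rather than a proto file on disk.

```python
# Hedged example: write two TextData records with write_pb_stream and read
# them back with read_pb_stream.
import io

from fish_speech.datasets.protos.text_data_pb2 import TextData
from fish_speech.datasets.protos.text_data_stream import (
    read_pb_stream,
    write_pb_stream,
)

buf = io.BytesIO()
for name in ["spk_a", "spk_b"]:
    write_pb_stream(buf, TextData(source="demo", name=name))

buf.seek(0)
assert [td.name for td in read_pb_stream(buf)] == ["spk_a", "spk_b"]
```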
import random
from dataclasses import dataclass
from itertools import chain
from pathlib import Path
from random import Random
from typing import Optional, Union
import numpy as np
import pyarrow.parquet as pq
import torch
import torch.nn.functional as F
from datasets.download.streaming_download_manager import xopen
from huggingface_hub import HfApi
from lightning import LightningDataModule
from torch.distributed import get_rank, get_world_size, is_initialized
from torch.utils.data import DataLoader, Dataset, IterableDataset, get_worker_info
from fish_speech.conversation import (
CODEBOOK_PAD_TOKEN_ID,
Conversation,
Message,
TextPart,
VQPart,
)
from fish_speech.datasets.protos.text_data_pb2 import SampledData
from fish_speech.datasets.protos.text_data_stream import read_pb_stream
from fish_speech.text.clean import clean_text
from fish_speech.tokenizer import FishTokenizer
from fish_speech.utils import RankedLogger
from fish_speech.utils.braceexpand import braceexpand
log = RankedLogger(__name__, rank_zero_only=True)
def split_by_rank_worker(files):
# We need to know the total number of devices
# to split the data properly
total_devices = 1
if is_initialized():
total_devices = get_world_size()
worker_info = get_worker_info()
if worker_info is not None:
total_devices *= worker_info.num_workers
if len(files) < total_devices:
# Repeat the files N times to match the number of devices
files = files * (total_devices // len(files) + 1)
# DDP
if is_initialized():
files = files[get_rank() :: get_world_size()]
# Split by worker
if worker_info is not None:
files = files[worker_info.id :: worker_info.num_workers]
return files
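# Worked example for split_by_rank_worker (illustrative numbers): with
# files = [f0..f7], 2 DDP ranks and 2 dataloader workers per rank,
# total_devices = 4. Rank 0 first keeps files[0::2] = [f0, f2, f4, f6]; its
# worker 1 then keeps [f2, f6], while rank 1 / worker 0 ends up with [f1, f5].
# If there are fewer files than devices, the list is repeated first so every
# worker still receives at least one file.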
class AutoTextSemanticInstructionIterableDataset(IterableDataset):
"""
Auto Augment Dataset by Speaker
1. Randomly concatenate multiple sentences from the same speaker to form a longer sentence
2. Automatically normalize the text
For interactive mode, we use the following format (multiple sequences):
<s> [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST] </s>
For non-interactive mode, we use the following format (one long sequence):
<s> [INST] text [/INST] ... </s>
"""
def __init__(
self,
proto_files: list[str],
seed: int = 42,
interactive_prob: float = 0.5,
max_length: int = 1024,
tokenizer: FishTokenizer = None,
use_speaker: bool | float = True,
causal: bool = True,
num_codebooks: Optional[int] = None,
skip_text_prob: float = 0.0,
):
"""
Args:
proto_files: proto buf files if using local data
seed: random seed
interactive_prob: probability to use interactive mode
max_length: max length of the text
tokenizer: tokenizer
use_speaker: include speaker information in the prompt
causal: use causal sampling when using local data; disabling it leads to random sampling
num_codebooks: number of codebooks, if None, it will be automatically detected
skip_text_prob: probability to skip the text (audio only), this only applies to interactive mode
"""
super().__init__()
assert 0 <= interactive_prob <= 1, "interactive_prob must be in [0, 1]"
self.seed = seed
self.max_length = max_length
self.tokenizer = tokenizer
self.interactive_prob = interactive_prob
self.use_speaker = use_speaker
self.proto_files = proto_files
self.causal = causal
self.num_codebooks = num_codebooks
self.skip_text_prob = skip_text_prob
self.groups = None
def __iter__(self):
while True:
yield self.augment()
def init_mock_data_server(self):
if self.groups is not None:
return
# Expand the proto files
expanded_proto_files = []
for filename in self.proto_files:
for i in braceexpand(filename):
i = Path(i)
if i.is_file():
expanded_proto_files.append(i)
elif i.is_dir():
expanded_proto_files.extend(i.rglob("*.proto"))
expanded_proto_files.extend(i.rglob("*.protos"))
else:
raise ValueError(f"{i} is not a file or directory")
expanded_proto_files = sorted(expanded_proto_files)
Random(self.seed).shuffle(expanded_proto_files)
self.groups = []
shard_proto_files = split_by_rank_worker(expanded_proto_files)
log.info(
f"Reading {len(shard_proto_files)} / {len(expanded_proto_files)} files"
)
count = 0
for filename in shard_proto_files:
with open(filename, "rb") as f:
for text_data in read_pb_stream(f):
self.groups.append(text_data)
count += 1
log.info(f"Read total {count} groups of data")
# Shuffle the lines
Random(self.seed).shuffle(self.groups)
self.group_weights = [len(i.sentences) for i in self.groups]
def sample_data(self):
if self.groups is None:
self.init_mock_data_server()
# Estimate that each sample is at least 20 tokens long
num_samples = self.max_length // 20
# Choose a group weighted by its number of sentences
group = random.choices(self.groups, weights=self.group_weights, k=1)[0]
if self.causal:
# Sample in order
if num_samples >= len(group.sentences):
samples = group.sentences
else:
begin = random.randint(0, len(group.sentences) - num_samples)
samples = group.sentences[begin : begin + num_samples]
else:
samples = random.choices(
group.sentences, k=min(num_samples, len(group.sentences))
)
return SampledData(
source=group.source,
name=group.name,
samples=samples,
)
def pack_sentences(
self,
sentences: list[str],
semantics: list,
# speaker: Optional[str] = None,
skip_text: bool = False,
):
# if speaker is None:
# speaker = "assistant"
messages = [
Message(
role="system",
parts=[TextPart(text="Speak out the provided text.")],
# add_im_end=False,
# cal_loss=True,
)
]
cated_sentences = " ".join(sentences)
if skip_text:
cated_sentences = "<|skip_text|>"
messages.append(
Message(
role="user",
parts=[TextPart(text=cated_sentences)],
# cal_loss=True,
)
)
vq_codes = [x.values for x in semantics[0]]
vq_codes_tensor = torch.tensor(vq_codes).to(torch.int32)
vqpart = VQPart(codes=vq_codes_tensor)
messages.append(
Message(
role="assistant",
parts=[TextPart(text="<|voice|>"), vqpart],
cal_loss=True,
)
)
num_codebooks = (
len(semantics[0]) if self.num_codebooks is None else self.num_codebooks
)
conversation = Conversation(messages=messages)
# conversation.visualize(tokenizer=self.tokenizer)
encoded = conversation.encode(
tokenizer=self.tokenizer,
)
tokens_raw = encoded.tokens
tokens = torch.zeros((num_codebooks + 1, len(tokens_raw)), dtype=torch.int)
tokens[0] = tokens_raw
vq_parts = encoded.vq_parts
vq_parts = [part.to(tokens.device) for part in vq_parts]
vq_parts = torch.cat(vq_parts, dim=1)
tokens[1:, encoded.vq_mask_tokens] = vq_parts
labels_raw = encoded.labels
labels = torch.full((num_codebooks + 1, len(labels_raw)), -100, dtype=torch.int)
labels[0, :] = labels_raw
labels[1:, encoded.vq_mask_labels] = vq_parts
labels[1:, -1:] = CODEBOOK_PAD_TOKEN_ID
tokens = tokens.long()
labels = labels.long()
# Verify the padding is correct, and the last token is eos
assert (tokens[1:, ~(encoded.vq_mask_tokens)] == CODEBOOK_PAD_TOKEN_ID).all()
assert (labels[1:, -1:] == CODEBOOK_PAD_TOKEN_ID).all()
return tokens, labels
def augment(self):
response = self.sample_data()
if len(response.samples) == 0:
# Invalid group
return None
samples = list(response.samples)
all_tokens, all_labels = [], []
while len(samples) > 0:
sentence = samples.pop(0)
text = clean_text(random.choice(sentence.texts))
tokens, labels = self.pack_sentences(
sentences=[text],
semantics=[sentence.semantics],
# speaker=response.name if use_speaker else None,
skip_text=random.random() < self.skip_text_prob,
)
all_tokens.append(tokens)
all_labels.append(labels)
tokens = torch.cat(all_tokens, dim=1)
labels = torch.cat(all_labels, dim=1)
# Verify that the length is correct
assert tokens.size(1) == labels.size(1), f"{tokens.size(1)} != {labels.size(1)}"
data = {"tokens": tokens, "labels": labels}
return data
class AutoTextSemanticInstructionDataset(Dataset):
"""
Auto Augment Dataset by Speaker
1. Randomly concatenate multiple sentences from the same speaker to form a longer sentence
2. Automatically normalize the text
For interactive mode, we use the following format (multiple sequences):
<s> [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST] </s>
For non-interactive mode, we use the following format (one long sequence):
<s> [INST] text [/INST] ... </s>
"""
def __init__(
self,
proto_files: list[str],
seed: int = 42,
interactive_prob: float = 0.5,
max_length: int = 1024,
tokenizer: FishTokenizer = None,
use_speaker: bool | float = True,
causal: bool = True,
num_codebooks: Optional[int] = None,
skip_text_prob: float = 0.0,
):
"""
Args:
proto_files: proto buf files if using local data
seed: random seed
interactive_prob: probability to use interactive mode
max_length: max length of the text
tokenizer: tokenizer
use_speaker: include speaker information in the prompt
causal: use causal sampling when using local data; disabling it leads to random sampling
num_codebooks: number of codebooks, if None, it will be automatically detected
skip_text_prob: probability to skip the text (audio only), this only applies to interactive mode
"""
super().__init__()
assert 0 <= interactive_prob <= 1, "interactive_prob must be in [0, 1]"
self.seed = seed
self.max_length = max_length
self.tokenizer = tokenizer
self.interactive_prob = interactive_prob
self.use_speaker = use_speaker
self.proto_files = proto_files
self.causal = causal
self.num_codebooks = num_codebooks
self.skip_text_prob = skip_text_prob
self.data = []
self._init_data()
def _init_data(self):
expanded_proto_files = []
for filename in self.proto_files:
for i in braceexpand(filename):
i = Path(i)
if i.is_file():
expanded_proto_files.append(i)
elif i.is_dir():
expanded_proto_files.extend(i.rglob("*.proto"))
expanded_proto_files.extend(i.rglob("*.protos"))
else:
raise ValueError(f"{i} is not a file or directory")
expanded_proto_files = sorted(expanded_proto_files)
Random(self.seed).shuffle(expanded_proto_files)
groups = []
shard_proto_files = split_by_rank_worker(expanded_proto_files)
log.info(
f"Reading {len(shard_proto_files)} / {len(expanded_proto_files)} files"
)
count = 0
for filename in shard_proto_files:
with open(filename, "rb") as f:
for text_data in read_pb_stream(f):
groups.append(text_data)
count += 1
log.info(f"Read total {count} groups of data")
for group in groups:
if len(group.sentences) == 0:
continue
samples = list(group.sentences)
for sentence in samples:
text = clean_text(random.choice(sentence.texts))
tokens, labels = self.pack_sentences(
sentences=[text],
semantics=[sentence.semantics],
skip_text=random.random() < self.skip_text_prob,
)
self.data.append({"tokens": tokens, "labels": labels})
random.Random(self.seed).shuffle(self.data)
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
return self.data[idx]
def pack_sentences(
self,
sentences: list[str],
semantics: list,
skip_text: bool = False,
):
messages = [
Message(
role="system",
parts=[TextPart(text="Speak out the provided text.")],
)
]
cated_sentences = " ".join(sentences)
if skip_text:
cated_sentences = "<|skip_text|>"
messages.append(
Message(
role="user",
parts=[TextPart(text=cated_sentences)],
)
)
vq_codes = [x.values for x in semantics[0]]
vq_codes_tensor = torch.tensor(vq_codes).to(torch.int32)
vqpart = VQPart(codes=vq_codes_tensor)
messages.append(
Message(
role="assistant",
parts=[TextPart(text="<|voice|>"), vqpart],
cal_loss=True,
)
)
num_codebooks = (
len(semantics[0]) if self.num_codebooks is None else self.num_codebooks
)
conversation = Conversation(messages=messages)
encoded = conversation.encode(
tokenizer=self.tokenizer,
)
tokens_raw = encoded.tokens
tokens = torch.zeros((num_codebooks + 1, len(tokens_raw)), dtype=torch.int)
tokens[0] = tokens_raw
vq_parts = encoded.vq_parts
vq_parts = [part.to(tokens.device) for part in vq_parts]
vq_parts = torch.cat(vq_parts, dim=1)
tokens[1:, encoded.vq_mask_tokens] = vq_parts
labels_raw = encoded.labels
labels = torch.full((num_codebooks + 1, len(labels_raw)), -100, dtype=torch.int)
labels[0, :] = labels_raw
labels[1:, encoded.vq_mask_labels] = vq_parts
labels[1:, -1:] = CODEBOOK_PAD_TOKEN_ID
tokens = tokens.long()
labels = labels.long()
assert (tokens[1:, ~(encoded.vq_mask_tokens)] == CODEBOOK_PAD_TOKEN_ID).all()
assert (labels[1:, -1:] == CODEBOOK_PAD_TOKEN_ID).all()
return tokens, labels
class InterleaveDataset(IterableDataset):
def __init__(
self,
datasets: list[IterableDataset],
probabilities: list[float],
seed: int = 42,
):
super().__init__()
self.datasets = datasets
self.probabilities = probabilities
self.seed = seed
def __iter__(self):
rng = np.random.default_rng(self.seed)
dataset_iterators = [iter(dataset) for dataset in self.datasets]
while True:
# Random choice one
dataset_idx = rng.choice(len(self.datasets), p=self.probabilities)
dataset_iterator = dataset_iterators[dataset_idx]
try:
yield next(dataset_iterator)
except StopIteration:
# Exhausted, create a new iterator
dataset_iterators[dataset_idx] = iter(self.datasets[dataset_idx])
yield next(dataset_iterators[dataset_idx])
@dataclass
class TextDataCollator:
tokenizer: FishTokenizer
max_length: int = 1024
def __call__(self, examples):
if "negative_tokens" in examples:
positive_examples = []
negative_examples = []
for i in examples:
positive_examples.append(
{
"tokens": i["tokens"],
"labels": i["labels"],
}
)
negative_examples.append(
{
"tokens": i["negative_tokens"],
"labels": i["negative_labels"],
}
)
examples = positive_examples + negative_examples
return self.batchify(examples)
def batchify(self, examples, tokens_key="tokens", labels_key="labels"):
tokens, attention_masks, labels = [], [], []
# Calculate the max length
max_tokens_length = 0
for example in examples:
max_tokens_length = max(max_tokens_length, example[tokens_key].size(1))
max_tokens_length = min(max_tokens_length, self.max_length)
for example in examples:
_tokens = example[tokens_key][:, :max_tokens_length]
_labels = example[labels_key][:, :max_tokens_length]
_attention_mask = torch.ones((max_tokens_length,), dtype=torch.bool)
tokens_length = _tokens.size(1)
_attention_mask[:tokens_length] = False
assert tokens_length == _labels.size(
1
), f"{tokens_length} != {_labels.size(1)}"
if tokens_length < max_tokens_length:
_tokens = F.pad(
_tokens,
(0, max_tokens_length - tokens_length),
value=self.tokenizer.get_token_id("<|end_of_text|>"),
)
_tokens[1:, tokens_length:] = CODEBOOK_PAD_TOKEN_ID
_labels = F.pad(
_labels, (0, max_tokens_length - _labels.size(1)), value=-100
)
tokens.append(_tokens)
attention_masks.append(_attention_mask)
labels.append(_labels)
tokens = torch.stack(tokens, dim=0)
attention_masks = torch.stack(attention_masks, dim=0)
labels = torch.stack(labels, dim=0)
return {
"inputs": tokens,
"attention_masks": attention_masks,
"labels": labels,
}
class SemanticDataModule(LightningDataModule):
def __init__(
self,
train_dataset: Union[
AutoTextSemanticInstructionDataset,
AutoTextSemanticInstructionIterableDataset,
InterleaveDataset,
],
val_dataset: Union[
AutoTextSemanticInstructionDataset,
AutoTextSemanticInstructionIterableDataset,
InterleaveDataset,
],
batch_size: int = 32,
tokenizer: FishTokenizer = None,
max_length: int = 1024,
num_workers: int = 4,
):
super().__init__()
self.train_dataset = train_dataset
self.val_dataset = val_dataset
self.batch_size = batch_size
self.tokenizer = tokenizer
self.max_length = max_length
self.num_workers = num_workers
def train_dataloader(self):
return DataLoader(
self.train_dataset,
batch_size=self.batch_size,
collate_fn=TextDataCollator(self.tokenizer, self.max_length),
num_workers=self.num_workers,
persistent_workers=True,
)
def val_dataloader(self):
return DataLoader(
self.val_dataset,
batch_size=self.batch_size,
collate_fn=TextDataCollator(self.tokenizer, self.max_length),
num_workers=self.num_workers,
persistent_workers=True,
)
if __name__ == "__main__":
from tqdm import tqdm
ds = AutoTextSemanticInstructionDataset(
["data/protos"],
tokenizer=FishTokenizer("checkpoints/fish-speech-1.5/tokenizer.tiktoken"),
use_speaker=False,
interactive_prob=1.0,
skip_text_prob=0.5,
)
for i in range(100):
# Uncomment the conversation.visualize(...) call in pack_sentences to inspect the tokenized message
print(ds[i])
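The `__main__` block above only indexes the map-style dataset. Below is a hedged sketch of wiring the iterable dataset, the collator, and a `DataLoader` together; it assumes the proto files and tokenizer checkpoint referenced above actually exist.

```python
# Hedged sketch: batch the iterable dataset with TextDataCollator.
from torch.utils.data import DataLoader

tokenizer = FishTokenizer("checkpoints/fish-speech-1.5/tokenizer.tiktoken")
train_ds = AutoTextSemanticInstructionIterableDataset(
    ["data/protos"], tokenizer=tokenizer, max_length=1024
)
loader = DataLoader(
    train_ds,
    batch_size=2,
    collate_fn=TextDataCollator(tokenizer, max_length=1024),
    num_workers=0,
)
batch = next(iter(loader))
print(batch["inputs"].shape)  # (2, num_codebooks + 1, T)
print(batch["labels"].shape)
```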
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
import librosa
import numpy as np
import torch
from lightning import LightningDataModule
from torch.utils.data import DataLoader, Dataset
from fish_speech.utils import RankedLogger
logger = RankedLogger(__name__, rank_zero_only=False)
class VQGANDataset(Dataset):
def __init__(
self,
filelist: str,
sample_rate: int = 32000,
hop_length: int = 640,
slice_frames: Optional[int] = None,
):
super().__init__()
filelist = Path(filelist)
root = filelist.parent
self.files = [
root / line.strip()
for line in filelist.read_text(encoding="utf-8").splitlines()
if line.strip()
]
self.sample_rate = sample_rate
self.hop_length = hop_length
self.slice_frames = slice_frames
def __len__(self):
return len(self.files)
def get_item(self, idx):
file = self.files[idx]
audio, _ = librosa.load(file, sr=self.sample_rate, mono=True)
# Randomly slice the audio if it is longer than slice_frames frames
if (
self.slice_frames is not None
and audio.shape[0] > self.slice_frames * self.hop_length
):
start = np.random.randint(
0, audio.shape[0] - self.slice_frames * self.hop_length
)
audio = audio[start : start + self.slice_frames * self.hop_length]
if len(audio) == 0:
return None
max_value = np.abs(audio).max()
if max_value > 1.0:
audio = audio / max_value
return {
"audio": torch.from_numpy(audio),
}
def __getitem__(self, idx):
try:
return self.get_item(idx)
except Exception as e:
import traceback
traceback.print_exc()
logger.error(f"Error loading {self.files[idx]}: {e}")
return None
@dataclass
class VQGANCollator:
def __call__(self, batch):
batch = [x for x in batch if x is not None]
audio_lengths = torch.tensor([len(x["audio"]) for x in batch])
audio_maxlen = audio_lengths.max()
# Pad every clip to the longest audio in the batch
audios = []
for x in batch:
audios.append(
torch.nn.functional.pad(x["audio"], (0, audio_maxlen - len(x["audio"])))
)
return {
"audios": torch.stack(audios),
"audio_lengths": audio_lengths,
}
class VQGANDataModule(LightningDataModule):
def __init__(
self,
train_dataset: VQGANDataset,
val_dataset: VQGANDataset,
batch_size: int = 32,
num_workers: int = 4,
val_batch_size: Optional[int] = None,
):
super().__init__()
self.train_dataset = train_dataset
self.val_dataset = val_dataset
self.batch_size = batch_size
self.val_batch_size = val_batch_size or batch_size
self.num_workers = num_workers
def train_dataloader(self):
return DataLoader(
self.train_dataset,
batch_size=self.batch_size,
collate_fn=VQGANCollator(),
num_workers=self.num_workers,
shuffle=True,
persistent_workers=True,
)
def val_dataloader(self):
return DataLoader(
self.val_dataset,
batch_size=self.val_batch_size,
collate_fn=VQGANCollator(),
num_workers=self.num_workers,
persistent_workers=True,
)
if __name__ == "__main__":
dataset = VQGANDataset("data/LibriTTS_R/vq_train_filelist.txt")
dataloader = DataLoader(
dataset, batch_size=4, shuffle=False, collate_fn=VQGANCollator()
)
for batch in dataloader:
print(batch["audios"].shape)
print(batch["features"].shape)
print(batch["audio_lengths"])
print(batch["feature_lengths"])
break
## i18n Folder Attribution
The `i18n` folder within the `fish_speech` directory contains files initially sourced from the RVC project. In compliance with the MIT license under which these files were released, we acknowledge the original authors and sources below:
### fish_speech/i18n/core.py
**Related code from RVC:**
[https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py)
**Initial commit:**
add localization(添加本地化) [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#35](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/35)
**Initial author:**
[@L4Ph](https://github.com/L4Ph)
### fish_speech/i18n/scan.py
**Related code from RVC:**
[https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py)
**Initial commit:**
File for detecting i18n missing keys [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#1058](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/1058)
**Initial author:**
[@towzeur](https://github.com/towzeur)
We appreciate the contributions of the RVC project and its authors.
from .core import i18n
__all__ = ["i18n"]
import json
import locale
from pathlib import Path
I18N_FILE_PATH = Path(__file__).parent / "locale"
DEFAULT_LANGUAGE = "en_US"
def load_language_list(language):
with open(I18N_FILE_PATH / f"{language}.json", "r", encoding="utf-8") as f:
language_list = json.load(f)
return language_list
class I18nAuto:
def __init__(self):
i18n_file = Path(".locale")
if i18n_file.exists():
with open(i18n_file, "r", encoding="utf-8") as f:
language = f.read().strip()
else:
# locale.getlocale() may return (None, None) and fail to identify the system
# language, so fall back to locale.getdefaultlocale() instead.
language = locale.getdefaultlocale()[0]
if not (I18N_FILE_PATH / f"{language}.json").exists():
language = DEFAULT_LANGUAGE
self.language = language
self.language_map = load_language_list(language)
def __call__(self, key):
return self.language_map.get(key, key)
def __repr__(self):
return "Use Language: " + self.language
i18n = I18nAuto()
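A short usage sketch for the helper above; the lookup falls back to the key itself when a translation is missing.

```python
# Hedged example: translate UI strings through the module-level i18n instance.
from fish_speech.i18n import i18n

print(i18n("Start Training"))       # localized value from the chosen locale file
print(i18n("a key with no entry"))  # returned unchanged as a fallback
```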
{
"16-mixed is recommended for 10+ series GPU": "16-mixed is recommended for 10+ series GPU",
"5 to 10 seconds of reference audio, useful for specifying speaker.": "5 to 10 seconds of reference audio, useful for specifying speaker.",
"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).",
"Accumulate Gradient Batches": "Accumulate Gradient Batches",
"Add to Processing Area": "Add to Processing Area",
"Added path successfully!": "Added path successfully!",
"Advanced Config": "Advanced Config",
"Base LLAMA Model": "Base LLAMA Model",
"Batch Inference": "Batch Inference",
"Batch Size": "Batch Size",
"Changing with the Model Path": "Changing with the Model Path",
"Chinese": "Chinese",
"Compile Model": "Compile Model",
"Compile the model can significantly reduce the inference time, but will increase cold start time": "Compile the model can significantly reduce the inference time, but will increase cold start time",
"Copy": "Copy",
"Data Preprocessing": "Data Preprocessing",
"Data Preprocessing Path": "Data Preprocessing Path",
"Data Source": "Data Source",
"Decoder Model Config": "Decoder Model Config",
"Decoder Model Path": "Decoder Model Path",
"Disabled": "Disabled",
"Enable Reference Audio": "Enable Reference Audio",
"English": "English",
"Error Message": "Error Message",
"File Preprocessing": "File Preprocessing",
"Generate": "Generate",
"Generated Audio": "Generated Audio",
"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format",
"Infer interface is closed": "Infer interface is closed",
"Inference Configuration": "Inference Configuration",
"Inference Server Configuration": "Inference Server Configuration",
"Inference Server Error": "Inference Server Error",
"Inferring interface is launched at {}": "Inferring interface is launched at {}",
"Initial Learning Rate": "Initial Learning Rate",
"Input Audio & Source Path for Transcription": "Input Audio & Source Path for Transcription",
"Input Text": "Input Text",
"Invalid path: {}": "Invalid path: {}",
"It is recommended to use CUDA, if you have low configuration, use CPU": "It is recommended to use CUDA, if you have low configuration, use CPU",
"Iterative Prompt Length, 0 means off": "Iterative Prompt Length, 0 means off",
"Japanese": "Japanese",
"LLAMA Configuration": "LLAMA Configuration",
"LLAMA Model Config": "LLAMA Model Config",
"LLAMA Model Path": "LLAMA Model Path",
"Labeling Device": "Labeling Device",
"LoRA Model to be merged": "LoRA Model to be merged",
"Maximum Audio Duration": "Maximum Audio Duration",
"Maximum Length per Sample": "Maximum Length per Sample",
"Maximum Training Steps": "Maximum Training Steps",
"Maximum tokens per batch, 0 means no limit": "Maximum tokens per batch, 0 means no limit",
"Merge": "Merge",
"Merge LoRA": "Merge LoRA",
"Merge successfully": "Merge successfully",
"Minimum Audio Duration": "Minimum Audio Duration",
"Model Output Path": "Model Output Path",
"Model Size": "Model Size",
"Move": "Move",
"Move files successfully": "Move files successfully",
"No audio generated, please check the input text.": "No audio generated, please check the input text.",
"No selected options": "No selected options",
"Number of Workers": "Number of Workers",
"Open Inference Server": "Open Inference Server",
"Open Labeler WebUI": "Open Labeler WebUI",
"Open Tensorboard": "Open Tensorboard",
"Opened labeler in browser": "Opened labeler in browser",
"Optional Label Language": "Optional Label Language",
"Optional online ver": "Optional online ver",
"Output Path": "Output Path",
"Path error, please check the model file exists in the corresponding path": "Path error, please check the model file exists in the corresponding path",
"Precision": "Precision",
"Probability of applying Speaker Condition": "Probability of applying Speaker Condition",
"Put your text here.": "Put your text here.",
"Reference Audio": "Reference Audio",
"Reference Text": "Reference Text",
"Related code and weights are released under CC BY-NC-SA 4.0 License.": "Related code and weights are released under CC BY-NC-SA 4.0 License.",
"Remove Selected Data": "Remove Selected Data",
"Removed path successfully!": "Removed path successfully!",
"Repetition Penalty": "Repetition Penalty",
"Save model every n steps": "Save model every n steps",
"Select LLAMA ckpt": "Select LLAMA ckpt",
"Select VITS ckpt": "Select VITS ckpt",
"Select VQGAN ckpt": "Select VQGAN ckpt",
"Select source file processing method": "Select source file processing method",
"Select the model to be trained (Depending on the Tab page you are on)": "Select the model to be trained (Depending on the Tab page you are on)",
"Selected: {}": "Selected: {}",
"Speaker": "Speaker",
"Speaker is identified by the folder name": "Speaker is identified by the folder name",
"Start Training": "Start Training",
"Streaming Audio": "Streaming Audio",
"Streaming Generate": "Streaming Generate",
"Tensorboard Host": "Tensorboard Host",
"Tensorboard Log Path": "Tensorboard Log Path",
"Tensorboard Port": "Tensorboard Port",
"Tensorboard interface is closed": "Tensorboard interface is closed",
"Tensorboard interface is launched at {}": "Tensorboard interface is launched at {}",
"Text is too long, please keep it under {} characters.": "Text is too long, please keep it under {} characters.",
"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.",
"Training Configuration": "Training Configuration",
"Training Error": "Training Error",
"Training stopped": "Training stopped",
"Type name of the speaker": "Type name of the speaker",
"Type the path or select from the dropdown": "Type the path or select from the dropdown",
"Use LoRA": "Use LoRA",
"Use LoRA can save GPU memory, but may reduce the quality of the model": "Use LoRA can save GPU memory, but may reduce the quality of the model",
"Use filelist": "Use filelist",
"Use large for 10G+ GPU, medium for 5G, small for 2G": "Use large for 10G+ GPU, medium for 5G, small for 2G",
"VITS Configuration": "VITS Configuration",
"VQGAN Configuration": "VQGAN Configuration",
"Validation Batch Size": "Validation Batch Size",
"View the status of the preprocessing folder (use the slider to control the depth of the tree)": "View the status of the preprocessing folder (use the slider to control the depth of the tree)",
"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.",
"WebUI Host": "WebUI Host",
"WebUI Port": "WebUI Port",
"Whisper Model": "Whisper Model",
"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).",
"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU",
"latest": "latest",
"new": "new",
"Realtime Transform Text": "Realtime Transform Text",
"Normalization Result Preview (Currently Only Chinese)": "Normalization Result Preview (Currently Only Chinese)",
"Text Normalization": "Text Normalization",
"Select Example Audio": "Select Example Audio"
}
{
"16-mixed is recommended for 10+ series GPU": "se recomienda 16-mixed para GPU de la serie 10+",
"5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de audio de referencia, útil para especificar el hablante.",
"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Un modelo de texto a voz basado en VQ-GAN y Llama desarrollado por [Fish Audio](https://fish.audio).",
"Accumulate Gradient Batches": "Acumular lotes de gradientes",
"Add to Processing Area": "Agregar al Área de Procesamiento",
"Added path successfully!": "¡Ruta agregada exitosamente!",
"Advanced Config": "Configuración Avanzada",
"Base LLAMA Model": "Modelo Base LLAMA",
"Batch Inference": "Inferencia por Lote",
"Batch Size": "Tamaño del Lote",
"Changing with the Model Path": "Cambiando con la Ruta del Modelo",
"Chinese": "Chino",
"Compile Model": "Compilar Modelo",
"Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar el modelo puede reducir significativamente el tiempo de inferencia, pero aumentará el tiempo de inicio en frío",
"Copy": "Copiar",
"Data Preprocessing": "Preprocesamiento de Datos",
"Data Preprocessing Path": "Ruta de Preprocesamiento de Datos",
"Data Source": "Fuente de Datos",
"Decoder Model Config": "Configuración del modelo decodificador",
"Decoder Model Path": "Ruta del modelo decodificador",
"Disabled": "Desactivado",
"Enable Reference Audio": "Habilitar Audio de Referencia",
"English": "Inglés",
"Error Message": "Mensaje de Error",
"File Preprocessing": "Preprocesamiento de Archivos",
"Generate": "Generar",
"Generated Audio": "Audio Generado",
"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "Si no hay texto correspondiente para el audio, aplique ASR para asistencia, soporte para formato .txt o .lab",
"Infer interface is closed": "La interfaz de inferencia está cerrada",
"Inference Configuration": "Configuración de Inferencia",
"Inference Server Configuration": "Configuración del Servidor de Inferencia",
"Inference Server Error": "Error del Servidor de Inferencia",
"Inferring interface is launched at {}": "La interfaz de inferencia se ha lanzado en {}",
"Initial Learning Rate": "Tasa de Aprendizaje Inicial",
"Input Audio & Source Path for Transcription": "Audio de Entrada y Ruta de Origen para Transcripción",
"Input Text": "Texto de Entrada",
"Invalid path: {}": "Ruta inválida: {}",
"It is recommended to use CUDA, if you have low configuration, use CPU": "Se recomienda usar CUDA, si tiene una configuración baja, use CPU",
"Iterative Prompt Length, 0 means off": "Longitud de la Indicación Iterativa, 0 significa apagado",
"Japanese": "Japonés",
"LLAMA Configuration": "Configuración de LLAMA",
"LLAMA Model Config": "Configuración del Modelo LLAMA",
"LLAMA Model Path": "Ruta del Modelo LLAMA",
"Labeling Device": "Dispositivo de Etiquetado",
"LoRA Model to be merged": "Modelo LoRA a fusionar",
"Maximum Audio Duration": "Duración máxima de audio",
"Maximum Length per Sample": "Longitud Máxima por Muestra",
"Maximum Training Steps": "Pasos Máximos de Entrenamiento",
"Maximum tokens per batch, 0 means no limit": "Máximo de tokens por lote, 0 significa sin límite",
"Merge": "Fusionar",
"Merge LoRA": "Fusionar LoRA",
"Merge successfully": "Fusionado exitosamente",
"Minimum Audio Duration": "Duración mínima de audio",
"Model Output Path": "Ruta de Salida del Modelo",
"Model Size": "Tamaño del Modelo",
"Move": "Mover",
"Move files successfully": "Archivos movidos exitosamente",
"No audio generated, please check the input text.": "No se generó audio, por favor verifique el texto de entrada.",
"No selected options": "No hay opciones seleccionadas",
"Number of Workers": "Número de Trabajadores",
"Open Inference Server": "Abrir Servidor de Inferencia",
"Open Labeler WebUI": "Abrir Interfaz Web del Etiquetador",
"Open Tensorboard": "Abrir Tensorboard",
"Opened labeler in browser": "Se abrió el etiquetador en el navegador",
"Optional Label Language": "Idioma de Etiquetado Opcional",
"Optional online ver": "Ver en línea opcional",
"Output Path": "Ruta de Salida",
"Path error, please check the model file exists in the corresponding path": "Error de ruta, por favor verifique que el archivo del modelo exista en la ruta correspondiente",
"Precision": "Precisión",
"Probability of applying Speaker Condition": "Probabilidad de aplicar Condición de Hablante",
"Put your text here.": "Ponga su texto aquí.",
"Reference Audio": "Audio de Referencia",
"Reference Text": "Texto de Referencia",
"Related code and weights are released under CC BY-NC-SA 4.0 License.": "El código relacionado y los pesos se publican bajo la Licencia CC BY-NC-SA 4.0.",
"Remove Selected Data": "Eliminar Datos Seleccionados",
"Removed path successfully!": "¡Ruta eliminada exitosamente!",
"Repetition Penalty": "Penalización por Repetición",
"Save model every n steps": "Guardar modelo cada n pasos",
"Select LLAMA ckpt": "Seleccionar punto de control LLAMA",
"Select VITS ckpt": "Seleccionar punto de control VITS",
"Select VQGAN ckpt": "Seleccionar punto de control VQGAN",
"Select source file processing method": "Seleccione el método de procesamiento de archivos fuente",
"Select the model to be trained (Depending on the Tab page you are on)": "Seleccione el modelo a entrenar (Dependiendo de la pestaña en la que se encuentre)",
"Selected: {}": "Seleccionado: {}",
"Speaker": "Hablante",
"Speaker is identified by the folder name": "El hablante se identifica por el nombre de la carpeta",
"Start Training": "Iniciar Entrenamiento",
"Streaming Audio": "transmisión de audio",
"Streaming Generate": "síntesis en flujo",
"Tensorboard Host": "Host de Tensorboard",
"Tensorboard Log Path": "Ruta de Registro de Tensorboard",
"Tensorboard Port": "Puerto de Tensorboard",
"Tensorboard interface is closed": "La interfaz de Tensorboard está cerrada",
"Tensorboard interface is launched at {}": "La interfaz de Tensorboard se ha lanzado en {}",
"Text is too long, please keep it under {} characters.": "El texto es demasiado largo, por favor manténgalo por debajo de {} caracteres.",
"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "La ruta de la carpeta de entrada a la izquierda o la lista de archivos. Ya sea que esté marcado o no, se utilizará para el entrenamiento posterior en esta lista.",
"Training Configuration": "Configuración de Entrenamiento",
"Training Error": "Error de Entrenamiento",
"Training stopped": "Entrenamiento detenido",
"Type name of the speaker": "Escriba el nombre del hablante",
"Type the path or select from the dropdown": "Escriba la ruta o seleccione de la lista desplegable",
"Use LoRA": "Usar LoRA",
"Use LoRA can save GPU memory, but may reduce the quality of the model": "Usar LoRA puede ahorrar memoria GPU, pero puede reducir la calidad del modelo",
"Use filelist": "Usar lista de archivos",
"Use large for 10G+ GPU, medium for 5G, small for 2G": "Use grande para GPU de 10G+, mediano para 5G, pequeño para 2G",
"VITS Configuration": "Configuración de VITS",
"VQGAN Configuration": "Configuración de VQGAN",
"Validation Batch Size": "Tamaño del Lote de Validación",
"View the status of the preprocessing folder (use the slider to control the depth of the tree)": "Vea el estado de la carpeta de preprocesamiento (use el control deslizante para controlar la profundidad del árbol)",
"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "No somos responsables de ningún mal uso del modelo, por favor considere sus leyes y regulaciones locales antes de usarlo.",
"WebUI Host": "Host de WebUI",
"WebUI Port": "Puerto de WebUI",
"Whisper Model": "Modelo Whisper",
"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Puede encontrar el código fuente [aquí](https://github.com/fishaudio/fish-speech) y los modelos [aquí](https://huggingface.co/fishaudio/fish-speech-1).",
"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "Se recomienda bf16-true para GPU de la serie 30+, se recomienda 16-mixed para GPU de la serie 10+",
"latest": "más reciente",
"new": "nuevo",
"Realtime Transform Text": "Transformación de Texto en Tiempo Real",
"Normalization Result Preview (Currently Only Chinese)": "Vista Previa del Resultado de Normalización (Actualmente Solo Chino)",
"Text Normalization": "Normalización de Texto",
"Select Example Audio": "Selecionar áudio de exemplo"
}
{
"16-mixed is recommended for 10+ series GPU": "10シリーズ以降のGPUには16-mixedをお勧めします",
"5 to 10 seconds of reference audio, useful for specifying speaker.": "話者を指定するのに役立つ、5~10秒のリファレンスオーディオ。",
"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "[Fish Audio](https://fish.audio)が開発したVQ-GANとLlamaに基づくテキスト音声合成モデル。",
"Accumulate Gradient Batches": "勾配バッチの累積",
"Add to Processing Area": "処理エリアに追加",
"Added path successfully!": "パスの追加に成功しました!",
"Advanced Config": "詳細設定",
"Base LLAMA Model": "基本LLAMAモデル",
"Batch Inference": "バッチ推論",
"Batch Size": "バッチサイズ",
"Changing with the Model Path": "モデルのパスに伴って変化する",
"Chinese": "中国語",
"Compile Model": "モデルのコンパイル",
"Compile the model can significantly reduce the inference time, but will increase cold start time": "モデルをコンパイルすると推論時間を大幅に短縮できますが、コールドスタート時間が長くなります",
"Copy": "コピー",
"Data Preprocessing": "データ前処理",
"Data Preprocessing Path": "データ前処理パス",
"Data Source": "データソース",
"Decoder Model Config": "デコーダーモデルの構成",
"Decoder Model Path": "デコーダーモデルのパス",
"Disabled": "無効",
"Enable Reference Audio": "リファレンスオーディオを有効にする",
"English": "英語",
"Error Message": "エラーメッセージ",
"File Preprocessing": "文書前处理",
"Generate": "生成",
"Generated Audio": "生成されたオーディオ",
"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "音声に対応するテキストがない場合は、ASRを適用してサポートします。.txtまたは.lab形式をサポートしています",
"Infer interface is closed": "推論インターフェースが閉じられています",
"Inference Configuration": "推論設定",
"Inference Server Configuration": "推論サーバー設定",
"Inference Server Error": "推論サーバーエラー",
"Inferring interface is launched at {}": "推論インターフェースが{}で起動しました",
"Initial Learning Rate": "初期学習率",
"Input Audio & Source Path for Transcription": "入力オーディオと文字起こしのソースパス",
"Input Text": "入力テキスト",
"Invalid path: {}": "無効なパス: {}",
"It is recommended to use CUDA, if you have low configuration, use CPU": "CUDAの使用をお勧めします。低い構成の場合はCPUを使用してください",
"Iterative Prompt Length, 0 means off": "反復プロンプト長。0はオフを意味します",
"Japanese": "日本語",
"LLAMA Configuration": "LLAMA設定",
"LLAMA Model Config": "LLAMAモデル設定",
"LLAMA Model Path": "LLAMAモデルパス",
"Labeling Device": "ラベリングデバイス",
"LoRA Model to be merged": "マージするLoRAモデル",
"Maximum Audio Duration": "最大オーディオの長さ",
"Maximum Length per Sample": "サンプルあたりの最大長",
"Maximum Training Steps": "最大トレーニングステップ数",
"Maximum tokens per batch, 0 means no limit": "バッチあたりの最大トークン数。0は制限なしを意味します",
"Merge": "マージ",
"Merge LoRA": "LoRAのマージ",
"Merge successfully": "マージに成功しました",
"Minimum Audio Duration": "最小オーディオの長さ",
"Model Output Path": "モデル出力パス",
"Model Size": "モデルサイズ",
"Move": "移動",
"Move files successfully": "ファイルの移動に成功しました",
"No audio generated, please check the input text.": "オーディオが生成されていません。入力テキストを確認してください。",
"No selected options": "選択されたオプションはありません",
"Number of Workers": "ワーカー数",
"Open Inference Server": "推論サーバーを開く",
"Open Labeler WebUI": "ラベラーWebUIを開く",
"Open Tensorboard": "Tensorboardを開く",
"Opened labeler in browser": "ブラウザでラベラーを開きました",
"Optional Label Language": "オプションのラベル言語",
"Optional online ver": "オプションのオンラインバージョン",
"Output Path": "出力パス",
"Path error, please check the model file exists in the corresponding path": "パスエラー。対応するパスにモデルファイルが存在するか確認してください",
"Precision": "精度",
"Probability of applying Speaker Condition": "話者条件を適用する確率",
"Put your text here.": "ここにテキストを入力してください。",
"Reference Audio": "リファレンスオーディオ",
"Reference Text": "リファレンステキスト",
"Related code and weights are released under CC BY-NC-SA 4.0 License.": "関連コードと重みはCC BY-NC-SA 4.0ライセンスの下でリリースされます。",
"Remove Selected Data": "選択したデータを削除",
"Removed path successfully!": "パスの削除に成功しました!",
"Repetition Penalty": "反復ペナルティ",
"Save model every n steps": "nステップごとにモデルを保存",
"Select LLAMA ckpt": " LLAMA チェックポイントを選択",
"Select VITS ckpt": "VITS チェックポイントを選択",
"Select VQGAN ckpt": "VQGAN チェックポイントを選択",
"Select source file processing method": "ソースファイルの処理方法を選択",
"Select the model to be trained (Depending on the Tab page you are on)": "タブページに応じてトレーニングするモデルを選択してください",
"Selected: {}": "選択済み: {}",
"Speaker": "話者",
"Speaker is identified by the folder name": "話者はフォルダ名で識別されます",
"Start Training": "トレーニング開始",
"Streaming Audio": "ストリーミングオーディオ",
"Streaming Generate": "ストリーミング合成",
"Tensorboard Host": "Tensorboardホスト",
"Tensorboard Log Path": "Tensorboardログパス",
"Tensorboard Port": "Tensorboardポート",
"Tensorboard interface is closed": "Tensorboardインターフェースが閉じられています",
"Tensorboard interface is launched at {}": "Tensorboardインターフェースが{}で起動されました",
"Text is too long, please keep it under {} characters.": "テキストが長すぎます。{}文字以内に抑えてください。",
"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "左側の入力フォルダまたはファイルリストのパス。チェックの有無にかかわらず、このリストの後続のトレーニングに使用されます。",
"Training Configuration": "トレーニング設定",
"Training Error": "トレーニングエラー",
"Training stopped": "トレーニングが停止しました",
"Type name of the speaker": "話者の名前を入力",
"Type the path or select from the dropdown": "パスを入力するか、ドロップダウンから選択してください",
"Use LoRA": "LoRAを使用",
"Use LoRA can save GPU memory, but may reduce the quality of the model": "LoRAを使用するとGPUメモリを節約できますが、モデルの品質が低下する可能性があります",
"Use filelist": "ファイルリストを使用",
"Use large for 10G+ GPU, medium for 5G, small for 2G": "10G以上のGPUには大、5Gには中、2Gには小を使用してください",
"VITS Configuration": "VITS の構成",
"VQGAN Configuration": "VQGAN の構成",
"Validation Batch Size": "検証バッチサイズ",
"View the status of the preprocessing folder (use the slider to control the depth of the tree)": "前処理フォルダの状態を表示(スライダーを使用してツリーの深さを制御)",
"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "モデルの誤用については一切責任を負いません。使用する前に、現地の法律と規制を考慮してください。",
"WebUI Host": "WebUIホスト",
"WebUI Port": "WebUIポート",
"Whisper Model": "Whisperモデル",
"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "ソースコードは[こちら](https://github.com/fishaudio/fish-speech)、モデルは[こちら](https://huggingface.co/fishaudio/fish-speech-1)にあります。",
"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30シリーズ以降のGPUにはbf16-trueを、10シリーズ以降のGPUには16-mixedをお勧めします",
"latest": "最新",
"new": "新規",
"Realtime Transform Text": "リアルタイム変換テキスト",
"Normalization Result Preview (Currently Only Chinese)": "正規化結果プレビュー(現在は中国語のみ)",
"Text Normalization": "テキスト正規化",
"Select Example Audio": "サンプル音声を選択"
}
{
"16-mixed is recommended for 10+ series GPU": "10+ 시리즈 GPU에는 16-mixed를 권장합니다.",
"5 to 10 seconds of reference audio, useful for specifying speaker.": "화자를 특정하는 데 유의미한 5~10초의 길이의 참조 오디오 데이터.",
"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "[Fish Audio](https://fish.audio)에서 개발한 VQ-GAN 및 Llama 기반의 텍스트 음성 변환 모델.",
"Accumulate Gradient Batches": "그라디언트 배치 누적",
"Add to Processing Area": "처리 영역에 추가",
"Added path successfully!": "경로가 성공적으로 추가되었습니다!",
"Advanced Config": "고급 설정",
"Base LLAMA Model": "기본 LLAMA 모델",
"Batch Inference": "배치 추론",
"Batch Size": "배치 크기",
"Changing with the Model Path": "모델 경로에 따라 변경 중",
"Chinese": "중국어",
"Compile Model": "모델 컴파일",
"Compile the model can significantly reduce the inference time, but will increase cold start time": "모델을 컴파일하면 추론 시간이 크게 줄어들지만, 초기 시작 시간이 길어집니다.",
"Copy": "복사",
"Data Preprocessing": "데이터 전처리",
"Data Preprocessing Path": "데이터 전처리 경로",
"Data Source": "데이터 소스",
"Decoder Model Config": "디코더 모델 설정",
"Decoder Model Path": "디코더 모델 경로",
"Disabled": "비활성화 됨",
"Enable Reference Audio": "참고 음성 활성화",
"English": "영어",
"Error Message": "오류 메시지",
"File Preprocessing": "파일 전처리",
"Generate": "생성",
"Generated Audio": "생성된 오디오",
"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "오디오애 대응하는 텍스트가 없을 경우, ASR을 적용해 지원하며, .txt 또는 .lab 형식을 지원합니다.",
"Infer interface is closed": "추론 인터페이스가 닫혔습니다.",
"Inference Configuration": "추론 설정",
"Inference Server Configuration": "추론 서버 설정",
"Inference Server Error": "추론 서버 오류",
"Inferring interface is launched at {}": "추론 인터페이스가 {}에서 시작되었습니다.",
"Initial Learning Rate": "초기 학습률",
"Input Audio & Source Path for Transcription": "전사할 입력 오디오 및 소스 경로",
"Input Text": "입력 텍스트",
"Invalid path: {}": "유효하지 않은 경로: {}",
"It is recommended to use CUDA, if you have low configuration, use CPU": "CUDA 사용을 권장하며, 낮은 사양일 경우 CPU를 사용하는 것을 권장합니다.",
"Iterative Prompt Length, 0 means off": "반복 프롬프트 길이. (0:비활성화)",
"Japanese": "일본어",
"LLAMA Configuration": "LLAMA 설정",
"LLAMA Model Config": "LLAMA 모델 설정",
"LLAMA Model Path": "LLAMA 모델 경로",
"Labeling Device": "라벨링 장치",
"LoRA Model to be merged": "병합할 LoRA 모델",
"Maximum Audio Duration": "최대 오디오 길이",
"Maximum Length per Sample": "샘플당 최대 길이",
"Maximum Training Steps": "최대 학습 단계",
"Maximum tokens per batch, 0 means no limit": "배치당 최대 토큰 수(0:제한 없음)",
"Merge": "병합",
"Merge LoRA": "LoRA 병합",
"Merge successfully": "성공적으로 병합 되었습니다.",
"Minimum Audio Duration": "최소 오디오 길이",
"Model Output Path": "모델 출력 경로",
"Model Size": "모델 크기",
"Move": "이동",
"Move files successfully": "파일이 성공적으로 이동되었습니다.",
"No audio generated, please check the input text.": "생성된 오디오가 없습니다. 입력된 텍스트를 확인하세요.",
"No selected options": "옵션이 선택되지 않았습니다.",
"Number of Workers": "작업자 수",
"Open Inference Server": "추론 서버 열기",
"Open Labeler WebUI": "라벨러 WebUI 열기",
"Open Tensorboard": "Tensorboard 열기",
"Opened labeler in browser": "브라우저에서 라벨러가 열렸습니다.",
"Optional Label Language": "선택적 라벨 언어",
"Optional online ver": "온라인 버전 선택",
"Output Path": "출력 경로",
"Path error, please check the model file exists in the corresponding path": "경로 오류, 해당 경로에 모델 파일이 있는지 확인하십시오.",
"Precision": "정밀도",
"Probability of applying Speaker Condition": "화자 조건 적용 확률",
"Put your text here.": "여기에 텍스트를 입력하세요.",
"Reference Audio": "참고 오디오",
"Reference Text": "참고 텍스트",
"Related code and weights are released under CC BY-NC-SA 4.0 License.": "관련 코드 및 가중치는 CC BY-NC-SA 4.0 라이선스 하에 배포됩니다.",
"Remove Selected Data": "선택한 데이터 제거",
"Removed path successfully!": "경로가 성공적으로 제거되었습니다!",
"Repetition Penalty": "반복 패널티",
"Save model every n steps": "n 단계마다 모델 저장",
"Select LLAMA ckpt": "LLAMA ckpt 선택",
"Select VITS ckpt": "VITS ckpt 선택",
"Select VQGAN ckpt": "VQGAN ckpt 선택",
"Select source file processing method": "소스 파일 처리 방법 선택",
"Select the model to be trained (Depending on the Tab page you are on)": "학습할 모델 선택(탭 페이지에 따라 다름)",
"Selected: {}": "선택됨: {}",
"Speaker": "화자",
"Speaker is identified by the folder name": "화자는 폴더 이름으로 식별됩니다",
"Start Training": "학습 시작",
"Streaming Audio": "스트리밍 오디오",
"Streaming Generate": "스트리밍 생성",
"Tensorboard Host": "Tensorboard 호스트",
"Tensorboard Log Path": "Tensorboard 로그 경로",
"Tensorboard Port": "Tensorboard 포트",
"Tensorboard interface is closed": "Tensorboard 인터페이스가 닫혔습니다",
"Tensorboard interface is launched at {}": "Tensorboard 인터페이스가 {}에서 시작되었습니다.",
"Text is too long, please keep it under {} characters.": "텍스트가 너무 깁니다. {}자 이하로 입력해주세요.",
"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "왼쪽의 입력 폴더 경로 또는 파일 목록의 경로. 체크 여부에 관계없이 이 목록에서 후속 학습에 사용됩니다.",
"Training Configuration": "학습 설정",
"Training Error": "학습 오류",
"Training stopped": "학습이 중지되었습니다.",
"Type name of the speaker": "화자의 이름을 입력하세요.",
"Type the path or select from the dropdown": "경로를 입력하거나 드롭다운에서 선택하세요.",
"Use LoRA": "LoRA 사용",
"Use LoRA can save GPU memory, but may reduce the quality of the model": "LoRA를 사용하면 GPU 메모리를 절약할 수 있지만, 모델의 품질이 저하될 수 있습니다.",
"Use filelist": "파일 목록 사용",
"Use large for 10G+ GPU, medium for 5G, small for 2G": "10G+ GPU 환경에선 large, 5G에선 medium, 2G에선 small을 사용할 것을 권장합니다.",
"VITS Configuration": "VITS 설정",
"VQGAN Configuration": "VQGAN 설정",
"Validation Batch Size": "검증 배치 크기",
"View the status of the preprocessing folder (use the slider to control the depth of the tree)": "전처리 폴더의 상태를 확인합니다(슬라이더를 사용하여 트리의 깊이를 조절합니다)",
"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "모델의 오용에 대해 책임지지 않습니다. 사용하기 전에 현지 법률과 규정을 고려하시길 바랍니다.",
"WebUI Host": "WebUI 호스트",
"WebUI Port": "WebUI 포트",
"Whisper Model": "Whisper 모델",
"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "소스 코드는 [이곳](https://github.com/fishaudio/fish-speech)에서, 모델은 [이곳](https://huggingface.co/fishaudio/fish-speech-1)에서 확인하실 수 있습니다.",
"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30+ 시리즈 GPU에는 bf16-true를, 10+ 시리즈 GPU에는 16-mixed를 권장합니다",
"latest": "최신",
"new": "새로운",
"Realtime Transform Text": "실시간 텍스트 변환",
"Normalization Result Preview (Currently Only Chinese)": "정규화 결과 미리보기(현재 중국어만 지원)",
"Text Normalization": "텍스트 정규화",
"Select Example Audio": "예시 오디오 선택"
}
{
"5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de áudio de referência, útil para especificar o orador.",
"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Um modelo de texto para fala baseado em VQ-GAN e Llama desenvolvido por [Fish Audio](https://fish.audio).",
"Accumulate Gradient Batches": "Acumular Lotes de Gradiente",
"Add to Processing Area": "Adicionar à Área de Processamento",
"Added path successfully!": "Caminho adicionado com sucesso!",
"Advanced Config": "Configuração Avançada",
"Base LLAMA Model": "Modelo LLAMA Base",
"Batch Inference": "Inferência em Lote",
"Batch Size": "Tamanho do Lote",
"Changing with the Model Path": "Alterando com o Caminho do Modelo",
"Compile Model": "Compilar Modelo",
"Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar o modelo pode reduzir significativamente o tempo de inferência, mas aumentará a latência inicial",
"Copy": "Copiar",
"Data Preprocessing": "Pré-processamento de Dados",
"Data Preprocessing Path": "Caminho de Pré-processamento de Dados",
"Data Source": "Fonte de Dados",
"Decoder Model Config": "Configuração do Modelo Decodificador",
"Decoder Model Path": "Caminho do Modelo Decodificador",
"Disabled": "Desativado",
"Enable Initial Prompt": "Habilitar Prompt Inicial",
"Enable Reference Audio": "Habilitar Áudio de Referência",
"English": "Inglês",
"Japanese": "Japonês",
"Chinese": "Chinês",
"Portuguese": "Português",
"Spanish": "Espanhol",
"Error Message": "Mensagem de Erro",
"Faster Whisper, Up to 5g GPU memory usage": "Faster Whisper (Usa até 5 GB de vRAM)",
"File Preprocessing": "Pré-processamento de Arquivos",
"Generate": "Gerar",
"Generated Audio": "Áudio Gerado",
"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "Se não houver texto correspondente ao áudio, utilize o ASR para assistência (formatos .txt ou .lab)",
"Infer interface is closed": "A interface de inferência foi fechada",
"Inference Configuration": "Configuração de Inferência",
"Inference Server Configuration": "Configuração do Servidor de Inferência",
"Inference Server Error": "Erro do Servidor de Inferência",
"Inferring interface is launched at {}": "A interface de inferência foi iniciada em {}",
"Initial Learning Rate": "Taxa de Aprendizagem Inicial",
"Initial Prompt": "Prompt Inicial",
"Initial prompt can provide contextual or vocabulary-specific guidance to the model.": "O prompt inicial pode fornecer orientação contextual ou específica de vocabulário para o modelo.",
"Input Audio & Source Path for Transcription": "Entrada de Áudio/Caminho de Origem para Transcrição",
"Input Text": "Texto de Entrada",
"Invalid path: {}": "Caminho inválido: {}",
"It is recommended to use CUDA, if you have low configuration, use CPU": "Para GPUs Nvidia é recomendado usar CUDA. Se não tiver uma GPU Nvidia, use CPU",
"Iterative Prompt Length, 0 means off": "Comprimento do Prompt Iterativo (0 = desativado)",
"LLAMA Configuration": "Configuração do LLAMA",
"LLAMA Model Config": "Configuração do Modelo LLAMA",
"LLAMA Model Path": "Caminho do Modelo LLAMA",
"Labeling Device": "Dispositivo de Rotulagem",
"LoRA Model to be merged": "Modelo LoRA para mesclagem",
"Maximum Length per Sample": "Comprimento Máximo por Amostra",
"Maximum Training Steps": "Etapas Máximas de Treinamento",
"Maximum tokens per batch, 0 means no limit": "Número máximo de tokens por lote, 0 significa sem limite",
"Merge": "Mesclar",
"Merge LoRA": "Mesclar LoRA",
"Merge successfully": "Mesclado com sucesso",
"Model Output Path": "Caminho de Saída do Modelo",
"Model Quantization": "Quantização do Modelo",
"Model Size": "Tamanho do Modelo",
"Move": "Mover",
"Move files successfully": "Arquivos movidos com sucesso",
"No audio generated, please check the input text.": "Nenhum áudio gerado, verifique o texto de entrada.",
"No selected options": "Nenhuma opção selecionada",
"Normalization Result Preview (Currently Only Chinese)": "Pré-visualização do Resultado da Normalização (Atualmente Apenas Chinês)",
"Number of Workers": "Número de Processos",
"Open Inference Server": "Abrir Servidor de Inferência",
"Open Labeler WebUI": "Abrir WebUI de Rotulagem",
"Open Tensorboard": "Abrir Tensorboard",
"Opened labeler in browser": "WebUI de rotulagem aberta no navegador",
"Optional Label Language": "Idioma do Rótulo (Opcional)",
"Optional online ver": "Versão online (opcional)",
"Output Path": "Caminho de Saída",
"Path error, please check the model file exists in the corresponding path": "Erro de caminho, verifique se o arquivo do modelo existe no caminho correspondente",
"Post-quantification Precision": "Precisão Pós-quantização",
"Precision": "Precisão",
"Probability of applying Speaker Condition": "Probabilidade de Aplicar Condição de Orador",
"Put your text here.": "Insira seu texto aqui.",
"Quantify": "Quantizar",
"Quantify successfully": "Quantizado com sucesso",
"Realtime Transform Text": "Transformar Texto em Tempo Real",
"Reference Audio": "Áudio de Referência",
"Reference Text": "Texto de Referência",
"warning": "Aviso",
"Pre-processing begins...": "O pré-processamento começou!",
"Related code and weights are released under CC BY-NC-SA 4.0 License.": "O código relacionado e os pesos são licenciados sob a Licença CC BY-NC-SA 4.0.",
"Remove Selected Data": "Remover Dados Selecionados",
"Removed path successfully!": "Caminho removido com sucesso!",
"Repetition Penalty": "Penalidade de Repetição",
"Save model every n steps": "Salvar modelo a cada n etapas",
"Select LLAMA ckpt": "Selecionar .ckpt do LLAMA",
"Select source file processing method": "Escolha como processar o arquivo de origem",
"Select the model to be trained (Depending on the Tab page you are on)": "Selecione o modelo para o treinamento (dependendo da aba em que você está)",
"Selected: {}": "Selecionado: {}",
"Speaker is identified by the folder name": "O orador é identificado pelo nome da pasta",
"Start Training": "Iniciar Treinamento",
"Streaming Audio": "Áudio em Streaming",
"Streaming Generate": "Geração em Streaming",
"Tensorboard Host": "Host do Tensorboard",
"Tensorboard Log Path": "Caminho de Log do Tensorboard",
"Tensorboard Port": "Porta do Tensorboard",
"Tensorboard interface is closed": "A interface do Tensorboard está fechada",
"Tensorboard interface is launched at {}": "A interface do Tensorboard foi iniciada em {}",
"Text Normalization": "Normalização de Texto",
"Text is too long, please keep it under {} characters.": "O texto é muito longo. Mantenha-o com menos de {} caracteres.",
"The lower the quantitative precision, the more the effectiveness may decrease, but the greater the efficiency will increase": "Quanto menor a precisão quantitativa, mais a eficácia pode diminuir, mas maior será o aumento da eficiência",
"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "O caminho da pasta de entrada à esquerda ou a lista de arquivos. Independentemente de estar marcada ou não, ela será utilizada para o treinamento subsequente nesta lista.",
"Training Configuration": "Configuração de Treinamento",
"Training Error": "Erro de Treinamento",
"Training stopped": "Treinamento interrompido!",
"Type the path or select from the dropdown": "Digite o caminho ou selecione no menu suspenso",
"Use LoRA": "Usar LoRA",
"Use LoRA can save GPU memory, but may reduce the quality of the model": "O uso de LoRAs pode economizar memória da GPU, mas também pode reduzir a qualidade",
"Use filelist": "Usar lista de arquivos",
"VQGAN Configuration": "Configuração do VQGAN",
"View the status of the preprocessing folder (use the slider to control the depth of the tree)": "Visualizar o status da pasta de pré-processamento (use o controle deslizante para controlar a profundidade da árvore)",
"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "Não nos responsabilizamos por qualquer uso indevido do modelo. Por favor, considere as leis e regulamentações locais antes de usá-lo.",
"WebUI Host": "Host da WebUI",
"WebUI Port": "Porta da WebUI",
"Whisper Model": "Modelo Whisper",
"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Você pode encontrar o código fonte [aqui](https://github.com/fishaudio/fish-speech) e os modelos [aqui](https://huggingface.co/fishaudio/fish-speech-1).",
"auto": "automático",
"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true é recomendado para GPUs da série 30+, 16-mixed é recomendado para GPUs da série 10+",
"latest": "mais recente",
"new": "novo",
"This audio introduces the basic concepts and applications of artificial intelligence and machine learning.": "Este áudio introduz os conceitos básicos e aplicações de inteligência artificial e aprendizado de máquina.",
"You don't need to train this model!": "Não é necessário treinar este modelo!",
"Yes": "Sim",
"No": "Não",
"version:": "versão:",
"author:": "autor:"
}
{
"16-mixed is recommended for 10+ series GPU": "10+ 系列 GPU 建议使用 16-mixed",
"5 to 10 seconds of reference audio, useful for specifying speaker.": "5 到 10 秒的参考音频,适用于指定音色。",
"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.",
"Accumulate Gradient Batches": "梯度累积批次",
"Add to Processing Area": "加入处理区",
"Added path successfully!": "添加路径成功!",
"Advanced Config": "高级参数",
"Base LLAMA Model": "基础 LLAMA 模型",
"Batch Inference": "批量推理",
"Batch Size": "批次大小",
"Changing with the Model Path": "随模型路径变化",
"Chinese": "中文",
"Compile Model": "编译模型",
"Compile the model can significantly reduce the inference time, but will increase cold start time": "编译模型可以显著减少推理时间,但会增加冷启动时间",
"Copy": "复制",
"Data Preprocessing": "数据预处理",
"Data Preprocessing Path": "数据预处理路径",
"Data Source": "数据源",
"Decoder Model Config": "解码器模型配置",
"Decoder Model Path": "解码器模型路径",
"Disabled": "禁用",
"Enable Reference Audio": "启用参考音频",
"English": "英文",
"Error Message": "错误信息",
"File Preprocessing": "文件预处理",
"Generate": "生成",
"Generated Audio": "音频",
"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "如果音频没有对应的文本,可以应用 ASR 辅助,支持 .txt 或 .lab 格式",
"Infer interface is closed": "推理界面已关闭",
"Inference Configuration": "推理配置",
"Inference Server Configuration": "推理服务器配置",
"Inference Server Error": "推理服务器错误",
"Inferring interface is launched at {}": "推理界面已在 {} 上启动",
"Initial Learning Rate": "初始学习率",
"Input Audio & Source Path for Transcription": "输入音频和转录源路径",
"Input Text": "输入文本",
"Invalid path: {}": "无效路径: {}",
"It is recommended to use CUDA, if you have low configuration, use CPU": "建议使用 CUDA,如果配置较低,使用 CPU",
"Iterative Prompt Length, 0 means off": "迭代提示长度,0 表示关闭",
"Japanese": "日文",
"LLAMA Configuration": "LLAMA 配置",
"LLAMA Model Config": "LLAMA 模型配置",
"LLAMA Model Path": "LLAMA 模型路径",
"Labeling Device": "标注加速设备",
"LoRA Model to be merged": "要合并的 LoRA 模型",
"Maximum Audio Duration": "最大音频时长",
"Maximum Length per Sample": "每个样本的最大长度",
"Maximum Training Steps": "最大训练步数",
"Maximum tokens per batch, 0 means no limit": "每批最大令牌数,0 表示无限制",
"Merge": "合并",
"Merge LoRA": "合并 LoRA",
"Merge successfully": "合并成功",
"Minimum Audio Duration": "最小音频时长",
"Model Output Path": "模型输出路径",
"Model Size": "模型规模",
"Move": "移动",
"Move files successfully": "移动文件成功",
"No audio generated, please check the input text.": "没有生成音频,请检查输入文本.",
"No selected options": "没有选择的选项",
"Number of Workers": "数据加载进程数",
"Open Inference Server": "打开推理服务器",
"Open Labeler WebUI": "打开标注工具",
"Open Tensorboard": "打开 Tensorboard",
"Opened labeler in browser": "在浏览器中打开标注工具",
"Optional Label Language": "[可选] 标注语言",
"Optional online ver": "[可选] 使用在线版",
"Output Path": "输出路径",
"Path error, please check the model file exists in the corresponding path": "路径错误,请检查模型文件是否存在于相应路径",
"Precision": "精度",
"Probability of applying Speaker Condition": "应用说话人条件的概率",
"Put your text here.": "在此处输入文本.",
"Reference Audio": "参考音频",
"Reference Text": "参考文本",
"Related code and weights are released under CC BY-NC-SA 4.0 License.": "相关代码和权重使用 CC BY-NC-SA 4.0 许可证发布.",
"Remove Selected Data": "移除选中数据",
"Removed path successfully!": "移除路径成功!",
"Repetition Penalty": "重复惩罚",
"Save model every n steps": "每 n 步保存模型",
"Select LLAMA ckpt": "选择 LLAMA 检查点",
"Select VITS ckpt": "选择 VITS 检查点",
"Select VQGAN ckpt": "选择 VQGAN 检查点",
"Select source file processing method": "选择源文件处理方法",
"Select the model to be trained (Depending on the Tab page you are on)": "根据您所在的选项卡页面选择要训练的模型",
"Selected: {}": "已选择: {}",
"Speaker": "说话人",
"Speaker is identified by the folder name": "自动根据父目录名称识别说话人",
"Start Training": "开始训练",
"Streaming Audio": "流式音频",
"Streaming Generate": "流式合成",
"Tensorboard Host": "Tensorboard 监听地址",
"Tensorboard Log Path": "Tensorboard 日志路径",
"Tensorboard Port": "Tensorboard 端口",
"Tensorboard interface is closed": "Tensorboard 界面已关闭",
"Tensorboard interface is launched at {}": "Tensorboard 界面已在 {} 上启动",
"Text is too long, please keep it under {} characters.": "文本太长,请保持在 {} 个字符以内.",
"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "左侧输入文件夹的路径或文件列表。无论是否选中,都将在此列表中用于后续训练.",
"Training Configuration": "训练配置",
"Training Error": "训练错误",
"Training stopped": "训练已停止",
"Type name of the speaker": "输入说话人的名称",
"Type the path or select from the dropdown": "输入路径或从下拉菜单中选择",
"Use LoRA": "使用 LoRA",
"Use LoRA can save GPU memory, but may reduce the quality of the model": "使用 LoRA 可以节省 GPU 内存,但可能会降低模型质量",
"Use filelist": "使用文件列表",
"Use large for 10G+ GPU, medium for 5G, small for 2G": "10G+ GPU 使用 large, 5G 使用 medium, 2G 使用 small",
"VITS Configuration": "VITS 配置",
"VQGAN Configuration": "VQGAN 配置",
"Validation Batch Size": "验证批次大小",
"View the status of the preprocessing folder (use the slider to control the depth of the tree)": "查看预处理文件夹的状态 (使用滑块控制树的深度)",
"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "我们不对模型的任何滥用负责,请在使用之前考虑您当地的法律法规.",
"WebUI Host": "WebUI 监听地址",
"WebUI Port": "WebUI 端口",
"Whisper Model": "Whisper 模型",
"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1) 找到模型.",
"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30+ 系列 GPU 建议使用 bf16-true, 10+ 系列 GPU 建议使用 16-mixed",
"latest": "最近的检查点",
"new": "创建新的检查点",
"Realtime Transform Text": "实时规范化文本",
"Normalization Result Preview (Currently Only Chinese)": "规范化结果预览",
"Text Normalization": "文本规范化",
"Select Example Audio": "选择参考音频"
}
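The locale files above are flat JSON maps from the English source string (the key) to its translation. The snippet below is a minimal lookup sketch under that assumption; the helper names and the locale path are illustrative only, not the actual fish_speech i18n API, and a missing key simply falls back to the English text.

# Minimal i18n lookup sketch (illustrative; not the actual fish_speech helper).
import json
from pathlib import Path


def load_locale(path: Path) -> dict[str, str]:
    # Each locale file maps the English source string to its translation.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def translate(table: dict[str, str], text: str) -> str:
    # Fall back to the English source string when no translation exists.
    return table.get(text, text)


# Hypothetical usage; the real locale directory is defined elsewhere in the repo.
# ja = load_locale(Path("fish_speech/i18n/locale/ja_JP.json"))
# print(translate(ja, "Put your text here."))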
import ast
import glob
import json
from collections import OrderedDict
from pathlib import Path
from loguru import logger
from .core import DEFAULT_LANGUAGE, I18N_FILE_PATH
def extract_i18n_strings(node):
    i18n_strings = []

    if (
        isinstance(node, ast.Call)
        and isinstance(node.func, ast.Name)
        and node.func.id == "i18n"
    ):
        for arg in node.args:
            if isinstance(arg, ast.Str):
                i18n_strings.append(arg.s)

    for child_node in ast.iter_child_nodes(node):
        i18n_strings.extend(extract_i18n_strings(child_node))

    return i18n_strings
# scan the directory for all .py files (recursively)
# for each file, parse the code into an AST
# for each AST, extract the i18n strings

strings = []
folders = ["fish_speech", "tools"]
# for filename in glob.iglob("**/*.py", recursive=True):
for folder in folders:
    for f in Path(folder).rglob("*.py"):
        code = f.read_text(encoding="utf-8")
        if "i18n(" in code:
            tree = ast.parse(code)
            i18n_strings = extract_i18n_strings(tree)
            logger.info(f"Found {len(i18n_strings)} i18n strings in {f}")
            strings.extend(i18n_strings)

code_keys = set(strings)
logger.info(f"Total unique: {len(code_keys)}")

# Load the standard (default-language) file and compare its keys with the code
standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"
with open(standard_file, "r", encoding="utf-8") as f:
    standard_data = json.load(f, object_pairs_hook=OrderedDict)
standard_keys = set(standard_data.keys())

# Keys present in the standard file but no longer referenced in code
unused_keys = standard_keys - code_keys
logger.info(f"Found {len(unused_keys)} unused keys in {standard_file}")
for unused_key in unused_keys:
    logger.info(f"\t{unused_key}")

# Keys referenced in code but missing from the standard file
missing_keys = code_keys - standard_keys
logger.info(f"Found {len(missing_keys)} missing keys in {standard_file}")
for missing_key in missing_keys:
    logger.info(f"\t{missing_key}")

code_keys_dict = OrderedDict()
for s in strings:
    code_keys_dict[s] = s

# write back
with open(standard_file, "w", encoding="utf-8") as f:
    json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True)
    f.write("\n")

logger.info(f"Updated {standard_file}")
# Define the standard file name
standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"

# Find all JSON files in the directory
dir_path = I18N_FILE_PATH
languages = [f for f in dir_path.glob("*.json") if f.stem != DEFAULT_LANGUAGE]

# Load the standard file
with open(standard_file, "r", encoding="utf-8") as f:
    standard_data = json.load(f, object_pairs_hook=OrderedDict)

# Loop through each language file
for lang_file in languages:
    # Load the language file
    with open(lang_file, "r", encoding="utf-8") as f:
        lang_data = json.load(f, object_pairs_hook=OrderedDict)

    # Find the difference between the language file and the standard file
    diff = set(standard_data.keys()) - set(lang_data.keys())
    miss = set(lang_data.keys()) - set(standard_data.keys())

    # Add any missing keys to the language file, marked as untranslated
    for key in diff:
        lang_data[key] = "#!" + key
        logger.info(f"Added missing key: {key} to {lang_file}")

    # Delete any extra keys from the language file
    for key in miss:
        del lang_data[key]
        logger.info(f"Deleted extra key: {key} from {lang_file}")

    # Sort the keys of the language file to match the order of the standard file
    lang_data = OrderedDict(
        sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0]))
    )

    # Save the updated language file
    with open(lang_file, "w", encoding="utf-8") as f:
        json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True)
        f.write("\n")

    logger.info(f"Updated {lang_file}")

logger.info("Done")