Commit b75857fb authored by chenzk (v1.0)
_target_: fish_speech.models.vqgan.modules.firefly.FireflyArchitecture
spec_transform:
_target_: fish_speech.utils.spectrogram.LogMelSpectrogram
sample_rate: 44100
n_mels: 160
n_fft: 2048
hop_length: 512
win_length: 2048
backbone:
_target_: fish_speech.models.vqgan.modules.firefly.ConvNeXtEncoder
input_channels: 160
depths: [3, 3, 9, 3]
dims: [128, 256, 384, 512]
drop_path_rate: 0.2
kernel_size: 7
head:
_target_: fish_speech.models.vqgan.modules.firefly.HiFiGANGenerator
hop_length: 512
upsample_rates: [8, 8, 2, 2, 2] # aka. strides
upsample_kernel_sizes: [16, 16, 4, 4, 4]
resblock_kernel_sizes: [3, 7, 11]
resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
num_mels: 512
upsample_initial_channel: 512
pre_conv_kernel_size: 13
post_conv_kernel_size: 13
quantizer:
_target_: fish_speech.models.vqgan.modules.fsq.DownsampleFiniteScalarQuantize
input_dim: 512
n_groups: 8
n_codebooks: 1
levels: [8, 5, 5, 5]
downsample_factor: [2, 2]
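The block above is a Hydra-style config (every component carries a `_target_` key). A minimal sketch of instantiating it is shown below, assuming the config is saved as `fish_speech/configs/firefly_gan_vq.yaml` and that the `fish_speech` package is importable; the file path is illustrative only.

```python
# Hedged sketch: build the FireflyArchitecture from the YAML above.
# Assumption: the config lives at "fish_speech/configs/firefly_gan_vq.yaml";
# hydra.utils.instantiate resolves the nested _target_ entries recursively.
from hydra.utils import instantiate
from omegaconf import OmegaConf

cfg = OmegaConf.load("fish_speech/configs/firefly_gan_vq.yaml")
model = instantiate(cfg)
print(type(model).__name__)  # FireflyArchitecture
```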
_target_: fish_speech.models.text2semantic.lora.LoraConfig
r: 8
lora_alpha: 16
lora_dropout: 0.01
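In the standard LoRA formulation the low-rank update is scaled by `lora_alpha / r` before being added to the frozen weight, so `r: 8` with `lora_alpha: 16` gives an effective scale of 2. The quick check below assumes the common LoRA convention; it is not read from `fish_speech.models.text2semantic.lora` itself.

```python
# Hedged arithmetic check, assuming the usual LoRA scaling W + (alpha / r) * B @ A.
r, lora_alpha = 8, 16
scaling = lora_alpha / r
print(scaling)  # 2.0
```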
defaults:
- base
- _self_
project: text2semantic_finetune_dual_ar
max_length: 4096
pretrained_ckpt_path: checkpoints/fish-speech-1.5
# Lightning Trainer
trainer:
accumulate_grad_batches: 1
gradient_clip_val: 1.0
gradient_clip_algorithm: "norm"
max_steps: 10000
precision: bf16-true
limit_val_batches: 10
val_check_interval: 100
# strategy:
# find_unused_parameters: true
# static_graph: true
# Tokenizer Configuration
tokenizer:
_target_: fish_speech.tokenizer.FishTokenizer
model_path: ${pretrained_ckpt_path}/tokenizer.tiktoken
# Dataset Configuration
train_dataset:
_target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionIterableDataset
proto_files:
- data/protos
tokenizer: ${tokenizer}
causal: true
max_length: ${max_length}
use_speaker: false
interactive_prob: 0.7
val_dataset:
_target_: fish_speech.datasets.semantic.AutoTextSemanticInstructionIterableDataset
proto_files:
- data/protos
tokenizer: ${tokenizer}
causal: true
max_length: ${max_length}
use_speaker: false
interactive_prob: 0.7
data:
_target_: fish_speech.datasets.semantic.SemanticDataModule
train_dataset: ${train_dataset}
val_dataset: ${val_dataset}
num_workers: 4
batch_size: 4
tokenizer: ${tokenizer}
max_length: ${max_length}
# Model Configuration
model:
_target_: fish_speech.models.text2semantic.lit_module.TextToSemantic
model:
_target_: fish_speech.models.text2semantic.llama.BaseTransformer.from_pretrained
path: ${pretrained_ckpt_path}
load_weights: true
max_length: ${max_length}
lora_config: null
optimizer:
_target_: torch.optim.AdamW
_partial_: true
lr: 1e-4
weight_decay: 0
betas: [0.9, 0.95]
eps: 1e-5
lr_scheduler:
_target_: torch.optim.lr_scheduler.LambdaLR
_partial_: true
lr_lambda:
_target_: fish_speech.scheduler.get_constant_schedule_with_warmup_lr_lambda
_partial_: true
num_warmup_steps: 10
# Callbacks
callbacks:
model_checkpoint:
every_n_train_steps: ${trainer.val_check_interval}
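This finetuning config relies on Hydra `defaults` and `${...}` interpolation, so it is normally composed rather than loaded directly. Below is a minimal sketch using Hydra's Compose API; the config directory name and the presence of the referenced checkpoint on disk are assumptions.

```python
# Hedged sketch: compose the finetune config and instantiate a couple of nodes.
# Assumptions: the YAML above is stored as "text2semantic_finetune.yaml" under
# "fish_speech/configs" and ${pretrained_ckpt_path} points at a real checkpoint.
from hydra import compose, initialize
from hydra.utils import instantiate

with initialize(version_base=None, config_path="fish_speech/configs"):
    cfg = compose(
        config_name="text2semantic_finetune",
        overrides=["trainer.max_steps=100", "data.num_workers=0"],
    )
    tokenizer = instantiate(cfg.tokenizer)  # FishTokenizer from the checkpoint
    datamodule = instantiate(cfg.data)      # SemanticDataModule with both datasets
```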
from dataclasses import dataclass, field
from typing import Literal
import torch
from .tokenizer import MODALITY_TOKENS, FishTokenizer
CODEBOOK_PAD_TOKEN_ID = 0
@dataclass(kw_only=True)
class BasePart:
pass
@dataclass(kw_only=True)
class VQPart(BasePart):
codes: torch.Tensor
@dataclass(kw_only=True)
class TextPart(BasePart):
text: str
@dataclass(kw_only=True)
class EncodedMessage:
tokens: torch.Tensor
labels: torch.Tensor
vq_mask_tokens: torch.Tensor | None = None
vq_mask_labels: torch.Tensor | None = None
vq_parts: list[torch.Tensor]
vq_require_losses: torch.Tensor | None = None
@dataclass(kw_only=True)
class Message:
role: Literal["system", "user", "assistant"]
parts: list[VQPart | TextPart] = field(default_factory=list)
add_im_start: bool = True
add_im_end: bool = True
cal_loss: bool = False
modality: Literal["text", "voice", "interleave"] | None = None
# By default, ignore the loss of the auto-generated im_start token
ignore_im_start_loss: bool = True
def encode(
self: "Message",
tokenizer: FishTokenizer,
) -> EncodedMessage:
all_tokens = []
all_labels = []
# Multi-modal tokens
vq_parts = []
vq_masks = []
parts = self.parts.copy()
if self.add_im_start:
modality_token = MODALITY_TOKENS[self.modality] if self.modality else ""
parts.insert(0, TextPart(text=f"<|im_start|>{self.role}\n{modality_token}"))
if self.add_im_end:
parts.append(TextPart(text="<|im_end|>"))
for part in parts:
if isinstance(part, TextPart):
tokens = torch.tensor(
tokenizer.encode(part.text),
dtype=torch.int,
)
elif isinstance(part, VQPart):
curr_codes = part.codes.clone()
tokens = torch.tensor(
[
tokenizer.semantic_id_to_token_id[i.item()]
for i in curr_codes[0].int()
],
dtype=torch.int,
)
vq_parts.append(curr_codes)
else:
raise ValueError(f"Unsupported part type: {type(part)}")
all_tokens.append(tokens)
if isinstance(part, VQPart):
vq_masks.append(torch.ones_like(tokens, dtype=torch.bool))
else:
vq_masks.append(torch.zeros_like(tokens, dtype=torch.bool))
if self.cal_loss:
all_labels.append(tokens.clone())
else:
all_labels.append(torch.full_like(tokens, -100))
tokens = torch.cat(all_tokens, dim=0)
labels = torch.cat(all_labels, dim=0)
vq_masks = torch.cat(vq_masks, dim=0)
assert tokens.shape == labels.shape == vq_masks.shape
if self.ignore_im_start_loss and self.add_im_start:
labels[: len(all_tokens[0])] = -100
return EncodedMessage(
tokens=tokens,
labels=labels,
vq_parts=vq_parts,
vq_mask_tokens=vq_masks,
vq_mask_labels=vq_masks,
)
@dataclass
class Conversation:
messages: list[Message]
def __init__(self: "Conversation", messages: list[Message] | None = None):
self.messages = messages or []
def encode(
self: "Conversation",
tokenizer: FishTokenizer,
add_shift: bool = True,
ignore_loss_tokens: list[str] = [],
) -> EncodedMessage:
# Build the input_ids and labels
tokens = []
labels = []
vq_parts = []
vq_mask_tokens = []
vq_mask_labels = []
vq_require_losses = []
ignore_loss_token_ids = [tokenizer.get_token_id(i) for i in ignore_loss_tokens]
for message in self.messages:
encoded = message.encode(
tokenizer,
)
tokens.append(encoded.tokens)
labels.append(encoded.labels)
vq_parts.extend(encoded.vq_parts)
vq_mask_tokens.append(encoded.vq_mask_tokens)
vq_mask_labels.append(encoded.vq_mask_labels)
vq_require_losses.extend([message.cal_loss] * len(encoded.vq_parts))
tokens = torch.cat(tokens, dim=0)
labels = torch.cat(labels, dim=0)
vq_mask_tokens = torch.cat(vq_mask_tokens, dim=0)
vq_mask_labels = torch.cat(vq_mask_labels, dim=0)
vq_require_losses = torch.tensor(vq_require_losses, dtype=torch.bool)
if add_shift:
tokens = tokens[:-1]
labels = labels[1:]
vq_mask_tokens = vq_mask_tokens[:-1]
vq_mask_labels = vq_mask_labels[1:]
for i in ignore_loss_token_ids:
assert i != -100 and i is not None
labels[labels == i] = -100
assert tokens.dtype in [
torch.int,
torch.long,
], f"Invalid dtype: {tokens.dtype}, conv: {conversation}"
return EncodedMessage(
tokens=tokens,
labels=labels,
vq_parts=vq_parts,
vq_mask_tokens=vq_mask_tokens,
vq_mask_labels=vq_mask_labels,
vq_require_losses=vq_require_losses,
)
def encode_for_inference(
self: "Conversation",
tokenizer: FishTokenizer,
num_codebooks: int,
) -> torch.Tensor:
# self.visualize(tokenizer)
encoded = self.encode(tokenizer, add_shift=False)
tokens = encoded.tokens
values = torch.zeros((num_codebooks + 1, len(tokens)), dtype=torch.int)
values[0] = tokens
if encoded.vq_parts is None or len(encoded.vq_parts) == 0:
return values
vq_parts = encoded.vq_parts
vq_parts = [part.to(values.device) for part in vq_parts]
vq_parts = torch.cat(vq_parts, dim=1)
values[0, encoded.vq_mask_tokens] = vq_parts[0] + tokenizer.semantic_begin_id
values[1:, encoded.vq_mask_tokens] = vq_parts
return values
def visualize(
self: "Conversation",
tokenizer: FishTokenizer,
ignore_loss_tokens: list[str] = [],
):
encoded = self.encode(
tokenizer, add_shift=False, ignore_loss_tokens=ignore_loss_tokens
)
colors = {
"purple": "\033[95m",
"yellow": "\033[93m",
"red": "\033[91m",
"cyan": "\033[96m",
}
first_idx = 0
second_idx = 0
def print_first_group(x):
nonlocal first_idx
color = colors["purple"] if first_idx % 2 == 0 else colors["yellow"]
print(f"{color}{x}\033[0m", end="")
first_idx += 1
def print_second_group(x):
nonlocal second_idx
color = colors["red"] if second_idx % 2 == 0 else colors["cyan"]
print(f"{color}{x}\033[0m", end="")
second_idx += 1
for tok, lab in zip(encoded.tokens, encoded.labels):
val = tokenizer.decode([tok])
if lab == -100:
print_second_group(val)
else:
print_first_group(val)
print()
def append(self: "Conversation", message: Message):
self.messages.append(message)
if __name__ == "__main__":
message0 = Message(
role="user",
parts=[
TextPart(text="Hello, how are you?"),
VQPart(codes=torch.zeros((4, 10))),
],
cal_loss=False,
)
message1 = Message(
role="assistant",
parts=[TextPart(text="I'm fine, thank you.")],
cal_loss=True,
)
conversation = Conversation([message0, message1])
tokenizer = FishTokenizer.from_pretrained("checkpoints/Qwen2-1.5B-Instruct")
conversation.visualize(tokenizer)
encoded = conversation.encode(tokenizer)
print(encoded)
print(tokenizer.batch_decode(encoded.tokens))
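The `__main__` block above exercises `Conversation.encode` but not `encode_for_inference`. Below is a short, hedged continuation of that example, reusing `conversation` and `tokenizer`; the codebook count matches the `(4, 10)` VQPart built above.

```python
# Hedged sketch: encode_for_inference packs the prompt into a
# (num_codebooks + 1, T) int tensor: row 0 holds text/semantic token ids,
# rows 1.. hold the raw VQ codes at the positions marked by vq_mask_tokens.
prompt = conversation.encode_for_inference(tokenizer, num_codebooks=4)
print(prompt.shape)   # torch.Size([5, T])
print(prompt[0, :8])  # first few token ids of the flattened conversation
```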
import bisect
import random
from typing import Iterable
from torch.utils.data import Dataset, IterableDataset
class ConcatRepeatDataset(Dataset):
datasets: list[Dataset]
cumulative_sizes: list[int]
repeats: list[int]
@staticmethod
def cumsum(sequence, repeats):
r, s = [], 0
for dataset, repeat in zip(sequence, repeats):
l = len(dataset) * repeat
r.append(l + s)
s += l
return r
def __init__(self, datasets: Iterable[Dataset], repeats: list[int]):
super().__init__()
self.datasets = list(datasets)
self.repeats = repeats
assert len(self.datasets) > 0, "datasets should not be an empty iterable"
assert len(self.datasets) == len(
repeats
), "datasets and repeats should have the same length"
for d in self.datasets:
assert not isinstance(
d, IterableDataset
), "ConcatRepeatDataset does not support IterableDataset"
self.cumulative_sizes = self.cumsum(self.datasets, self.repeats)
def __len__(self):
return self.cumulative_sizes[-1]
def __getitem__(self, idx):
dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
if dataset_idx == 0:
sample_idx = idx
else:
sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
dataset = self.datasets[dataset_idx]
return dataset[sample_idx % len(dataset)]
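A small, self-contained sketch of how `ConcatRepeatDataset` balances datasets of different sizes; the toy datasets and their contents are illustrative only.

```python
# Hedged example: a 2-item dataset repeated 3x sits next to a 4-item dataset
# repeated 1x, giving 10 samples in total.
from torch.utils.data import Dataset


class ToyDataset(Dataset):
    def __init__(self, items):
        self.items = items

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        return self.items[idx]


small = ToyDataset(["a", "b"])
large = ToyDataset([1, 2, 3, 4])
ds = ConcatRepeatDataset([small, large], repeats=[3, 1])
assert len(ds) == 10
print([ds[i] for i in range(len(ds))])  # ['a', 'b', 'a', 'b', 'a', 'b', 1, 2, 3, 4]
```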
syntax = "proto3";
package text_data;
message Semantics {
repeated uint32 values = 1;
}
message Sentence {
repeated string texts = 1;
repeated Semantics semantics = 3;
}
message TextData {
string source = 1;
string name = 2;
repeated Sentence sentences = 4;
}
message SampledData {
string source = 1;
string name = 2;
repeated Sentence samples = 3;
}
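A minimal sketch of building and round-tripping the messages defined above through the generated Python bindings; field names follow the proto definition and the import path matches how the rest of the code imports `text_data_pb2`.

```python
# Hedged example: construct a TextData protobuf and round-trip it through bytes.
from fish_speech.datasets.protos.text_data_pb2 import Semantics, Sentence, TextData

sentence = Sentence(texts=["hello world"])
sentence.semantics.append(Semantics(values=[1, 2, 3, 4]))

data = TextData(source="demo", name="speaker_0")
data.sentences.append(sentence)

restored = TextData.FromString(data.SerializeToString())
assert list(restored.sentences[0].semantics[0].values) == [1, 2, 3, 4]
```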
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: text-data.proto
# Protobuf Python Version: 4.25.1
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
b'\n\x0ftext-data.proto\x12\ttext_data"\x1b\n\tSemantics\x12\x0e\n\x06values\x18\x01 \x03(\r"B\n\x08Sentence\x12\r\n\x05texts\x18\x01 \x03(\t\x12\'\n\tsemantics\x18\x03 \x03(\x0b\x32\x14.text_data.Semantics"P\n\x08TextData\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12&\n\tsentences\x18\x04 \x03(\x0b\x32\x13.text_data.Sentence"Q\n\x0bSampledData\x12\x0e\n\x06source\x18\x01 \x01(\t\x12\x0c\n\x04name\x18\x02 \x01(\t\x12$\n\x07samples\x18\x03 \x03(\x0b\x32\x13.text_data.Sentenceb\x06proto3'
)
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, "text_data_pb2", _globals)
if _descriptor._USE_C_DESCRIPTORS == False:
DESCRIPTOR._options = None
_globals["_SEMANTICS"]._serialized_start = 30
_globals["_SEMANTICS"]._serialized_end = 57
_globals["_SENTENCE"]._serialized_start = 59
_globals["_SENTENCE"]._serialized_end = 125
_globals["_TEXTDATA"]._serialized_start = 127
_globals["_TEXTDATA"]._serialized_end = 207
_globals["_SAMPLEDDATA"]._serialized_start = 209
_globals["_SAMPLEDDATA"]._serialized_end = 290
# @@protoc_insertion_point(module_scope)
import struct
from .text_data_pb2 import TextData
def read_pb_stream(f):
while True:
buf = f.read(4)
if len(buf) == 0:
break
size = struct.unpack("I", buf)[0]
buf = f.read(size)
text_data = TextData()
text_data.ParseFromString(buf)
yield text_data
def write_pb_stream(f, text_data):
buf = text_data.SerializeToString()
f.write(struct.pack("I", len(buf)))
f.write(buf)
def pack_pb_stream(text_data):
buf = text_data.SerializeToString()
return struct.pack("I", len(buf)) + buf
def split_pb_stream(f):
while True:
head = f.read(4)
if len(head) == 0:
break
size = struct.unpack("I", head)[0]
buf = f.read(size)
yield head + buf
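A round-trip sketch for the length-prefixed stream helpers above, using an in-memory buffer rather than a proto file on disk.

```python
# Hedged example: write two TextData records with write_pb_stream and read
# them back with read_pb_stream.
import io

from fish_speech.datasets.protos.text_data_pb2 import TextData
from fish_speech.datasets.protos.text_data_stream import (
    read_pb_stream,
    write_pb_stream,
)

buf = io.BytesIO()
for name in ["spk_a", "spk_b"]:
    write_pb_stream(buf, TextData(source="demo", name=name))

buf.seek(0)
assert [td.name for td in read_pb_stream(buf)] == ["spk_a", "spk_b"]
```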
import random
from dataclasses import dataclass
from itertools import chain
from pathlib import Path
from random import Random
from typing import Optional, Union
import numpy as np
import pyarrow.parquet as pq
import torch
import torch.nn.functional as F
from datasets.download.streaming_download_manager import xopen
from huggingface_hub import HfApi
from lightning import LightningDataModule
from torch.distributed import get_rank, get_world_size, is_initialized
from torch.utils.data import DataLoader, Dataset, IterableDataset, get_worker_info
from fish_speech.conversation import (
CODEBOOK_PAD_TOKEN_ID,
Conversation,
Message,
TextPart,
VQPart,
)
from fish_speech.datasets.protos.text_data_pb2 import SampledData
from fish_speech.datasets.protos.text_data_stream import read_pb_stream
from fish_speech.text.clean import clean_text
from fish_speech.tokenizer import FishTokenizer
from fish_speech.utils import RankedLogger
from fish_speech.utils.braceexpand import braceexpand
log = RankedLogger(__name__, rank_zero_only=True)
def split_by_rank_worker(files):
# We need to know the total number of devices
# to split the data properly
total_devices = 1
if is_initialized():
total_devices = get_world_size()
worker_info = get_worker_info()
if worker_info is not None:
total_devices *= worker_info.num_workers
if len(files) < total_devices:
# Repeat the files N times to match the number of devices
files = files * (total_devices // len(files) + 1)
# DDP
if is_initialized():
files = files[get_rank() :: get_world_size()]
# Split by worker
if worker_info is not None:
files = files[worker_info.id :: worker_info.num_workers]
return files
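# Worked example for split_by_rank_worker (illustrative numbers): with
# files = [f0..f7], 2 DDP ranks and 2 dataloader workers per rank,
# total_devices = 4. Rank 0 first keeps files[0::2] = [f0, f2, f4, f6]; its
# worker 1 then keeps [f2, f6], while rank 1 / worker 0 ends up with [f1, f5].
# If there are fewer files than devices, the list is repeated first so every
# worker still receives at least one file.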
class AutoTextSemanticInstructionIterableDataset(IterableDataset):
"""
Auto Augment Dataset by Speaker
1. Randomly concatenate multiple sentences from the same speaker to form a longer sentence
2. Automatically normalize the text
For interactive mode, we use the following format (multiple sequences):
<s> [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST] </s>
For non-interactive mode, we use the following format (one long sequence):
<s> [INST] text [/INST] ... </s>
"""
def __init__(
self,
proto_files: list[str],
seed: int = 42,
interactive_prob: float = 0.5,
max_length: int = 1024,
tokenizer: FishTokenizer = None,
use_speaker: bool | float = True,
causal: bool = True,
num_codebooks: Optional[int] = None,
skip_text_prob: float = 0.0,
):
"""
Args:
proto_files: proto buf files if using local data
seed: random seed
interactive_prob: probability to use interactive mode
max_length: max length of the text
tokenizer: tokenizer
use_speaker: include speaker information in the prompt
causal: use causal sampling when using local data; disabling it leads to random sampling
num_codebooks: number of codebooks, if None, it will be automatically detected
skip_text_prob: probability to skip the text (audio only), this only applies to interactive mode
"""
super().__init__()
assert 0 <= interactive_prob <= 1, "interactive_prob must be in [0, 1]"
self.seed = seed
self.max_length = max_length
self.tokenizer = tokenizer
self.interactive_prob = interactive_prob
self.use_speaker = use_speaker
self.proto_files = proto_files
self.causal = causal
self.num_codebooks = num_codebooks
self.skip_text_prob = skip_text_prob
self.groups = None
def __iter__(self):
while True:
yield self.augment()
def init_mock_data_server(self):
if self.groups is not None:
return
# Expand the proto files
expanded_proto_files = []
for filename in self.proto_files:
for i in braceexpand(filename):
i = Path(i)
if i.is_file():
expanded_proto_files.append(i)
elif i.is_dir():
expanded_proto_files.extend(i.rglob("*.proto"))
expanded_proto_files.extend(i.rglob("*.protos"))
else:
raise ValueError(f"{i} is not a file or directory")
expanded_proto_files = sorted(expanded_proto_files)
Random(self.seed).shuffle(expanded_proto_files)
self.groups = []
shard_proto_files = split_by_rank_worker(expanded_proto_files)
log.info(
f"Reading {len(shard_proto_files)} / {len(expanded_proto_files)} files"
)
count = 0
for filename in shard_proto_files:
with open(filename, "rb") as f:
for text_data in read_pb_stream(f):
self.groups.append(text_data)
count += 1
log.info(f"Read total {count} groups of data")
# Shuffle the lines
Random(self.seed).shuffle(self.groups)
self.group_weights = [len(i.sentences) for i in self.groups]
def sample_data(self):
if self.groups is None:
self.init_mock_data_server()
# Estimate that each sample is at least 20 tokens long
num_samples = self.max_length // 20
# Choose a group weighted by its number of sentences
group = random.choices(self.groups, weights=self.group_weights, k=1)[0]
if self.causal:
# Sample in order
if num_samples >= len(group.sentences):
samples = group.sentences
else:
begin = random.randint(0, len(group.sentences) - num_samples)
samples = group.sentences[begin : begin + num_samples]
else:
samples = random.choices(
group.sentences, k=min(num_samples, len(group.sentences))
)
return SampledData(
source=group.source,
name=group.name,
samples=samples,
)
def pack_sentences(
self,
sentences: list[str],
semantics: list,
# speaker: Optional[str] = None,
skip_text: bool = False,
):
# if speaker is None:
# speaker = "assistant"
messages = [
Message(
role="system",
parts=[TextPart(text="Speak out the provided text.")],
# add_im_end=False,
# cal_loss=True,
)
]
cated_sentences = " ".join(sentences)
if skip_text:
cated_sentences = "<|skip_text|>"
messages.append(
Message(
role="user",
parts=[TextPart(text=cated_sentences)],
# cal_loss=True,
)
)
vq_codes = [x.values for x in semantics[0]]
vq_codes_tensor = torch.tensor(vq_codes).to(torch.int32)
vqpart = VQPart(codes=vq_codes_tensor)
messages.append(
Message(
role="assistant",
parts=[TextPart(text="<|voice|>"), vqpart],
cal_loss=True,
)
)
num_codebooks = (
len(semantics[0]) if self.num_codebooks is None else self.num_codebooks
)
conversation = Conversation(messages=messages)
# conversation.visualize(tokenizer=self.tokenizer)
encoded = conversation.encode(
tokenizer=self.tokenizer,
)
tokens_raw = encoded.tokens
tokens = torch.zeros((num_codebooks + 1, len(tokens_raw)), dtype=torch.int)
tokens[0] = tokens_raw
vq_parts = encoded.vq_parts
vq_parts = [part.to(tokens.device) for part in vq_parts]
vq_parts = torch.cat(vq_parts, dim=1)
tokens[1:, encoded.vq_mask_tokens] = vq_parts
labels_raw = encoded.labels
labels = torch.full((num_codebooks + 1, len(labels_raw)), -100, dtype=torch.int)
labels[0, :] = labels_raw
labels[1:, encoded.vq_mask_labels] = vq_parts
labels[1:, -1:] = CODEBOOK_PAD_TOKEN_ID
tokens = tokens.long()
labels = labels.long()
# Verify the padding is correct, and the last token is eos
assert (tokens[1:, ~(encoded.vq_mask_tokens)] == CODEBOOK_PAD_TOKEN_ID).all()
assert (labels[1:, -1:] == CODEBOOK_PAD_TOKEN_ID).all()
return tokens, labels
def augment(self):
response = self.sample_data()
if len(response.samples) == 0:
# Invalid group
return None
samples = list(response.samples)
all_tokens, all_labels = [], []
while len(samples) > 0:
sentence = samples.pop(0)
text = clean_text(random.choice(sentence.texts))
tokens, labels = self.pack_sentences(
sentences=[text],
semantics=[sentence.semantics],
# speaker=response.name if use_speaker else None,
skip_text=random.random() < self.skip_text_prob,
)
all_tokens.append(tokens)
all_labels.append(labels)
tokens = torch.cat(all_tokens, dim=1)
labels = torch.cat(all_labels, dim=1)
# Verify that the length is correct
assert tokens.size(1) == labels.size(1), f"{tokens.size(1)} != {labels.size(1)}"
data = {"tokens": tokens, "labels": labels}
return data
class AutoTextSemanticInstructionDataset(Dataset):
"""
Auto Augment Dataset by Speaker
1. Randomly concatenate multiple sentences from the same speaker to form a longer sentence
2. Automatically normalize the text
For interactive mode, we use the following format (multiple sequences):
<s> [INST] [SPK: speaker] text [/INST] ... [INST] text [/INST] </s>
For non-interactive mode, we use the following format (one long sequence):
<s> [INST] text [/INST] ... </s>
"""
def __init__(
self,
proto_files: list[str],
seed: int = 42,
interactive_prob: float = 0.5,
max_length: int = 1024,
tokenizer: FishTokenizer = None,
use_speaker: bool | float = True,
causal: bool = True,
num_codebooks: Optional[int] = None,
skip_text_prob: float = 0.0,
):
"""
Args:
proto_files: proto buf files if using local data
seed: random seed
interactive_prob: probability to use interactive mode
max_length: max length of the text
tokenizer: tokenizer
use_speaker: include speaker information in the prompt
causal: use causal sampling when using local data; disabling it leads to random sampling
num_codebooks: number of codebooks, if None, it will be automatically detected
skip_text_prob: probability to skip the text (audio only), this only applies to interactive mode
"""
super().__init__()
assert 0 <= interactive_prob <= 1, "interactive_prob must be in [0, 1]"
self.seed = seed
self.max_length = max_length
self.tokenizer = tokenizer
self.interactive_prob = interactive_prob
self.use_speaker = use_speaker
self.proto_files = proto_files
self.causal = causal
self.num_codebooks = num_codebooks
self.skip_text_prob = skip_text_prob
self.data = []
self._init_data()
def _init_data(self):
expanded_proto_files = []
for filename in self.proto_files:
for i in braceexpand(filename):
i = Path(i)
if i.is_file():
expanded_proto_files.append(i)
elif i.is_dir():
expanded_proto_files.extend(i.rglob("*.proto"))
expanded_proto_files.extend(i.rglob("*.protos"))
else:
raise ValueError(f"{i} is not a file or directory")
expanded_proto_files = sorted(expanded_proto_files)
Random(self.seed).shuffle(expanded_proto_files)
groups = []
shard_proto_files = split_by_rank_worker(expanded_proto_files)
log.info(
f"Reading {len(shard_proto_files)} / {len(expanded_proto_files)} files"
)
count = 0
for filename in shard_proto_files:
with open(filename, "rb") as f:
for text_data in read_pb_stream(f):
groups.append(text_data)
count += 1
log.info(f"Read total {count} groups of data")
for group in groups:
if len(group.sentences) == 0:
continue
samples = list(group.sentences)
for sentence in samples:
text = clean_text(random.choice(sentence.texts))
tokens, labels = self.pack_sentences(
sentences=[text],
semantics=[sentence.semantics],
skip_text=random.random() < self.skip_text_prob,
)
self.data.append({"tokens": tokens, "labels": labels})
random.Random(self.seed).shuffle(self.data)
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
return self.data[idx]
def pack_sentences(
self,
sentences: list[str],
semantics: list,
skip_text: bool = False,
):
messages = [
Message(
role="system",
parts=[TextPart(text="Speak out the provided text.")],
)
]
cated_sentences = " ".join(sentences)
if skip_text:
cated_sentences = "<|skip_text|>"
messages.append(
Message(
role="user",
parts=[TextPart(text=cated_sentences)],
)
)
vq_codes = [x.values for x in semantics[0]]
vq_codes_tensor = torch.tensor(vq_codes).to(torch.int32)
vqpart = VQPart(codes=vq_codes_tensor)
messages.append(
Message(
role="assistant",
parts=[TextPart(text="<|voice|>"), vqpart],
cal_loss=True,
)
)
num_codebooks = (
len(semantics[0]) if self.num_codebooks is None else self.num_codebooks
)
conversation = Conversation(messages=messages)
encoded = conversation.encode(
tokenizer=self.tokenizer,
)
tokens_raw = encoded.tokens
tokens = torch.zeros((num_codebooks + 1, len(tokens_raw)), dtype=torch.int)
tokens[0] = tokens_raw
vq_parts = encoded.vq_parts
vq_parts = [part.to(tokens.device) for part in vq_parts]
vq_parts = torch.cat(vq_parts, dim=1)
tokens[1:, encoded.vq_mask_tokens] = vq_parts
labels_raw = encoded.labels
labels = torch.full((num_codebooks + 1, len(labels_raw)), -100, dtype=torch.int)
labels[0, :] = labels_raw
labels[1:, encoded.vq_mask_labels] = vq_parts
labels[1:, -1:] = CODEBOOK_PAD_TOKEN_ID
tokens = tokens.long()
labels = labels.long()
assert (tokens[1:, ~(encoded.vq_mask_tokens)] == CODEBOOK_PAD_TOKEN_ID).all()
assert (labels[1:, -1:] == CODEBOOK_PAD_TOKEN_ID).all()
return tokens, labels
class InterleaveDataset(IterableDataset):
def __init__(
self,
datasets: list[IterableDataset],
probabilities: list[float],
seed: int = 42,
):
super().__init__()
self.datasets = datasets
self.probabilities = probabilities
self.seed = seed
def __iter__(self):
rng = np.random.default_rng(self.seed)
dataset_iterators = [iter(dataset) for dataset in self.datasets]
while True:
# Random choice one
dataset_idx = rng.choice(len(self.datasets), p=self.probabilities)
dataset_iterator = dataset_iterators[dataset_idx]
try:
yield next(dataset_iterator)
except StopIteration:
# Exhausted, create a new iterator
dataset_iterators[dataset_idx] = iter(self.datasets[dataset_idx])
yield next(dataset_iterators[dataset_idx])
@dataclass
class TextDataCollator:
tokenizer: FishTokenizer
max_length: int = 1024
def __call__(self, examples):
if "negative_tokens" in examples:
positive_examples = []
negative_examples = []
for i in examples:
positive_examples.append(
{
"tokens": i["tokens"],
"labels": i["labels"],
}
)
negative_examples.append(
{
"tokens": i["negative_tokens"],
"labels": i["negative_labels"],
}
)
examples = positive_examples + negative_examples
return self.batchify(examples)
def batchify(self, examples, tokens_key="tokens", labels_key="labels"):
tokens, attention_masks, labels = [], [], []
# Calculate the max length
max_tokens_length = 0
for example in examples:
max_tokens_length = max(max_tokens_length, example[tokens_key].size(1))
max_tokens_length = min(max_tokens_length, self.max_length)
for example in examples:
_tokens = example[tokens_key][:, :max_tokens_length]
_labels = example[labels_key][:, :max_tokens_length]
_attention_mask = torch.ones((max_tokens_length,), dtype=torch.bool)
tokens_length = _tokens.size(1)
_attention_mask[:tokens_length] = False
assert tokens_length == _labels.size(
1
), f"{tokens_length} != {_labels.size(1)}"
if tokens_length < max_tokens_length:
_tokens = F.pad(
_tokens,
(0, max_tokens_length - tokens_length),
value=self.tokenizer.get_token_id("<|end_of_text|>"),
)
_tokens[1:, tokens_length:] = CODEBOOK_PAD_TOKEN_ID
_labels = F.pad(
_labels, (0, max_tokens_length - _labels.size(1)), value=-100
)
tokens.append(_tokens)
attention_masks.append(_attention_mask)
labels.append(_labels)
tokens = torch.stack(tokens, dim=0)
attention_masks = torch.stack(attention_masks, dim=0)
labels = torch.stack(labels, dim=0)
return {
"inputs": tokens,
"attention_masks": attention_masks,
"labels": labels,
}
class SemanticDataModule(LightningDataModule):
def __init__(
self,
train_dataset: Union[
AutoTextSemanticInstructionDataset,
AutoTextSemanticInstructionIterableDataset,
InterleaveDataset,
],
val_dataset: Union[
AutoTextSemanticInstructionDataset,
AutoTextSemanticInstructionIterableDataset,
InterleaveDataset,
],
batch_size: int = 32,
tokenizer: FishTokenizer = None,
max_length: int = 1024,
num_workers: int = 4,
):
super().__init__()
self.train_dataset = train_dataset
self.val_dataset = val_dataset
self.batch_size = batch_size
self.tokenizer = tokenizer
self.max_length = max_length
self.num_workers = num_workers
def train_dataloader(self):
return DataLoader(
self.train_dataset,
batch_size=self.batch_size,
collate_fn=TextDataCollator(self.tokenizer, self.max_length),
num_workers=self.num_workers,
persistent_workers=True,
)
def val_dataloader(self):
return DataLoader(
self.val_dataset,
batch_size=self.batch_size,
collate_fn=TextDataCollator(self.tokenizer, self.max_length),
num_workers=self.num_workers,
persistent_workers=True,
)
if __name__ == "__main__":
from tqdm import tqdm
ds = AutoTextSemanticInstructionDataset(
["data/protos"],
tokenizer=FishTokenizer("checkpoints/fish-speech-1.5/tokenizer.tiktoken"),
use_speaker=False,
interactive_prob=1.0,
skip_text_prob=0.5,
)
for i in range(100):
# Uncomment the conversation.visualize(...) call in pack_sentences to inspect the tokenized message
print(ds[i])
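The `__main__` block above only indexes the map-style dataset. Below is a hedged sketch of wiring the iterable dataset, the collator, and a `DataLoader` together; it assumes the proto files and tokenizer checkpoint referenced above actually exist.

```python
# Hedged sketch: batch the iterable dataset with TextDataCollator.
from torch.utils.data import DataLoader

tokenizer = FishTokenizer("checkpoints/fish-speech-1.5/tokenizer.tiktoken")
train_ds = AutoTextSemanticInstructionIterableDataset(
    ["data/protos"], tokenizer=tokenizer, max_length=1024
)
loader = DataLoader(
    train_ds,
    batch_size=2,
    collate_fn=TextDataCollator(tokenizer, max_length=1024),
    num_workers=0,
)
batch = next(iter(loader))
print(batch["inputs"].shape)  # (2, num_codebooks + 1, T)
print(batch["labels"].shape)
```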
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
import librosa
import numpy as np
import torch
from lightning import LightningDataModule
from torch.utils.data import DataLoader, Dataset
from fish_speech.utils import RankedLogger
logger = RankedLogger(__name__, rank_zero_only=False)
class VQGANDataset(Dataset):
def __init__(
self,
filelist: str,
sample_rate: int = 32000,
hop_length: int = 640,
slice_frames: Optional[int] = None,
):
super().__init__()
filelist = Path(filelist)
root = filelist.parent
self.files = [
root / line.strip()
for line in filelist.read_text(encoding="utf-8").splitlines()
if line.strip()
]
self.sample_rate = sample_rate
self.hop_length = hop_length
self.slice_frames = slice_frames
def __len__(self):
return len(self.files)
def get_item(self, idx):
file = self.files[idx]
audio, _ = librosa.load(file, sr=self.sample_rate, mono=True)
# Randomly slice the audio if it is longer than slice_frames frames
if (
self.slice_frames is not None
and audio.shape[0] > self.slice_frames * self.hop_length
):
start = np.random.randint(
0, audio.shape[0] - self.slice_frames * self.hop_length
)
audio = audio[start : start + self.slice_frames * self.hop_length]
if len(audio) == 0:
return None
max_value = np.abs(audio).max()
if max_value > 1.0:
audio = audio / max_value
return {
"audio": torch.from_numpy(audio),
}
def __getitem__(self, idx):
try:
return self.get_item(idx)
except Exception as e:
import traceback
traceback.print_exc()
logger.error(f"Error loading {self.files[idx]}: {e}")
return None
@dataclass
class VQGANCollator:
def __call__(self, batch):
batch = [x for x in batch if x is not None]
audio_lengths = torch.tensor([len(x["audio"]) for x in batch])
audio_maxlen = audio_lengths.max()
# Pad every clip to the longest audio in the batch
audios = []
for x in batch:
audios.append(
torch.nn.functional.pad(x["audio"], (0, audio_maxlen - len(x["audio"])))
)
return {
"audios": torch.stack(audios),
"audio_lengths": audio_lengths,
}
class VQGANDataModule(LightningDataModule):
def __init__(
self,
train_dataset: VQGANDataset,
val_dataset: VQGANDataset,
batch_size: int = 32,
num_workers: int = 4,
val_batch_size: Optional[int] = None,
):
super().__init__()
self.train_dataset = train_dataset
self.val_dataset = val_dataset
self.batch_size = batch_size
self.val_batch_size = val_batch_size or batch_size
self.num_workers = num_workers
def train_dataloader(self):
return DataLoader(
self.train_dataset,
batch_size=self.batch_size,
collate_fn=VQGANCollator(),
num_workers=self.num_workers,
shuffle=True,
persistent_workers=True,
)
def val_dataloader(self):
return DataLoader(
self.val_dataset,
batch_size=self.val_batch_size,
collate_fn=VQGANCollator(),
num_workers=self.num_workers,
persistent_workers=True,
)
if __name__ == "__main__":
dataset = VQGANDataset("data/LibriTTS_R/vq_train_filelist.txt")
dataloader = DataLoader(
dataset, batch_size=4, shuffle=False, collate_fn=VQGANCollator()
)
for batch in dataloader:
print(batch["audios"].shape)
print(batch["features"].shape)
print(batch["audio_lengths"])
print(batch["feature_lengths"])
break
## i18n Folder Attribution
The `i18n` folder within the `fish_speech` directory contains files initially sourced from the RVC project. In compliance with the MIT license under which these files were released, we acknowledge the original authors and sources below:
### fish_speech/i18n/core.py
**Related code from RVC:**
[https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/i18n.py)
**Initial commit:**
add localization(添加本地化) [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#35](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/35)
**Initial author:**
[@L4Ph](https://github.com/L4Ph)
### fish_speech/i18n/scan.py
**Related code from RVC:**
[https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/83d6a64e675d9bbd6e92ee450c5f807ed2bb54d8/i18n/scan_i18n.py)
**Initial commit:**
File for detecting i18n missing keys [RVC-Project/Retrieval-based-Voice-Conversion-WebUI#1058](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/pull/1058)
**Initial author:**
[@towzeur](https://github.com/towzeur)
We appreciate the contributions of the RVC project and its authors.
from .core import i18n
__all__ = ["i18n"]
import json
import locale
from pathlib import Path
I18N_FILE_PATH = Path(__file__).parent / "locale"
DEFAULT_LANGUAGE = "en_US"
def load_language_list(language):
with open(I18N_FILE_PATH / f"{language}.json", "r", encoding="utf-8") as f:
language_list = json.load(f)
return language_list
class I18nAuto:
def __init__(self):
i18n_file = Path(".locale")
if i18n_file.exists():
with open(i18n_file, "r", encoding="utf-8") as f:
language = f.read().strip()
else:
# locale.getlocale() may return (None, None) and fail to identify the system
# language, so fall back to locale.getdefaultlocale() instead.
language = locale.getdefaultlocale()[0]
if not (I18N_FILE_PATH / f"{language}.json").exists():
language = DEFAULT_LANGUAGE
self.language = language
self.language_map = load_language_list(language)
def __call__(self, key):
return self.language_map.get(key, key)
def __repr__(self):
return "Use Language: " + self.language
i18n = I18nAuto()
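A short usage sketch for the helper above; the lookup falls back to the key itself when a translation is missing.

```python
# Hedged example: translate UI strings through the module-level i18n instance.
from fish_speech.i18n import i18n

print(i18n("Start Training"))       # localized value from the chosen locale file
print(i18n("a key with no entry"))  # returned unchanged as a fallback
```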
{
"16-mixed is recommended for 10+ series GPU": "16-mixed is recommended for 10+ series GPU",
"5 to 10 seconds of reference audio, useful for specifying speaker.": "5 to 10 seconds of reference audio, useful for specifying speaker.",
"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).",
"Accumulate Gradient Batches": "Accumulate Gradient Batches",
"Add to Processing Area": "Add to Processing Area",
"Added path successfully!": "Added path successfully!",
"Advanced Config": "Advanced Config",
"Base LLAMA Model": "Base LLAMA Model",
"Batch Inference": "Batch Inference",
"Batch Size": "Batch Size",
"Changing with the Model Path": "Changing with the Model Path",
"Chinese": "Chinese",
"Compile Model": "Compile Model",
"Compile the model can significantly reduce the inference time, but will increase cold start time": "Compile the model can significantly reduce the inference time, but will increase cold start time",
"Copy": "Copy",
"Data Preprocessing": "Data Preprocessing",
"Data Preprocessing Path": "Data Preprocessing Path",
"Data Source": "Data Source",
"Decoder Model Config": "Decoder Model Config",
"Decoder Model Path": "Decoder Model Path",
"Disabled": "Disabled",
"Enable Reference Audio": "Enable Reference Audio",
"English": "English",
"Error Message": "Error Message",
"File Preprocessing": "File Preprocessing",
"Generate": "Generate",
"Generated Audio": "Generated Audio",
"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format",
"Infer interface is closed": "Infer interface is closed",
"Inference Configuration": "Inference Configuration",
"Inference Server Configuration": "Inference Server Configuration",
"Inference Server Error": "Inference Server Error",
"Inferring interface is launched at {}": "Inferring interface is launched at {}",
"Initial Learning Rate": "Initial Learning Rate",
"Input Audio & Source Path for Transcription": "Input Audio & Source Path for Transcription",
"Input Text": "Input Text",
"Invalid path: {}": "Invalid path: {}",
"It is recommended to use CUDA, if you have low configuration, use CPU": "It is recommended to use CUDA, if you have low configuration, use CPU",
"Iterative Prompt Length, 0 means off": "Iterative Prompt Length, 0 means off",
"Japanese": "Japanese",
"LLAMA Configuration": "LLAMA Configuration",
"LLAMA Model Config": "LLAMA Model Config",
"LLAMA Model Path": "LLAMA Model Path",
"Labeling Device": "Labeling Device",
"LoRA Model to be merged": "LoRA Model to be merged",
"Maximum Audio Duration": "Maximum Audio Duration",
"Maximum Length per Sample": "Maximum Length per Sample",
"Maximum Training Steps": "Maximum Training Steps",
"Maximum tokens per batch, 0 means no limit": "Maximum tokens per batch, 0 means no limit",
"Merge": "Merge",
"Merge LoRA": "Merge LoRA",
"Merge successfully": "Merge successfully",
"Minimum Audio Duration": "Minimum Audio Duration",
"Model Output Path": "Model Output Path",
"Model Size": "Model Size",
"Move": "Move",
"Move files successfully": "Move files successfully",
"No audio generated, please check the input text.": "No audio generated, please check the input text.",
"No selected options": "No selected options",
"Number of Workers": "Number of Workers",
"Open Inference Server": "Open Inference Server",
"Open Labeler WebUI": "Open Labeler WebUI",
"Open Tensorboard": "Open Tensorboard",
"Opened labeler in browser": "Opened labeler in browser",
"Optional Label Language": "Optional Label Language",
"Optional online ver": "Optional online ver",
"Output Path": "Output Path",
"Path error, please check the model file exists in the corresponding path": "Path error, please check the model file exists in the corresponding path",
"Precision": "Precision",
"Probability of applying Speaker Condition": "Probability of applying Speaker Condition",
"Put your text here.": "Put your text here.",
"Reference Audio": "Reference Audio",
"Reference Text": "Reference Text",
"Related code and weights are released under CC BY-NC-SA 4.0 License.": "Related code and weights are released under CC BY-NC-SA 4.0 License.",
"Remove Selected Data": "Remove Selected Data",
"Removed path successfully!": "Removed path successfully!",
"Repetition Penalty": "Repetition Penalty",
"Save model every n steps": "Save model every n steps",
"Select LLAMA ckpt": "Select LLAMA ckpt",
"Select VITS ckpt": "Select VITS ckpt",
"Select VQGAN ckpt": "Select VQGAN ckpt",
"Select source file processing method": "Select source file processing method",
"Select the model to be trained (Depending on the Tab page you are on)": "Select the model to be trained (Depending on the Tab page you are on)",
"Selected: {}": "Selected: {}",
"Speaker": "Speaker",
"Speaker is identified by the folder name": "Speaker is identified by the folder name",
"Start Training": "Start Training",
"Streaming Audio": "Streaming Audio",
"Streaming Generate": "Streaming Generate",
"Tensorboard Host": "Tensorboard Host",
"Tensorboard Log Path": "Tensorboard Log Path",
"Tensorboard Port": "Tensorboard Port",
"Tensorboard interface is closed": "Tensorboard interface is closed",
"Tensorboard interface is launched at {}": "Tensorboard interface is launched at {}",
"Text is too long, please keep it under {} characters.": "Text is too long, please keep it under {} characters.",
"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.",
"Training Configuration": "Training Configuration",
"Training Error": "Training Error",
"Training stopped": "Training stopped",
"Type name of the speaker": "Type name of the speaker",
"Type the path or select from the dropdown": "Type the path or select from the dropdown",
"Use LoRA": "Use LoRA",
"Use LoRA can save GPU memory, but may reduce the quality of the model": "Use LoRA can save GPU memory, but may reduce the quality of the model",
"Use filelist": "Use filelist",
"Use large for 10G+ GPU, medium for 5G, small for 2G": "Use large for 10G+ GPU, medium for 5G, small for 2G",
"VITS Configuration": "VITS Configuration",
"VQGAN Configuration": "VQGAN Configuration",
"Validation Batch Size": "Validation Batch Size",
"View the status of the preprocessing folder (use the slider to control the depth of the tree)": "View the status of the preprocessing folder (use the slider to control the depth of the tree)",
"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.",
"WebUI Host": "WebUI Host",
"WebUI Port": "WebUI Port",
"Whisper Model": "Whisper Model",
"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).",
"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU",
"latest": "latest",
"new": "new",
"Realtime Transform Text": "Realtime Transform Text",
"Normalization Result Preview (Currently Only Chinese)": "Normalization Result Preview (Currently Only Chinese)",
"Text Normalization": "Text Normalization",
"Select Example Audio": "Select Example Audio"
}
{
"16-mixed is recommended for 10+ series GPU": "se recomienda 16-mixed para GPU de la serie 10+",
"5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de audio de referencia, útil para especificar el hablante.",
"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Un modelo de texto a voz basado en VQ-GAN y Llama desarrollado por [Fish Audio](https://fish.audio).",
"Accumulate Gradient Batches": "Acumular lotes de gradientes",
"Add to Processing Area": "Agregar al Área de Procesamiento",
"Added path successfully!": "¡Ruta agregada exitosamente!",
"Advanced Config": "Configuración Avanzada",
"Base LLAMA Model": "Modelo Base LLAMA",
"Batch Inference": "Inferencia por Lote",
"Batch Size": "Tamaño del Lote",
"Changing with the Model Path": "Cambiando con la Ruta del Modelo",
"Chinese": "Chino",
"Compile Model": "Compilar Modelo",
"Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar el modelo puede reducir significativamente el tiempo de inferencia, pero aumentará el tiempo de inicio en frío",
"Copy": "Copiar",
"Data Preprocessing": "Preprocesamiento de Datos",
"Data Preprocessing Path": "Ruta de Preprocesamiento de Datos",
"Data Source": "Fuente de Datos",
"Decoder Model Config": "Configuración del modelo decodificador",
"Decoder Model Path": "Ruta del modelo decodificador",
"Disabled": "Desactivado",
"Enable Reference Audio": "Habilitar Audio de Referencia",
"English": "Inglés",
"Error Message": "Mensaje de Error",
"File Preprocessing": "Preprocesamiento de Archivos",
"Generate": "Generar",
"Generated Audio": "Audio Generado",
"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "Si no hay texto correspondiente para el audio, aplique ASR para asistencia, soporte para formato .txt o .lab",
"Infer interface is closed": "La interfaz de inferencia está cerrada",
"Inference Configuration": "Configuración de Inferencia",
"Inference Server Configuration": "Configuración del Servidor de Inferencia",
"Inference Server Error": "Error del Servidor de Inferencia",
"Inferring interface is launched at {}": "La interfaz de inferencia se ha lanzado en {}",
"Initial Learning Rate": "Tasa de Aprendizaje Inicial",
"Input Audio & Source Path for Transcription": "Audio de Entrada y Ruta de Origen para Transcripción",
"Input Text": "Texto de Entrada",
"Invalid path: {}": "Ruta inválida: {}",
"It is recommended to use CUDA, if you have low configuration, use CPU": "Se recomienda usar CUDA, si tiene una configuración baja, use CPU",
"Iterative Prompt Length, 0 means off": "Longitud de la Indicación Iterativa, 0 significa apagado",
"Japanese": "Japonés",
"LLAMA Configuration": "Configuración de LLAMA",
"LLAMA Model Config": "Configuración del Modelo LLAMA",
"LLAMA Model Path": "Ruta del Modelo LLAMA",
"Labeling Device": "Dispositivo de Etiquetado",
"LoRA Model to be merged": "Modelo LoRA a fusionar",
"Maximum Audio Duration": "Duración máxima de audio",
"Maximum Length per Sample": "Longitud Máxima por Muestra",
"Maximum Training Steps": "Pasos Máximos de Entrenamiento",
"Maximum tokens per batch, 0 means no limit": "Máximo de tokens por lote, 0 significa sin límite",
"Merge": "Fusionar",
"Merge LoRA": "Fusionar LoRA",
"Merge successfully": "Fusionado exitosamente",
"Minimum Audio Duration": "Duración mínima de audio",
"Model Output Path": "Ruta de Salida del Modelo",
"Model Size": "Tamaño del Modelo",
"Move": "Mover",
"Move files successfully": "Archivos movidos exitosamente",
"No audio generated, please check the input text.": "No se generó audio, por favor verifique el texto de entrada.",
"No selected options": "No hay opciones seleccionadas",
"Number of Workers": "Número de Trabajadores",
"Open Inference Server": "Abrir Servidor de Inferencia",
"Open Labeler WebUI": "Abrir Interfaz Web del Etiquetador",
"Open Tensorboard": "Abrir Tensorboard",
"Opened labeler in browser": "Se abrió el etiquetador en el navegador",
"Optional Label Language": "Idioma de Etiquetado Opcional",
"Optional online ver": "Ver en línea opcional",
"Output Path": "Ruta de Salida",
"Path error, please check the model file exists in the corresponding path": "Error de ruta, por favor verifique que el archivo del modelo exista en la ruta correspondiente",
"Precision": "Precisión",
"Probability of applying Speaker Condition": "Probabilidad de aplicar Condición de Hablante",
"Put your text here.": "Ponga su texto aquí.",
"Reference Audio": "Audio de Referencia",
"Reference Text": "Texto de Referencia",
"Related code and weights are released under CC BY-NC-SA 4.0 License.": "El código relacionado y los pesos se publican bajo la Licencia CC BY-NC-SA 4.0.",
"Remove Selected Data": "Eliminar Datos Seleccionados",
"Removed path successfully!": "¡Ruta eliminada exitosamente!",
"Repetition Penalty": "Penalización por Repetición",
"Save model every n steps": "Guardar modelo cada n pasos",
"Select LLAMA ckpt": "Seleccionar punto de control LLAMA",
"Select VITS ckpt": "Seleccionar punto de control VITS",
"Select VQGAN ckpt": "Seleccionar punto de control VQGAN",
"Select source file processing method": "Seleccione el método de procesamiento de archivos fuente",
"Select the model to be trained (Depending on the Tab page you are on)": "Seleccione el modelo a entrenar (Dependiendo de la pestaña en la que se encuentre)",
"Selected: {}": "Seleccionado: {}",
"Speaker": "Hablante",
"Speaker is identified by the folder name": "El hablante se identifica por el nombre de la carpeta",
"Start Training": "Iniciar Entrenamiento",
"Streaming Audio": "transmisión de audio",
"Streaming Generate": "síntesis en flujo",
"Tensorboard Host": "Host de Tensorboard",
"Tensorboard Log Path": "Ruta de Registro de Tensorboard",
"Tensorboard Port": "Puerto de Tensorboard",
"Tensorboard interface is closed": "La interfaz de Tensorboard está cerrada",
"Tensorboard interface is launched at {}": "La interfaz de Tensorboard se ha lanzado en {}",
"Text is too long, please keep it under {} characters.": "El texto es demasiado largo, por favor manténgalo por debajo de {} caracteres.",
"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "La ruta de la carpeta de entrada a la izquierda o la lista de archivos. Ya sea que esté marcado o no, se utilizará para el entrenamiento posterior en esta lista.",
"Training Configuration": "Configuración de Entrenamiento",
"Training Error": "Error de Entrenamiento",
"Training stopped": "Entrenamiento detenido",
"Type name of the speaker": "Escriba el nombre del hablante",
"Type the path or select from the dropdown": "Escriba la ruta o seleccione de la lista desplegable",
"Use LoRA": "Usar LoRA",
"Use LoRA can save GPU memory, but may reduce the quality of the model": "Usar LoRA puede ahorrar memoria GPU, pero puede reducir la calidad del modelo",
"Use filelist": "Usar lista de archivos",
"Use large for 10G+ GPU, medium for 5G, small for 2G": "Use grande para GPU de 10G+, mediano para 5G, pequeño para 2G",
"VITS Configuration": "Configuración de VITS",
"VQGAN Configuration": "Configuración de VQGAN",
"Validation Batch Size": "Tamaño del Lote de Validación",
"View the status of the preprocessing folder (use the slider to control the depth of the tree)": "Vea el estado de la carpeta de preprocesamiento (use el control deslizante para controlar la profundidad del árbol)",
"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "No somos responsables de ningún mal uso del modelo, por favor considere sus leyes y regulaciones locales antes de usarlo.",
"WebUI Host": "Host de WebUI",
"WebUI Port": "Puerto de WebUI",
"Whisper Model": "Modelo Whisper",
"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Puede encontrar el código fuente [aquí](https://github.com/fishaudio/fish-speech) y los modelos [aquí](https://huggingface.co/fishaudio/fish-speech-1).",
"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "Se recomienda bf16-true para GPU de la serie 30+, se recomienda 16-mixed para GPU de la serie 10+",
"latest": "más reciente",
"new": "nuevo",
"Realtime Transform Text": "Transformación de Texto en Tiempo Real",
"Normalization Result Preview (Currently Only Chinese)": "Vista Previa del Resultado de Normalización (Actualmente Solo Chino)",
"Text Normalization": "Normalización de Texto",
"Select Example Audio": "Selecionar áudio de exemplo"
}
{
"16-mixed is recommended for 10+ series GPU": "10シリーズ以降のGPUには16-mixedをお勧めします",
"5 to 10 seconds of reference audio, useful for specifying speaker.": "話者を指定するのに役立つ、5~10秒のリファレンスオーディオ。",
"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "[Fish Audio](https://fish.audio)が開発したVQ-GANとLlamaに基づくテキスト音声合成モデル。",
"Accumulate Gradient Batches": "勾配バッチの累積",
"Add to Processing Area": "処理エリアに追加",
"Added path successfully!": "パスの追加に成功しました!",
"Advanced Config": "詳細設定",
"Base LLAMA Model": "基本LLAMAモデル",
"Batch Inference": "バッチ推論",
"Batch Size": "バッチサイズ",
"Changing with the Model Path": "モデルのパスに伴って変化する",
"Chinese": "中国語",
"Compile Model": "モデルのコンパイル",
"Compile the model can significantly reduce the inference time, but will increase cold start time": "モデルをコンパイルすると推論時間を大幅に短縮できますが、コールドスタート時間が長くなります",
"Copy": "コピー",
"Data Preprocessing": "データ前処理",
"Data Preprocessing Path": "データ前処理パス",
"Data Source": "データソース",
"Decoder Model Config": "デコーダーモデルの構成",
"Decoder Model Path": "デコーダーモデルのパス",
"Disabled": "無効",
"Enable Reference Audio": "リファレンスオーディオを有効にする",
"English": "英語",
"Error Message": "エラーメッセージ",
"File Preprocessing": "文書前处理",
"Generate": "生成",
"Generated Audio": "生成されたオーディオ",
"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "音声に対応するテキストがない場合は、ASRを適用してサポートします。.txtまたは.lab形式をサポートしています",
"Infer interface is closed": "推論インターフェースが閉じられています",
"Inference Configuration": "推論設定",
"Inference Server Configuration": "推論サーバー設定",
"Inference Server Error": "推論サーバーエラー",
"Inferring interface is launched at {}": "推論インターフェースが{}で起動しました",
"Initial Learning Rate": "初期学習率",
"Input Audio & Source Path for Transcription": "入力オーディオと文字起こしのソースパス",
"Input Text": "入力テキスト",
"Invalid path: {}": "無効なパス: {}",
"It is recommended to use CUDA, if you have low configuration, use CPU": "CUDAの使用をお勧めします。低い構成の場合はCPUを使用してください",
"Iterative Prompt Length, 0 means off": "反復プロンプト長。0はオフを意味します",
"Japanese": "日本語",
"LLAMA Configuration": "LLAMA設定",
"LLAMA Model Config": "LLAMAモデル設定",
"LLAMA Model Path": "LLAMAモデルパス",
"Labeling Device": "ラベリングデバイス",
"LoRA Model to be merged": "マージするLoRAモデル",
"Maximum Audio Duration": "最大オーディオの長さ",
"Maximum Length per Sample": "サンプルあたりの最大長",
"Maximum Training Steps": "最大トレーニングステップ数",
"Maximum tokens per batch, 0 means no limit": "バッチあたりの最大トークン数。0は制限なしを意味します",
"Merge": "マージ",
"Merge LoRA": "LoRAのマージ",
"Merge successfully": "マージに成功しました",
"Minimum Audio Duration": "最小オーディオの長さ",
"Model Output Path": "モデル出力パス",
"Model Size": "モデルサイズ",
"Move": "移動",
"Move files successfully": "ファイルの移動に成功しました",
"No audio generated, please check the input text.": "オーディオが生成されていません。入力テキストを確認してください。",
"No selected options": "選択されたオプションはありません",
"Number of Workers": "ワーカー数",
"Open Inference Server": "推論サーバーを開く",
"Open Labeler WebUI": "ラベラーWebUIを開く",
"Open Tensorboard": "Tensorboardを開く",
"Opened labeler in browser": "ブラウザでラベラーを開きました",
"Optional Label Language": "オプションのラベル言語",
"Optional online ver": "オプションのオンラインバージョン",
"Output Path": "出力パス",
"Path error, please check the model file exists in the corresponding path": "パスエラー。対応するパスにモデルファイルが存在するか確認してください",
"Precision": "精度",
"Probability of applying Speaker Condition": "話者条件を適用する確率",
"Put your text here.": "ここにテキストを入力してください。",
"Reference Audio": "リファレンスオーディオ",
"Reference Text": "リファレンステキスト",
"Related code and weights are released under CC BY-NC-SA 4.0 License.": "関連コードと重みはCC BY-NC-SA 4.0ライセンスの下でリリースされます。",
"Remove Selected Data": "選択したデータを削除",
"Removed path successfully!": "パスの削除に成功しました!",
"Repetition Penalty": "反復ペナルティ",
"Save model every n steps": "nステップごとにモデルを保存",
"Select LLAMA ckpt": " LLAMA チェックポイントを選択",
"Select VITS ckpt": "VITS チェックポイントを選択",
"Select VQGAN ckpt": "VQGAN チェックポイントを選択",
"Select source file processing method": "ソースファイルの処理方法を選択",
"Select the model to be trained (Depending on the Tab page you are on)": "タブページに応じてトレーニングするモデルを選択してください",
"Selected: {}": "選択済み: {}",
"Speaker": "話者",
"Speaker is identified by the folder name": "話者はフォルダ名で識別されます",
"Start Training": "トレーニング開始",
"Streaming Audio": "ストリーミングオーディオ",
"Streaming Generate": "ストリーミング合成",
"Tensorboard Host": "Tensorboardホスト",
"Tensorboard Log Path": "Tensorboardログパス",
"Tensorboard Port": "Tensorboardポート",
"Tensorboard interface is closed": "Tensorboardインターフェースが閉じられています",
"Tensorboard interface is launched at {}": "Tensorboardインターフェースが{}で起動されました",
"Text is too long, please keep it under {} characters.": "テキストが長すぎます。{}文字以内に抑えてください。",
"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "左側の入力フォルダまたはファイルリストのパス。チェックの有無にかかわらず、このリストの後続のトレーニングに使用されます。",
"Training Configuration": "トレーニング設定",
"Training Error": "トレーニングエラー",
"Training stopped": "トレーニングが停止しました",
"Type name of the speaker": "話者の名前を入力",
"Type the path or select from the dropdown": "パスを入力するか、ドロップダウンから選択してください",
"Use LoRA": "LoRAを使用",
"Use LoRA can save GPU memory, but may reduce the quality of the model": "LoRAを使用するとGPUメモリを節約できますが、モデルの品質が低下する可能性があります",
"Use filelist": "ファイルリストを使用",
"Use large for 10G+ GPU, medium for 5G, small for 2G": "10G以上のGPUには大、5Gには中、2Gには小を使用してください",
"VITS Configuration": "VITS の構成",
"VQGAN Configuration": "VQGAN の構成",
"Validation Batch Size": "検証バッチサイズ",
"View the status of the preprocessing folder (use the slider to control the depth of the tree)": "前処理フォルダの状態を表示(スライダーを使用してツリーの深さを制御)",
"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "モデルの誤用については一切責任を負いません。使用する前に、現地の法律と規制を考慮してください。",
"WebUI Host": "WebUIホスト",
"WebUI Port": "WebUIポート",
"Whisper Model": "Whisperモデル",
"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "ソースコードは[こちら](https://github.com/fishaudio/fish-speech)、モデルは[こちら](https://huggingface.co/fishaudio/fish-speech-1)にあります。",
"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30シリーズ以降のGPUにはbf16-trueを、10シリーズ以降のGPUには16-mixedをお勧めします",
"latest": "最新",
"new": "新規",
"Realtime Transform Text": "リアルタイム変換テキスト",
"Normalization Result Preview (Currently Only Chinese)": "正規化結果プレビュー(現在は中国語のみ)",
"Text Normalization": "テキスト正規化",
"Select Example Audio": "サンプル音声を選択"
}
{
"16-mixed is recommended for 10+ series GPU": "10+ 시리즈 GPU에는 16-mixed를 권장합니다.",
"5 to 10 seconds of reference audio, useful for specifying speaker.": "화자를 특정하는 데 유의미한 5~10초의 길이의 참조 오디오 데이터.",
"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "[Fish Audio](https://fish.audio)에서 개발한 VQ-GAN 및 Llama 기반의 텍스트 음성 변환 모델.",
"Accumulate Gradient Batches": "그라디언트 배치 누적",
"Add to Processing Area": "처리 영역에 추가",
"Added path successfully!": "경로가 성공적으로 추가되었습니다!",
"Advanced Config": "고급 설정",
"Base LLAMA Model": "기본 LLAMA 모델",
"Batch Inference": "배치 추론",
"Batch Size": "배치 크기",
"Changing with the Model Path": "모델 경로에 따라 변경 중",
"Chinese": "중국어",
"Compile Model": "모델 컴파일",
"Compile the model can significantly reduce the inference time, but will increase cold start time": "모델을 컴파일하면 추론 시간이 크게 줄어들지만, 초기 시작 시간이 길어집니다.",
"Copy": "복사",
"Data Preprocessing": "데이터 전처리",
"Data Preprocessing Path": "데이터 전처리 경로",
"Data Source": "데이터 소스",
"Decoder Model Config": "디코더 모델 설정",
"Decoder Model Path": "디코더 모델 경로",
"Disabled": "비활성화 됨",
"Enable Reference Audio": "참고 음성 활성화",
"English": "영어",
"Error Message": "오류 메시지",
"File Preprocessing": "파일 전처리",
"Generate": "생성",
"Generated Audio": "생성된 오디오",
"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "오디오애 대응하는 텍스트가 없을 경우, ASR을 적용해 지원하며, .txt 또는 .lab 형식을 지원합니다.",
"Infer interface is closed": "추론 인터페이스가 닫혔습니다.",
"Inference Configuration": "추론 설정",
"Inference Server Configuration": "추론 서버 설정",
"Inference Server Error": "추론 서버 오류",
"Inferring interface is launched at {}": "추론 인터페이스가 {}에서 시작되었습니다.",
"Initial Learning Rate": "초기 학습률",
"Input Audio & Source Path for Transcription": "전사할 입력 오디오 및 소스 경로",
"Input Text": "입력 텍스트",
"Invalid path: {}": "유효하지 않은 경로: {}",
"It is recommended to use CUDA, if you have low configuration, use CPU": "CUDA 사용을 권장하며, 낮은 사양일 경우 CPU를 사용하는 것을 권장합니다.",
"Iterative Prompt Length, 0 means off": "반복 프롬프트 길이. (0:비활성화)",
"Japanese": "일본어",
"LLAMA Configuration": "LLAMA 설정",
"LLAMA Model Config": "LLAMA 모델 설정",
"LLAMA Model Path": "LLAMA 모델 경로",
"Labeling Device": "라벨링 장치",
"LoRA Model to be merged": "병합할 LoRA 모델",
"Maximum Audio Duration": "최대 오디오 길이",
"Maximum Length per Sample": "샘플당 최대 길이",
"Maximum Training Steps": "최대 학습 단계",
"Maximum tokens per batch, 0 means no limit": "배치당 최대 토큰 수(0:제한 없음)",
"Merge": "병합",
"Merge LoRA": "LoRA 병합",
"Merge successfully": "성공적으로 병합 되었습니다.",
"Minimum Audio Duration": "최소 오디오 길이",
"Model Output Path": "모델 출력 경로",
"Model Size": "모델 크기",
"Move": "이동",
"Move files successfully": "파일이 성공적으로 이동되었습니다.",
"No audio generated, please check the input text.": "생성된 오디오가 없습니다. 입력된 텍스트를 확인하세요.",
"No selected options": "옵션이 선택되지 않았습니다.",
"Number of Workers": "작업자 수",
"Open Inference Server": "추론 서버 열기",
"Open Labeler WebUI": "라벨러 WebUI 열기",
"Open Tensorboard": "Tensorboard 열기",
"Opened labeler in browser": "브라우저에서 라벨러가 열렸습니다.",
"Optional Label Language": "선택적 라벨 언어",
"Optional online ver": "온라인 버전 선택",
"Output Path": "출력 경로",
"Path error, please check the model file exists in the corresponding path": "경로 오류, 해당 경로에 모델 파일이 있는지 확인하십시오.",
"Precision": "정밀도",
"Probability of applying Speaker Condition": "화자 조건 적용 확률",
"Put your text here.": "여기에 텍스트를 입력하세요.",
"Reference Audio": "참고 오디오",
"Reference Text": "참고 텍스트",
"Related code and weights are released under CC BY-NC-SA 4.0 License.": "관련 코드 및 가중치는 CC BY-NC-SA 4.0 라이선스 하에 배포됩니다.",
"Remove Selected Data": "선택한 데이터 제거",
"Removed path successfully!": "경로가 성공적으로 제거되었습니다!",
"Repetition Penalty": "반복 패널티",
"Save model every n steps": "n 단계마다 모델 저장",
"Select LLAMA ckpt": "LLAMA ckpt 선택",
"Select VITS ckpt": "VITS ckpt 선택",
"Select VQGAN ckpt": "VQGAN ckpt 선택",
"Select source file processing method": "소스 파일 처리 방법 선택",
"Select the model to be trained (Depending on the Tab page you are on)": "학습할 모델 선택(탭 페이지에 따라 다름)",
"Selected: {}": "선택됨: {}",
"Speaker": "화자",
"Speaker is identified by the folder name": "화자는 폴더 이름으로 식별됩니다",
"Start Training": "학습 시작",
"Streaming Audio": "스트리밍 오디오",
"Streaming Generate": "스트리밍 생성",
"Tensorboard Host": "Tensorboard 호스트",
"Tensorboard Log Path": "Tensorboard 로그 경로",
"Tensorboard Port": "Tensorboard 포트",
"Tensorboard interface is closed": "Tensorboard 인터페이스가 닫혔습니다",
"Tensorboard interface is launched at {}": "Tensorboard 인터페이스가 {}에서 시작되었습니다.",
"Text is too long, please keep it under {} characters.": "텍스트가 너무 깁니다. {}자 이하로 입력해주세요.",
"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "왼쪽의 입력 폴더 경로 또는 파일 목록의 경로. 체크 여부에 관계없이 이 목록에서 후속 학습에 사용됩니다.",
"Training Configuration": "학습 설정",
"Training Error": "학습 오류",
"Training stopped": "학습이 중지되었습니다.",
"Type name of the speaker": "화자의 이름을 입력하세요.",
"Type the path or select from the dropdown": "경로를 입력하거나 드롭다운에서 선택하세요.",
"Use LoRA": "LoRA 사용",
"Use LoRA can save GPU memory, but may reduce the quality of the model": "LoRA를 사용하면 GPU 메모리를 절약할 수 있지만, 모델의 품질이 저하될 수 있습니다.",
"Use filelist": "파일 목록 사용",
"Use large for 10G+ GPU, medium for 5G, small for 2G": "10G+ GPU 환경에선 large, 5G에선 medium, 2G에선 small을 사용할 것을 권장합니다.",
"VITS Configuration": "VITS 설정",
"VQGAN Configuration": "VQGAN 설정",
"Validation Batch Size": "검증 배치 크기",
"View the status of the preprocessing folder (use the slider to control the depth of the tree)": "전처리 폴더의 상태를 확인합니다(슬라이더를 사용하여 트리의 깊이를 조절합니다)",
"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "모델의 오용에 대해 책임지지 않습니다. 사용하기 전에 현지 법률과 규정을 고려하시길 바랍니다.",
"WebUI Host": "WebUI 호스트",
"WebUI Port": "WebUI 포트",
"Whisper Model": "Whisper 모델",
"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "소스 코드는 [이곳](https://github.com/fishaudio/fish-speech)에서, 모델은 [이곳](https://huggingface.co/fishaudio/fish-speech-1)에서 확인하실 수 있습니다.",
"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30+ 시리즈 GPU에는 bf16-true를, 10+ 시리즈 GPU에는 16-mixed를 권장합니다",
"latest": "최신",
"new": "새로운",
"Realtime Transform Text": "실시간 텍스트 변환",
"Normalization Result Preview (Currently Only Chinese)": "정규화 결과 미리보기(현재 중국어만 지원)",
"Text Normalization": "텍스트 정규화",
"Select Example Audio": "예시 오디오 선택"
}
{
"5 to 10 seconds of reference audio, useful for specifying speaker.": "5 a 10 segundos de áudio de referência, útil para especificar o orador.",
"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "Um modelo de texto para fala baseado em VQ-GAN e Llama desenvolvido por [Fish Audio](https://fish.audio).",
"Accumulate Gradient Batches": "Acumular Lotes de Gradiente",
"Add to Processing Area": "Adicionar à Área de Processamento",
"Added path successfully!": "Caminho adicionado com sucesso!",
"Advanced Config": "Configuração Avançada",
"Base LLAMA Model": "Modelo LLAMA Base",
"Batch Inference": "Inferência em Lote",
"Batch Size": "Tamanho do Lote",
"Changing with the Model Path": "Alterando com o Caminho do Modelo",
"Compile Model": "Compilar Modelo",
"Compile the model can significantly reduce the inference time, but will increase cold start time": "Compilar o modelo pode reduzir significativamente o tempo de inferência, mas aumentará a latência inicial",
"Copy": "Copiar",
"Data Preprocessing": "Pré-processamento de Dados",
"Data Preprocessing Path": "Caminho de Pré-processamento de Dados",
"Data Source": "Fonte de Dados",
"Decoder Model Config": "Configuração do Modelo Decodificador",
"Decoder Model Path": "Caminho do Modelo Decodificador",
"Disabled": "Desativado",
"Enable Initial Prompt": "Habilitar Prompt Inicial",
"Enable Reference Audio": "Habilitar Áudio de Referência",
"English": "Inglês",
"Japanese": "Japonês",
"Chinese": "Chinês",
"Portuguese": "Português",
"Spanish": "Espanhol",
"Error Message": "Mensagem de Erro",
"Faster Whisper, Up to 5g GPU memory usage": "Faster Whisper (Usa até 5 GB de vRAM)",
"File Preprocessing": "Pré-processamento de Arquivos",
"Generate": "Gerar",
"Generated Audio": "Áudio Gerado",
"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "Se não houver texto correspondente ao áudio, utilize o ASR para assistência (formatos .txt ou .lab)",
"Infer interface is closed": "A interface de inferência foi fechada",
"Inference Configuration": "Configuração de Inferência",
"Inference Server Configuration": "Configuração do Servidor de Inferência",
"Inference Server Error": "Erro do Servidor de Inferência",
"Inferring interface is launched at {}": "A interface de inferência foi iniciada em {}",
"Initial Learning Rate": "Taxa de Aprendizagem Inicial",
"Initial Prompt": "Prompt Inicial",
"Initial prompt can provide contextual or vocabulary-specific guidance to the model.": "O prompt inicial pode fornecer orientação contextual ou específica de vocabulário para o modelo.",
"Input Audio & Source Path for Transcription": "Entrada de Áudio/Caminho de Origem para Transcrição",
"Input Text": "Texto de Entrada",
"Invalid path: {}": "Caminho inválido: {}",
"It is recommended to use CUDA, if you have low configuration, use CPU": "Para GPUs Nvidia é recomendado usar CUDA. Se não tiver uma GPU Nvidia, use CPU",
"Iterative Prompt Length, 0 means off": "Comprimento do Prompt Iterativo (0 = desativado)",
"LLAMA Configuration": "Configuração do LLAMA",
"LLAMA Model Config": "Configuração do Modelo LLAMA",
"LLAMA Model Path": "Caminho do Modelo LLAMA",
"Labeling Device": "Dispositivo de Rotulagem",
"LoRA Model to be merged": "Modelo LoRA para mesclagem",
"Maximum Length per Sample": "Comprimento Máximo por Amostra",
"Maximum Training Steps": "Etapas Máximas de Treinamento",
"Maximum tokens per batch, 0 means no limit": "Número máximo de tokens por lote, 0 significa sem limite",
"Merge": "Mesclar",
"Merge LoRA": "Mesclar LoRA",
"Merge successfully": "Mesclado com sucesso",
"Model Output Path": "Caminho de Saída do Modelo",
"Model Quantization": "Quantização do Modelo",
"Model Size": "Tamanho do Modelo",
"Move": "Mover",
"Move files successfully": "Arquivos movidos com sucesso",
"No audio generated, please check the input text.": "Nenhum áudio gerado, verifique o texto de entrada.",
"No selected options": "Nenhuma opção selecionada",
"Normalization Result Preview (Currently Only Chinese)": "Pré-visualização do Resultado da Normalização (Atualmente Apenas Chinês)",
"Number of Workers": "Número de Processos",
"Open Inference Server": "Abrir Servidor de Inferência",
"Open Labeler WebUI": "Abrir WebUI de Rotulagem",
"Open Tensorboard": "Abrir Tensorboard",
"Opened labeler in browser": "WebUI de rotulagem aberta no navegador",
"Optional Label Language": "Idioma do Rótulo (Opcional)",
"Optional online ver": "Versão online (opcional)",
"Output Path": "Caminho de Saída",
"Path error, please check the model file exists in the corresponding path": "Erro de caminho, verifique se o arquivo do modelo existe no caminho correspondente",
"Post-quantification Precision": "Precisão Pós-quantização",
"Precision": "Precisão",
"Probability of applying Speaker Condition": "Probabilidade de Aplicar Condição de Orador",
"Put your text here.": "Insira seu texto aqui.",
"Quantify": "Quantizar",
"Quantify successfully": "Quantizado com sucesso",
"Realtime Transform Text": "Transformar Texto em Tempo Real",
"Reference Audio": "Áudio de Referência",
"Reference Text": "Texto de Referência",
"warning": "Aviso",
"Pre-processing begins...": "O pré-processamento começou!",
"Related code and weights are released under CC BY-NC-SA 4.0 License.": "O código relacionado e os pesos são licenciados sob a Licença CC BY-NC-SA 4.0.",
"Remove Selected Data": "Remover Dados Selecionados",
"Removed path successfully!": "Caminho removido com sucesso!",
"Repetition Penalty": "Penalidade de Repetição",
"Save model every n steps": "Salvar modelo a cada n etapas",
"Select LLAMA ckpt": "Selecionar .ckpt do LLAMA",
"Select source file processing method": "Escolha como processar o arquivo de origem",
"Select the model to be trained (Depending on the Tab page you are on)": "Selecione o modelo para o treinamento (dependendo da aba em que você está)",
"Selected: {}": "Selecionado: {}",
"Speaker is identified by the folder name": "O orador é identificado pelo nome da pasta",
"Start Training": "Iniciar Treinamento",
"Streaming Audio": "Áudio em Streaming",
"Streaming Generate": "Geração em Streaming",
"Tensorboard Host": "Host do Tensorboard",
"Tensorboard Log Path": "Caminho de Log do Tensorboard",
"Tensorboard Port": "Porta do Tensorboard",
"Tensorboard interface is closed": "A interface do Tensorboard está fechada",
"Tensorboard interface is launched at {}": "A interface do Tensorboard foi iniciada em {}",
"Text Normalization": "Normalização de Texto",
"Text is too long, please keep it under {} characters.": "O texto é muito longo. Mantenha-o com menos de {} caracteres.",
"The lower the quantitative precision, the more the effectiveness may decrease, but the greater the efficiency will increase": "Quanto menor a precisão quantitativa, mais a eficácia pode diminuir, mas maior será o aumento da eficiência",
"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "O caminho da pasta de entrada à esquerda ou a lista de arquivos. Independentemente de estar marcada ou não, ela será utilizada para o treinamento subsequente nesta lista.",
"Training Configuration": "Configuração de Treinamento",
"Training Error": "Erro de Treinamento",
"Training stopped": "Treinamento interrompido!",
"Type the path or select from the dropdown": "Digite o caminho ou selecione no menu suspenso",
"Use LoRA": "Usar LoRA",
"Use LoRA can save GPU memory, but may reduce the quality of the model": "O uso de LoRAs pode economizar memória da GPU, mas também pode reduzir a qualidade",
"Use filelist": "Usar lista de arquivos",
"VQGAN Configuration": "Configuração do VQGAN",
"View the status of the preprocessing folder (use the slider to control the depth of the tree)": "Visualizar o status da pasta de pré-processamento (use o controle deslizante para controlar a profundidade da árvore)",
"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "Não nos responsabilizamos por qualquer uso indevido do modelo. Por favor, considere as leis e regulamentações locais antes de usá-lo.",
"WebUI Host": "Host da WebUI",
"WebUI Port": "Porta da WebUI",
"Whisper Model": "Modelo Whisper",
"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "Você pode encontrar o código fonte [aqui](https://github.com/fishaudio/fish-speech) e os modelos [aqui](https://huggingface.co/fishaudio/fish-speech-1).",
"auto": "automático",
"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "bf16-true é recomendado para GPUs da série 30+, 16-mixed é recomendado para GPUs da série 10+",
"latest": "mais recente",
"new": "novo",
"This audio introduces the basic concepts and applications of artificial intelligence and machine learning.": "Este áudio introduz os conceitos básicos e aplicações de inteligência artificial e aprendizado de máquina.",
"You don't need to train this model!": "Não é necessário treinar este modelo!",
"Yes": "Sim",
"No": "Não",
"version:": "versão:",
"author:": "autor:"
}
{
"16-mixed is recommended for 10+ series GPU": "10+ 系列 GPU 建议使用 16-mixed",
"5 to 10 seconds of reference audio, useful for specifying speaker.": "5 到 10 秒的参考音频,适用于指定音色。",
"A text-to-speech model based on VQ-GAN and Llama developed by [Fish Audio](https://fish.audio).": "由 [Fish Audio](https://fish.audio) 研发的基于 VQ-GAN 和 Llama 的多语种语音合成.",
"Accumulate Gradient Batches": "梯度累积批次",
"Add to Processing Area": "加入处理区",
"Added path successfully!": "添加路径成功!",
"Advanced Config": "高级参数",
"Base LLAMA Model": "基础 LLAMA 模型",
"Batch Inference": "批量推理",
"Batch Size": "批次大小",
"Changing with the Model Path": "随模型路径变化",
"Chinese": "中文",
"Compile Model": "编译模型",
"Compile the model can significantly reduce the inference time, but will increase cold start time": "编译模型可以显著减少推理时间,但会增加冷启动时间",
"Copy": "复制",
"Data Preprocessing": "数据预处理",
"Data Preprocessing Path": "数据预处理路径",
"Data Source": "数据源",
"Decoder Model Config": "解码器模型配置",
"Decoder Model Path": "解码器模型路径",
"Disabled": "禁用",
"Enable Reference Audio": "启用参考音频",
"English": "英文",
"Error Message": "错误信息",
"File Preprocessing": "文件预处理",
"Generate": "生成",
"Generated Audio": "音频",
"If there is no corresponding text for the audio, apply ASR for assistance, support .txt or .lab format": "如果音频没有对应的文本,可以应用 ASR 辅助,支持 .txt 或 .lab 格式",
"Infer interface is closed": "推理界面已关闭",
"Inference Configuration": "推理配置",
"Inference Server Configuration": "推理服务器配置",
"Inference Server Error": "推理服务器错误",
"Inferring interface is launched at {}": "推理界面已在 {} 上启动",
"Initial Learning Rate": "初始学习率",
"Input Audio & Source Path for Transcription": "输入音频和转录源路径",
"Input Text": "输入文本",
"Invalid path: {}": "无效路径: {}",
"It is recommended to use CUDA, if you have low configuration, use CPU": "建议使用 CUDA,如果配置较低,使用 CPU",
"Iterative Prompt Length, 0 means off": "迭代提示长度,0 表示关闭",
"Japanese": "日文",
"LLAMA Configuration": "LLAMA 配置",
"LLAMA Model Config": "LLAMA 模型配置",
"LLAMA Model Path": "LLAMA 模型路径",
"Labeling Device": "标注加速设备",
"LoRA Model to be merged": "要合并的 LoRA 模型",
"Maximum Audio Duration": "最大音频时长",
"Maximum Length per Sample": "每个样本的最大长度",
"Maximum Training Steps": "最大训练步数",
"Maximum tokens per batch, 0 means no limit": "每批最大令牌数,0 表示无限制",
"Merge": "合并",
"Merge LoRA": "合并 LoRA",
"Merge successfully": "合并成功",
"Minimum Audio Duration": "最小音频时长",
"Model Output Path": "模型输出路径",
"Model Size": "模型规模",
"Move": "移动",
"Move files successfully": "移动文件成功",
"No audio generated, please check the input text.": "没有生成音频,请检查输入文本.",
"No selected options": "没有选择的选项",
"Number of Workers": "数据加载进程数",
"Open Inference Server": "打开推理服务器",
"Open Labeler WebUI": "打开标注工具",
"Open Tensorboard": "打开 Tensorboard",
"Opened labeler in browser": "在浏览器中打开标注工具",
"Optional Label Language": "[可选] 标注语言",
"Optional online ver": "[可选] 使用在线版",
"Output Path": "输出路径",
"Path error, please check the model file exists in the corresponding path": "路径错误,请检查模型文件是否存在于相应路径",
"Precision": "精度",
"Probability of applying Speaker Condition": "应用说话人条件的概率",
"Put your text here.": "在此处输入文本.",
"Reference Audio": "参考音频",
"Reference Text": "参考文本",
"Related code and weights are released under CC BY-NC-SA 4.0 License.": "相关代码和权重使用 CC BY-NC-SA 4.0 许可证发布.",
"Remove Selected Data": "移除选中数据",
"Removed path successfully!": "移除路径成功!",
"Repetition Penalty": "重复惩罚",
"Save model every n steps": "每 n 步保存模型",
"Select LLAMA ckpt": "选择 LLAMA 检查点",
"Select VITS ckpt": "选择 VITS 检查点",
"Select VQGAN ckpt": "选择 VQGAN 检查点",
"Select source file processing method": "选择源文件处理方法",
"Select the model to be trained (Depending on the Tab page you are on)": "根据您所在的选项卡页面选择要训练的模型",
"Selected: {}": "已选择: {}",
"Speaker": "说话人",
"Speaker is identified by the folder name": "自动根据父目录名称识别说话人",
"Start Training": "开始训练",
"Streaming Audio": "流式音频",
"Streaming Generate": "流式合成",
"Tensorboard Host": "Tensorboard 监听地址",
"Tensorboard Log Path": "Tensorboard 日志路径",
"Tensorboard Port": "Tensorboard 端口",
"Tensorboard interface is closed": "Tensorboard 界面已关闭",
"Tensorboard interface is launched at {}": "Tensorboard 界面已在 {} 上启动",
"Text is too long, please keep it under {} characters.": "文本太长,请保持在 {} 个字符以内.",
"The path of the input folder on the left or the filelist. Whether checked or not, it will be used for subsequent training in this list.": "左侧输入文件夹的路径或文件列表。无论是否选中,都将在此列表中用于后续训练.",
"Training Configuration": "训练配置",
"Training Error": "训练错误",
"Training stopped": "训练已停止",
"Type name of the speaker": "输入说话人的名称",
"Type the path or select from the dropdown": "输入路径或从下拉菜单中选择",
"Use LoRA": "使用 LoRA",
"Use LoRA can save GPU memory, but may reduce the quality of the model": "使用 LoRA 可以节省 GPU 内存,但可能会降低模型质量",
"Use filelist": "使用文件列表",
"Use large for 10G+ GPU, medium for 5G, small for 2G": "10G+ GPU 使用 large, 5G 使用 medium, 2G 使用 small",
"VITS Configuration": "VITS 配置",
"VQGAN Configuration": "VQGAN 配置",
"Validation Batch Size": "验证批次大小",
"View the status of the preprocessing folder (use the slider to control the depth of the tree)": "查看预处理文件夹的状态 (使用滑块控制树的深度)",
"We are not responsible for any misuse of the model, please consider your local laws and regulations before using it.": "我们不对模型的任何滥用负责,请在使用之前考虑您当地的法律法规.",
"WebUI Host": "WebUI 监听地址",
"WebUI Port": "WebUI 端口",
"Whisper Model": "Whisper 模型",
"You can find the source code [here](https://github.com/fishaudio/fish-speech) and models [here](https://huggingface.co/fishaudio/fish-speech-1).": "你可以在 [这里](https://github.com/fishaudio/fish-speech) 找到源代码和 [这里](https://huggingface.co/fishaudio/fish-speech-1) 找到模型.",
"bf16-true is recommended for 30+ series GPU, 16-mixed is recommended for 10+ series GPU": "30+ 系列 GPU 建议使用 bf16-true, 10+ 系列 GPU 建议使用 16-mixed",
"latest": "最近的检查点",
"new": "创建新的检查点",
"Realtime Transform Text": "实时规范化文本",
"Normalization Result Preview (Currently Only Chinese)": "规范化结果预览",
"Text Normalization": "文本规范化",
"Select Example Audio": "选择参考音频"
}
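The locale files above are flat JSON maps from the English source string (the key) to its translation. The snippet below is a minimal lookup sketch under that assumption; the helper names and the locale path are illustrative only, not the actual fish_speech i18n API, and a missing key simply falls back to the English text.

# Minimal i18n lookup sketch (illustrative; not the actual fish_speech helper).
import json
from pathlib import Path


def load_locale(path: Path) -> dict[str, str]:
    # Each locale file maps the English source string to its translation.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def translate(table: dict[str, str], text: str) -> str:
    # Fall back to the English source string when no translation exists.
    return table.get(text, text)


# Hypothetical usage; the real locale directory is defined elsewhere in the repo.
# ja = load_locale(Path("fish_speech/i18n/locale/ja_JP.json"))
# print(translate(ja, "Put your text here."))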
import ast
import glob
import json
from collections import OrderedDict
from pathlib import Path
from loguru import logger
from .core import DEFAULT_LANGUAGE, I18N_FILE_PATH
def extract_i18n_strings(node):
    i18n_strings = []

    if (
        isinstance(node, ast.Call)
        and isinstance(node.func, ast.Name)
        and node.func.id == "i18n"
    ):
        for arg in node.args:
            if isinstance(arg, ast.Str):
                i18n_strings.append(arg.s)

    for child_node in ast.iter_child_nodes(node):
        i18n_strings.extend(extract_i18n_strings(child_node))

    return i18n_strings
# scan the directory for all .py files (recursively)
# for each file, parse the code into an AST
# for each AST, extract the i18n strings

strings = []
folders = ["fish_speech", "tools"]
# for filename in glob.iglob("**/*.py", recursive=True):
for folder in folders:
    for f in Path(folder).rglob("*.py"):
        code = f.read_text(encoding="utf-8")
        if "i18n(" in code:
            tree = ast.parse(code)
            i18n_strings = extract_i18n_strings(tree)
            logger.info(f"Found {len(i18n_strings)} i18n strings in {f}")
            strings.extend(i18n_strings)

code_keys = set(strings)
logger.info(f"Total unique: {len(code_keys)}")

# Load the standard (default-language) file and compare its keys with the code
standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"
with open(standard_file, "r", encoding="utf-8") as f:
    standard_data = json.load(f, object_pairs_hook=OrderedDict)
standard_keys = set(standard_data.keys())

# Keys present in the standard file but no longer referenced in code
unused_keys = standard_keys - code_keys
logger.info(f"Found {len(unused_keys)} unused keys in {standard_file}")
for unused_key in unused_keys:
    logger.info(f"\t{unused_key}")

# Keys referenced in code but missing from the standard file
missing_keys = code_keys - standard_keys
logger.info(f"Found {len(missing_keys)} missing keys in {standard_file}")
for missing_key in missing_keys:
    logger.info(f"\t{missing_key}")

code_keys_dict = OrderedDict()
for s in strings:
    code_keys_dict[s] = s

# write back
with open(standard_file, "w", encoding="utf-8") as f:
    json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True)
    f.write("\n")

logger.info(f"Updated {standard_file}")
# Define the standard file name
standard_file = I18N_FILE_PATH / f"{DEFAULT_LANGUAGE}.json"

# Find all JSON files in the directory
dir_path = I18N_FILE_PATH
languages = [f for f in dir_path.glob("*.json") if f.stem != DEFAULT_LANGUAGE]

# Load the standard file
with open(standard_file, "r", encoding="utf-8") as f:
    standard_data = json.load(f, object_pairs_hook=OrderedDict)

# Loop through each language file
for lang_file in languages:
    # Load the language file
    with open(lang_file, "r", encoding="utf-8") as f:
        lang_data = json.load(f, object_pairs_hook=OrderedDict)

    # Find the difference between the language file and the standard file
    diff = set(standard_data.keys()) - set(lang_data.keys())
    miss = set(lang_data.keys()) - set(standard_data.keys())

    # Add any missing keys to the language file, marked as untranslated
    for key in diff:
        lang_data[key] = "#!" + key
        logger.info(f"Added missing key: {key} to {lang_file}")

    # Delete any extra keys from the language file
    for key in miss:
        del lang_data[key]
        logger.info(f"Deleted extra key: {key} from {lang_file}")

    # Sort the keys of the language file to match the order of the standard file
    lang_data = OrderedDict(
        sorted(lang_data.items(), key=lambda x: list(standard_data.keys()).index(x[0]))
    )

    # Save the updated language file
    with open(lang_file, "w", encoding="utf-8") as f:
        json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True)
        f.write("\n")

    logger.info(f"Updated {lang_file}")

logger.info("Done")