Commit d8a47abb authored by Lengyue, committed by zjsun

Init preprocessing tools

parent c2c14c67
{
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": 1.0,
    "bf16": {
        "enabled": "auto"
    },
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": true,
        "stage3_gather_16bit_weights_on_model_save": true
    },
    "flops_profiler": {
        "enabled": false,
        "profile_step": 1,
        "module_depth": -1,
        "top_modules": 1,
        "detailed": true,
        "output_file": null
    }
}
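The "auto" values above are placeholders: when this file is passed to the Hugging Face Trainer through the --deepspeed flag (as the launch script below does with tools/tts/ds_config.json), the Trainer fills them in from its own TrainingArguments. A minimal sketch of that wiring, with illustrative values taken from the launch script:

from transformers import TrainingArguments

# "auto" entries in the JSON are resolved from these arguments:
#   per_device_train_batch_size -> train_micro_batch_size_per_gpu
#   gradient_accumulation_steps -> gradient_accumulation_steps
#   bf16                        -> bf16.enabled
args = TrainingArguments(
    output_dir="results",
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    bf16=True,
    deepspeed="tools/tts/ds_config.json",
)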
from dataclasses import dataclass, field
from functools import partial
from typing import Optional

from datasets import load_dataset, load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorWithPadding,
    HfArgumentParser,
    Trainer,
)
from transformers import TrainingArguments as _TrainingArguments


@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(default="baichuan-inc/Baichuan2-7B-Base")


@dataclass
class DataArguments:
    data_path: str = field(
        default=None, metadata={"help": "Path to the training data."}
    )


@dataclass
class TrainingArguments(_TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    model_max_length: int = field(
        default=512,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    use_lora: bool = field(default=False)


def dataset_transform(batch, tokenizer: AutoTokenizer = None):
    outputs = tokenizer(
        batch["prompt"],
        padding="longest",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    labels = outputs.input_ids.clone()
    # Set labels at padded positions to -100 so they are ignored by the loss
    labels[outputs.attention_mask == 0] = -100

    return {
        "input_ids": outputs.input_ids,
        "attention_mask": outputs.attention_mask,
        "labels": labels,
    }


def train():
    parser = HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        trust_remote_code=True,
        cache_dir=training_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        use_fast=False,
        trust_remote_code=True,
        model_max_length=training_args.model_max_length,
        cache_dir=training_args.cache_dir,
    )
    tokenizer.pad_token_id = tokenizer.eos_token_id

    if training_args.use_lora:
        from peft import LoraConfig, TaskType, get_peft_model

        peft_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            target_modules=["W_pack"],
            inference_mode=False,
            r=16,
            lora_alpha=64,
            lora_dropout=0.1,
        )
        model.enable_input_require_grads()
        model = get_peft_model(model, peft_config)
        model.print_trainable_parameters()

    # Prefer a dataset saved with save_to_disk; fall back to loading from the Hub
    try:
        dataset = load_from_disk(data_args.data_path)
        if "train" in dataset:
            dataset = dataset["train"]
    except Exception:
        dataset = load_dataset(data_args.data_path, split="train")

    dataset.set_transform(partial(dataset_transform, tokenizer=tokenizer))
    dataset = dataset.train_test_split(test_size=1000, seed=42)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer),
    )
    trainer.train()
    trainer.save_state()
    trainer.save_model(output_dir=training_args.output_dir)


if __name__ == "__main__":
    train()
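For reference, dataset_transform expects each record to carry a pre-built "prompt" string (the preprocessing script further down produces these). A minimal sketch of that contract, using a hypothetical record and assuming a tokenizer whose vocabulary already includes the <encodec_*> tokens:

from transformers import AutoTokenizer

# Hypothetical checkpoint path, as used in the launch script below
tokenizer = AutoTokenizer.from_pretrained(
    "checkpoints/llama2-tiny-init", use_fast=False, trust_remote_code=True
)
batch = {"prompt": ["[INST] [SPK] libritts_19 [/SPK] Hello world. [/INST] <encodec_1> <encodec_2>"]}
features = dataset_transform(batch, tokenizer=tokenizer)
# features["input_ids"] / ["attention_mask"] are padded to the longest prompt;
# features["labels"] equals input_ids except at padded positions, which are -100.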
export NCCL_P2P_DISABLE=1
hostfile=""
deepspeed --hostfile=$hostfile tools/tts/fine-tune.py \
--deepspeed tools/tts/ds_config.json \
--report_to "tensorboard" \
--data_path "fishaudio/libritts-r-encodec" \
--model_name_or_path "checkpoints/llama2-tiny-init" \
--output_dir "results" \
--model_max_length 4096 \
--max_steps 500000 \
--per_device_train_batch_size 32 \
--gradient_accumulation_steps 1 \
--save_strategy steps \
--save_steps 10000 \
--evaluation_strategy steps \
--eval_steps 10000 \
--learning_rate 1e-3 \
--lr_scheduler_type cosine \
--adam_beta1 0.9 \
--adam_beta2 0.98 \
--adam_epsilon 1e-8 \
--max_grad_norm 1.0 \
--weight_decay 1e-4 \
--warmup_steps 10000 \
--logging_steps 1 \
--gradient_checkpointing True \
--remove_unused_columns False \
--use_lora False \
--bf16 True \
--tf32 True
from pathlib import Path

import librosa
import torch
from datasets import Dataset
from multiprocess import set_start_method
from transformers import AutoProcessor, EncodecModel

set_start_method("spawn", force=True)

encodec_name = "facebook/encodec_24khz"
encodec_processor = AutoProcessor.from_pretrained(encodec_name)
encodec_model = EncodecModel.from_pretrained(encodec_name)
encodec_model.eval()


def tokenize(text, audio, sr=None, speaker=None):
    assert sr is None or sr == encodec_processor.sampling_rate

    if isinstance(audio, (str, Path)):
        audio, sr = librosa.load(audio, sr=sr, mono=True)

    prompt = "[INST] "
    if speaker:
        prompt += f"[SPK] {speaker} [/SPK] "
    prompt += f"{text} [/INST] "

    inputs = encodec_processor(
        raw_audio=audio, sampling_rate=sr, return_tensors="pt"
    ).to(encodec_model.device)
    outputs = encodec_model.encode(
        inputs["input_values"], inputs["padding_mask"], bandwidth=1.5, return_dict=True
    )

    assert outputs.audio_codes.dim() == 4  # [chunk, batch, codebook, frame]
    assert outputs.audio_codes.shape[0] == outputs.audio_codes.shape[1] == 1

    # Keep only the first codebook and render it as <encodec_*> text tokens
    codes = outputs.audio_codes[0, 0, 0, :].long()
    codes_str = " ".join([f"<encodec_{int(c)}>" for c in codes.tolist()])
    prompt += codes_str

    return {
        "prompt": prompt,
        "codes": codes,
    }


def wrap_tokenize(x):
    device = torch.device("cuda", 0)
    if encodec_model.device != device:
        encodec_model.to(device)

    return tokenize(
        text=x["text"],
        audio=x["raw_path"],
        sr=encodec_processor.sampling_rate,
        speaker=x["speaker"],
    )


def generator_libritts_r():
    base = Path("dataset/tts/LibriTTS_R")

    for i in base.rglob("*.wav"):
        text_file = i.with_suffix(".normalized.txt")
        if not text_file.exists():
            continue

        text = text_file.read_text().strip()
        yield {
            "text": text,
            "speaker": f"libritts_{i.parent.parent.name}",
            "raw_path": str(i),
            "path": str(i.relative_to(base)),
        }


if __name__ == "__main__":
    dataset = Dataset.from_generator(generator_libritts_r)
    dataset = dataset.map(wrap_tokenize, num_proc=12)
    dataset = dataset.remove_columns(["raw_path"])
    dataset.save_to_disk("dataset/tts/libritts-r-encodec")
    dataset.push_to_hub("fishaudio/libritts-r-encodec", private=True)
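Each resulting record stores the whole training example as a single prompt string. For a clip by speaker 1034, the output of tokenize looks like this (token ids made up for illustration):

[INST] [SPK] libritts_1034 [/SPK] Hello world. [/INST] <encodec_17> <encodec_923> <encodec_5> ...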
import random
import subprocess
from multiprocessing import Pool, cpu_count
from pathlib import Path

from tqdm import tqdm


def convert_to_flac(src_file_path):
    dst_file_path = src_file_path.with_suffix(".flac")
    dst_file_path.parent.mkdir(parents=True, exist_ok=True)

    try:
        subprocess.check_call(
            [
                "ffmpeg", "-y",
                "-i", str(src_file_path),
                "-acodec", "flac",
                "-threads", "0",
                str(dst_file_path),
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        # Remove the input file
        src_file_path.unlink()
        return True
    except subprocess.CalledProcessError:
        return False


if __name__ == "__main__":
    src_dir = Path("dataset/tts/WenetSpeech/cleaned")
    wav_files = list(src_dir.rglob("*.wav"))
    random.shuffle(wav_files)
    print(f"Found {len(wav_files)} wav files")

    success_counter = 0
    fail_counter = 0

    with Pool(processes=cpu_count(), maxtasksperchild=100) as pool:
        with tqdm(
            pool.imap_unordered(convert_to_flac, wav_files), total=len(wav_files)
        ) as pbar:
            for success in pbar:
                if success:
                    success_counter += 1
                else:
                    fail_counter += 1

                pbar.set_description(f"Success: {success_counter}, Fail: {fail_counter}")

    print(f"Successfully converted: {success_counter}")
    print(f"Failed conversions: {fail_counter}")
import json
import os
import subprocess
import tempfile
import time
from pathlib import Path

import librosa
import soundfile as sf
import torch
import torchaudio
from fish_audio_preprocess.utils.separate_audio import (
    separate_audio,
    merge_tracks,
    init_model,
)

rank = int(os.environ.get("SLURM_PROCID", 0))
world_size = int(os.environ.get("SLURM_NTASKS", 1))
device = torch.device("cuda:0")

print(f"Rank {rank}/{world_size} on {device}")


def main():
    meta_path = Path("dataset/tts/WenetSpeech/WenetSpeech.json")
    dataset_path = Path("dataset/tts/WenetSpeech")
    cleaned_path = Path("dataset/tts/WenetSpeech/cleaned")

    if not cleaned_path.exists():
        cleaned_path.mkdir(parents=True)

    demucs = init_model("htdemucs", device)
    print("Model loaded")

    with open(meta_path) as f:
        dataset = json.load(f)["audios"]

    print(f"Dataset loaded, {len(dataset)} samples")

    # Round-robin shard of the metadata across ranks
    dataset = dataset[rank::world_size]
    print(f"Dataset split, {len(dataset)} samples")

    for data_idx, data in enumerate(dataset):
        done_path = cleaned_path / data["aid"] / "done"
        done_path.parent.mkdir(parents=True, exist_ok=True)

        if done_path.exists():
            continue

        print(f"Processing {data_idx}/{len(dataset)} at rank {rank}")

        try:
            with tempfile.NamedTemporaryFile(suffix=".wav") as f:
                # Decode the source file to 24 kHz 16-bit mono PCM
                subprocess.check_call(
                    [
                        "ffmpeg",
                        "-y",
                        "-i",
                        str(dataset_path / data["path"]),
                        "-c:a",
                        "pcm_s16le",
                        "-threads",
                        "0",
                        "-ar",
                        "24000",
                        str(f.name),
                    ],
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                )

                raw_audio, sr = librosa.load(f.name, sr=None, mono=True)
                raw_audio = torch.from_numpy(raw_audio[None]).to(device)
                audio = torchaudio.functional.resample(
                    raw_audio, orig_freq=sr, new_freq=demucs.samplerate
                )

                # Make it 2 channels, since demucs expects stereo input
                audio = torch.cat([audio, audio], dim=0)

                # Separate the mix and keep only the vocal track
                tracks = separate_audio(
                    demucs, audio, shifts=1, num_workers=0, progress=False
                )
                audio = merge_tracks(tracks, filter=["vocals"])[0]

                vocals = torchaudio.functional.resample(
                    audio, orig_freq=demucs.samplerate, new_freq=24000
                )
                sr = 24000
                vocals = vocals.cpu().numpy()

                for idx, segment in enumerate(data["segments"]):
                    # Skip low-confidence transcriptions
                    if segment["confidence"] <= 0.95:
                        continue

                    # Cut the segment out of the separated vocals
                    begin = int(segment["begin_time"] * sr)
                    end = int(segment["end_time"] * sr)
                    segment_audio = vocals[begin:end]

                    # Write audio
                    temp_path = cleaned_path / data["aid"] / f"S{idx:05d}.wav"
                    temp_path.parent.mkdir(parents=True, exist_ok=True)
                    sf.write(temp_path, segment_audio, samplerate=sr)

                    # Write text
                    temp_path = temp_path.with_suffix(".txt")
                    temp_path.write_text(segment["text"])

                # Write done marker so reruns skip this aid
                done_path.write_text("")
        except Exception as e:
            print(f"Error {e} on {data_idx}/{len(dataset)} at rank {rank}")
            time.sleep(10)
            continue

    print("Done")


if __name__ == "__main__":
    main()
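The cleaner is resumable (a "done" marker is written per aid) and shards work via the SLURM_PROCID/SLURM_NTASKS environment variables. To debug a single shard without the launcher that follows, the same environment contract can be set by hand; a minimal sketch:

import os
import subprocess

env = os.environ.copy()
env["SLURM_PROCID"] = "0"  # index of this rank
env["SLURM_NTASKS"] = "1"  # total number of ranks
env["CUDA_VISIBLE_DEVICES"] = "0"
subprocess.run(
    ["python", "preparing_data/wenet_clean/clean_wenet_speech.py"],
    env=env,
    check=True,
)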
import os
import subprocess as sp
import sys

SLURM_NTASKS = 6

processes = []

for i in range(SLURM_NTASKS):
    env = os.environ.copy()
    env["SLURM_PROCID"] = str(i)
    env["SLURM_NTASKS"] = str(SLURM_NTASKS)
    env["CUDA_VISIBLE_DEVICES"] = str(i % 8)

    processes.append(
        sp.Popen(
            "python preparing_data/wenet_clean/clean_wenet_speech.py",
            shell=True,
            env=env,
            stdout=sys.stdout,
            stderr=sys.stderr,
        )
    )

for p in processes:
    # Output already streams to this process's stdout/stderr, so just wait;
    # communicate() would return (None, None) because nothing is piped.
    p.wait()