Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
"""
# Fields of the ModelArguments dataclass (the enclosing `class` header and its
# docstring lie above this span).
model_name_or_path: str = field(
    metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
)
config_name: Optional[str] = field(
    default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
)
feature_extractor_name: Optional[str] = field(
    default=None, metadata={"help": "Pretrained feature extractor name or path if not the same as model_name"}
)
description_tokenizer_name: Optional[str] = field(
    default=None, metadata={"help": "Pretrained description tokenizer name or path if not the same as model_name"}
)
prompt_tokenizer_name: Optional[str] = field(
    default=None,
    metadata={"help": "Pretrained prompt tokenizer name or path if not the same as description_tokenizer_name"},
)
cache_dir: Optional[str] = field(
    default=None,
    metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
)
use_fast_tokenizer: bool = field(
    default=True,
    metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
)
model_revision: str = field(
    default="main",
    metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
# Was annotated `int` with default=None; Optional[int] matches the actual default.
pad_token_id: Optional[int] = field(
    default=None,
    metadata={"help": "If specified, change the model pad token id."},
)
# Was annotated `int` with default=None; Optional[int] matches the actual default.
decoder_start_token_id: Optional[int] = field(
    default=None,
    metadata={"help": "If specified, change the model decoder start token id."},
)
freeze_text_encoder: bool = field(
    default=False,
    metadata={"help": "Whether to freeze the text encoder."},
)
do_sample: bool = field(
    default=True,
    metadata={"help": "Whether to do sampling or greedy decoding."},
)
temperature: float = field(
    default=1.0,
    metadata={"help": "Temperature if sampling."},
)
max_length: int = field(
    default=2580,
    metadata={"help": "Generation max length."},
)
# Default written as 6.0 (was the int literal 6) to agree with the float annotation.
bandwidth: float = field(
    default=6.0,
    metadata={"help": "Audio encoder bandwidth."},
)
asr_model_name_or_path: str = field(
    default="distil-whisper/distil-large-v2",
    metadata={
        "help": "Used to compute WER during evaluation. Path to pretrained model or model identifier from huggingface.co/models"
    },
)
clap_model_name_or_path: str = field(
    default="laion/larger_clap_music_and_speech",
    metadata={
        "help": "Used to compute audio similarity during evaluation. Path to pretrained model or model identifier from huggingface.co/models"
    },
)
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """

    # Fields with default=None were mis-annotated `str`/`int` in the original;
    # they are Optional here so the annotation matches the default.
    train_dataset_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The name of the training dataset to use (via the datasets library). Load and combine "
            "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
            " librispeech and common voice, set `train_dataset_name='librispeech_asr+common_voice'`."
        },
    )
    train_dataset_config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The configuration name of the training dataset to use (via the datasets library). Load and combine "
            "multiple datasets by separating dataset configs by a '+' symbol."
        },
    )
    train_split_name: str = field(
        default="train",
        metadata={
            "help": ("The name of the training data set split to use (via the datasets library). Defaults to 'train'")
        },
    )
    train_dataset_samples: Optional[str] = field(
        default=None,
        metadata={
            "help": "Number of samples in the training data. Load and combine "
            "multiple datasets by separating dataset samples by a '+' symbol."
        },
    )
    train_metadata_dataset_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The name of the metadata training dataset to use (via the datasets library). Load and combine "
            "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
            " librispeech and common voice, set `train_dataset_name='librispeech_asr+common_voice'`."
        },
    )
    eval_dataset_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset name if unspecified."
        },
    )
    eval_dataset_config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The configuration name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset config name if unspecified"
        },
    )
    eval_split_name: str = field(
        default="test",
        metadata={
            "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
        },
    )
    eval_metadata_dataset_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The name of the metadata training dataset to use (via the datasets library). Load and combine "
            "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
            " librispeech and common voice, set `train_dataset_name='librispeech_asr+common_voice'`."
        },
    )
    target_audio_column_name: str = field(
        default="audio",
        metadata={"help": "The name of the dataset column containing the target audio data. Defaults to 'audio'"},
    )
    description_column_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the dataset column containing the description text data. Defaults to 'None'."},
    )
    prompt_column_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the dataset column containing the prompt text data. Defaults to 'None'."},
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of validation examples to this "
                "value if set."
            )
        },
    )
    max_duration_in_seconds: float = field(
        default=35.0,
        metadata={
            "help": (
                "Filter audio files that are longer than `max_duration_in_seconds` seconds to `max_duration_in_seconds`."
                "Also, used to set maximum audio length if `pad_to_max_length=True`."
            )
        },
    )
    min_duration_in_seconds: float = field(
        default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
    )
    max_text_length: int = field(
        default=500, metadata={"help": "If set, max description lengths in number of characters."}
    )
    max_prompt_token_length: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "If set, filter samples with prompts that are longer than `max_prompt_token_length` tokens."
                "Also, used to set maximum prompt token length if `pad_to_max_length=True`."
            )
        },
    )
    max_description_token_length: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "If set, filter samples with descriptions that are longer than `max_description_token_length` tokens."
                "Also, used to set maximum description token length if `pad_to_max_length=True`."
            )
        },
    )
    # NOTE(review): the original source was truncated in the middle of this
    # field's help string; the tail below is reconstructed to name the three
    # length caps defined above — confirm against the upstream file.
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": (
                "If `True`, pad audio, prompt and description to a maximum length set with respectively "
                "`max_duration_in_seconds`, `max_prompt_token_length` and `max_description_token_length`."
            )
        },
    )
# NOTE(review): removed an accidental verbatim duplicate paste of the
# ModelArguments fields and the DataTrainingArguments class defined above
# (it redefined every name, repeated the fused `classDataTrainingArguments:`
# syntax error, was itself truncated mid-string, and ended with two stray
# f-string fragments referencing `model_args` at module level, which would
# raise NameError on import).