"""Apply pre-processing formatting to the accent labels"""
processed_labels=[]
forlabelinlabels:
if"_"inlabel:
# voxpopuli stylises the accent as a language code (e.g. en_pl for "polish") - convert to full accent
language_code=label.split("_")[-1]
label=LANGUAGES[language_code]
iflabel=="British":
# 1 speaker in VCTK is labelled as British instead of English - let's normalise
label="English"
processed_labels.append(label.capitalize())
returnprocessed_labels
@dataclass
@dataclass
classDataTrainingArguments:
classDataTrainingArguments:
"""
"""
...
@@ -79,6 +95,12 @@ class DataTrainingArguments:
...
@@ -79,6 +95,12 @@ class DataTrainingArguments:
"multiple datasets by separating dataset configs by a '+' symbol."
"multiple datasets by separating dataset configs by a '+' symbol."
},
},
)
)
# Name of the dataset split used for training; falls back to "train".
train_split_name: str = field(
    default="train",
    metadata={
        "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
    },
)
train_dataset_samples:str=field(
train_dataset_samples:str=field(
default=None,
default=None,
metadata={
metadata={
...
@@ -98,6 +120,15 @@ class DataTrainingArguments:
...
@@ -98,6 +120,15 @@ class DataTrainingArguments:
"help":"The configuration name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset config name if unspecified"
"help":"The configuration name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset config name if unspecified"
},
},
)
)
# Name of the dataset split used for evaluation; falls back to "validation".
eval_split_name: str = field(
    default="validation",
    metadata={
        "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'validation'"
    },
)
audio_column_name:str=field(
audio_column_name:str=field(
default="audio",
default="audio",
metadata={"help":"The name of the dataset column containing the audio data. Defaults to 'audio'"},
metadata={"help":"The name of the dataset column containing the audio data. Defaults to 'audio'"},