Commit 308fa09e authored by sanchit-gandhi's avatar sanchit-gandhi
Browse files

add cv to accent classifier

parent 6b8b5647
#!/usr/bin/env bash
# Fine-tune facebook/mms-lid-126 as an accent classifier.
#
# Trains on the union of two datasets (concatenated-normalized-accent-dataset
# and accented Common Voice 15, joined with "+" in the dataset/config/split/
# label-column arguments) and evaluates on the normalized accent dataset's
# test split.  Training runs for 5000 steps in fp16 with a constant-with-warmup
# LR schedule, subsampling each clip to 5-20 seconds and dropping labels whose
# frequency falls below the 0.01 filter threshold.
#
# NOTE(review): --save_strategy "no" disables checkpointing, which makes
# --save_steps 5000 a no-op, and with --push_to_hub False nothing is persisted
# beyond --output_dir at the end of training -- confirm this is intended.
python run_audio_classification.py \
--model_name_or_path "facebook/mms-lid-126" \
--train_dataset_name "stable-speech/concatenated-normalized-accent-dataset+stable-speech/concatenated-common-voice-15-accented" \
--train_dataset_config_name "default+default" \
--train_split_name "train+train" \
--train_label_column_name "labels+labels" \
--eval_dataset_name "stable-speech/concatenated-normalized-accent-dataset" \
--eval_dataset_config_name "default" \
--eval_split_name "test" \
--eval_label_column_name "labels" \
--output_dir "./" \
--do_train \
--do_eval \
--overwrite_output_dir \
--remove_unused_columns False \
--fp16 \
--fp16_full_eval \
--learning_rate 1e-4 \
--lr_scheduler_type "constant_with_warmup" \
--max_length_seconds 20 \
--min_length_seconds 5 \
--attention_mask \
--warmup_steps 100 \
--max_steps 5000 \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 32 \
--preprocessing_num_workers 4 \
--dataloader_num_workers 4 \
--logging_strategy "steps" \
--logging_steps 10 \
--evaluation_strategy "steps" \
--eval_steps 1000 \
--save_strategy "no" \
--save_steps 5000 \
--filter_threshold 0.01 \
--freeze_base_model False \
--push_to_hub False \
--trust_remote_code
#!/usr/bin/env bash
# Build the concatenated accented Common Voice 15 datasets used by the accent
# classifier: group utterances by speaker (client_id), concatenate them in
# batches of 250, and keep the "accent" column as the classification label.
# Train split -> ./concatenated-dataset-cv
python run_dataset_concatenation.py \
--dataset_name "stable-speech/common_voice_15_0_accented" \
--dataset_config_name "en" \
--dataset_split_name "train" \
--label_column_name "accent" \
--text_column_name "sentence" \
--speaker_column_name "client_id" \
--batch_size 250 \
--preprocessing_num_workers 4 \
--output_dir "./concatenated-dataset-cv"
# Same concatenation for the test split, written to a separate output dir.
python run_dataset_concatenation.py \
--dataset_name "stable-speech/common_voice_15_0_accented" \
--dataset_config_name "en" \
--dataset_split_name "test" \
--label_column_name "accent" \
--text_column_name "sentence" \
--speaker_column_name "client_id" \
--batch_size 250 \
--preprocessing_num_workers 4 \
--output_dir "./concatenated-dataset-cv-test"
......@@ -76,14 +76,17 @@ def main():
accent_dataset = load_dataset("sanchit-gandhi/edacc_accents", split="train")
def format_dataset(batch):
batch["speaker_id"] = batch["Final-Participant_ID"].replace("EAEC", "EDACC").replace("P1", "-A").replace("P2", "-B")
batch["speaker_id"] = (
batch["Final-Participant_ID"].replace("EAEC", "EDACC").replace("P1", "-A").replace("P2", "-B")
)
return batch
accent_dataset = accent_dataset.map(format_dataset, remove_columns=["Final-Participant_ID"])
# 2. Clean accents for each speaker
linguistic_background_clean = {
participant: accent.strip() for participant, accent in zip(accent_dataset["speaker_id"], accent_dataset["English_Variety"])
participant: accent.strip()
for participant, accent in zip(accent_dataset["speaker_id"], accent_dataset["English_Variety"])
}
linguistic_variety = {
participant: l1.strip() for participant, l1 in zip(accent_dataset["speaker_id"], accent_dataset["L1_Variety"])
......
......@@ -57,21 +57,22 @@ def random_subsample(wav: np.ndarray, max_length: float, sample_rate: int = 1600
random_offset = randint(0, len(wav) - sample_length - 1)
return wav[random_offset : random_offset + sample_length]
def deterministic_subsample(wav: np.ndarray, max_length: float, sample_rate: int = 16000) -> np.ndarray:
    """Take the first `max_length` seconds from the input audio.

    Args:
        wav: 1-d audio waveform.
        max_length: maximum clip length in seconds.
        sample_rate: sampling rate of `wav` in Hz (defaults to 16 kHz).

    Returns:
        `wav` unchanged if it is already at most `max_length` seconds long,
        otherwise its leading `round(sample_rate * max_length)` samples.
    """
    # Number of samples corresponding to `max_length` seconds of audio.
    sample_length = int(round(sample_rate * max_length))
    if len(wav) <= sample_length:
        return wav
    # Deterministic: always the leading chunk, unlike random_subsample which
    # picks a random offset.  (The diff residue left a second, unreachable
    # duplicate `return` here; removed.)
    return wav[:sample_length]
ACCENT_MAPPING = {
"British": "English",
"Canadian": "American",
# "Canadian": "American",
"Northern irish": "Irish",
"New zealand": "Australian",
# "New zealand": "Australian",
"Pakistani": "Indian",
"Mainstream us english": "American",
"Mainstream u s english": "American",
"Southern british english": "English",
"Indian english": "Indian",
"Scottish english": "Scottish",
......@@ -83,12 +84,152 @@ ACCENT_MAPPING = {
"Indonesian english": "Indonesian",
"South african english": "South african",
"Irish english": "Irish",
"Latin": "Latin American",
"Latin": "Latin american",
"European": "Unknown", # Too general
"Eastern european": "Eastern european", # TODO(SG): keep for now, but maybe remove later
"Eastern european": "Eastern european", # TODO(SG): keep for now, but maybe remove later as too general
"Bangladeshi": "Indian",
"England": "English",
"India": "Indian",
"Afrikaans": "South african",
"California": "American",
"Nepali": "Indian",
"New york city": "American",
"New jerseyan": "American",
"Northumbrian british english": "English",
"Nottinghamshire,east midlands": "English",
"Southern african": "South african",
"United states english": "American",
"West indies": "Jamaican",
"2nd language": "Unknown", # Too vague
"A savage texas gentleman": "American",
"A variety of texan english with some german influence that has undergone the cot-caught merger": "American",
"A'lo": "Unknown", # Unclear
"Academic southern english,england english": "English",
"Argentinian english": "Latin american",
"Austrian": "German",
"Bangladesh,india and south asia (india, pakistan, sri lanka)": "Indian",
"Brazillian accent": "Brazilian",
"British accent": "English",
"Caribbean canadian": "Unknown", # Specific combination not listed
"Colombian accent": "Latin american",
"Czech accent": "Czech",
"East african khoja": "Unknown", # Specific community
"East indian": "Indian",
"East london": "English",
"England,london,academic": "English",
"Filipino": "Unknown", # Unique blend
"Fluent,e sl,european": "Unknown", # Too vague
"Generic european": "Unknown", # Too vague
"Georgian english": "Unknown", # No direct match
"Ghanaian english accent,african regular reader": "Unknown", # Specific category not listed
"Haitian creole": "Unknown", # Unique blend
"Hispanic": "Latin american",
"Hispanic/latino": "Latin american",
"Hong kong english": "Chinese",
"Hong kong english,scottish english": "Chinese",
"Hunglish": "Hungarian",
"I think mine accent is influenced by indian accent ,yes please. ,india and south asia (india, pakistan, sri lanka)": "Indian",
"I was born in england and have lived in australia, canada and france.": "English",
"International english,united states english,australian english": "American",
"Israeli": "Unknown", # No direct match
"Israeli english": "Unknown", # No direct match
"Javanese,indonesian english,malaysian english": "Indonesian",
"Kazakhstan english": "Unknown", # No direct match
"Kiwi": "New zealand", # Could be generalised to Australian
"Latin america,united states english": "Latin american",
"Latin american accent": "Latin american",
"Latin english": "Unknown", # Too vague
"Latino": "Latin american",
"Latvian": "Latvian", # Note: added new
"Little latino,united states english,second language": "Latin american",
"Liverpool english,lancashire english,england english": "English",
"Liverpudlian english": "English",
"Malaysian english": "Malaysian", # Note: added new
"Mexican accent": "Latin american",
"Mid-atlantic united states english,philadelphia, pennsylvania, united states english,united states english,philadelphia style united states english": "American",
"Mid-atlantic,england english,united states english": "American",
"Midatlantic,england english": "American",
"Midwestern states (michigan),united states english": "American",
"Mild northern england english": "English",
"Minor french accent": "French",
"Mix of american and british ,native polish": "Polish",
"Mix of american and british accent": "Unknown", # Combination not clearly mapped
"Mostly american with some british and australian inflections": "Unknown", # Combination not clearly mapped
"My accent is influenced by the phones of all letters within a sentence.,southern african (south africa, zimbabwe, namibia)": "South african",
"New zealand english": "New Zealand English",
"Nigeria english": "Nigerian", # Note: added new
"Non native speaker from france": "French",
"Non-native": "Unknown", # Too vague
"Non-native,german accent": "German",
"North european english": "Unknown", # Too broad
"Norwegian": "Norwegian", # Note: added new
"Ontario,canadian english": "Canadian", # Note: added new
"Polish english": "Polish",
"Rhode island new england accent": "American",
"Singaporean english": "Singaporean", # Note: added new
"Slavic": "Eastern european",
"Slighty southern affected by decades in the midwest, 4 years in spain and germany, speak some german, spanish, polish. have lived in nine states.": "Unknown", # Complex blend
"South african": "South african",
"South atlantic (falkland islands, saint helena)": "Unknown", # Specific regions not listed
"South australia": "Australian",
"South indian": "Indian",
"Southern drawl": "American",
"Southern texas accent,united states english": "American",
"Southern united states,united states english": "American",
"Spanish bilingual": "Spanish",
"Spanish,foreign,non-native": "Spanish",
"Strong latvian accent": "Latvian",
"Swedish accent": "Swedish", # Note: added new
"Transnational englishes blend": "Unknown", # Too vague
"U.k. english": "English",
"Very slight russian accent,standard american english,boston influence": "American",
"Welsh english": "Welsh",
"West african": "Unknown", # No specific West African category
"West indian": "Unknown", # Caribbean, but no specific match
"Western europe": "Unknown", # Too broad
"With heavy cantonese accent": "Chinese",
}
# Accent-label prefixes: preprocess_labels truncates any label that starts
# with one of these entries down to the prefix itself.
# NOTE: entries are mixed-case and str.startswith is case-sensitive, so the
# lowercase entries ("french", "polish", "serbian") match pre-capitalised
# Whisper language codes while the rest match already-capitalised labels.
STARTS_WITH = [
    "Afrikaans",
    "American",
    "Australian",
    "Bangladeshi",
    "Canadian",
    "Chinese",
    "Dutch",
    "Eastern European",
    "European",
    "England",
    "English",
    "German",
    "Filipino",
    "India",
    # Bug fix: `"Irish" "Israeli",` was a single element "IrishIsraeli" due to
    # a missing comma (implicit string-literal concatenation), so neither
    # "Irish..." nor "Israeli..." labels ever matched this list.
    "Irish",
    "Israeli",
    "Italian",
    "Japanese",
    "Kenyan",
    "Northern Irish",
    "New Zealand",
    "Nigerian",
    "Malaysian",
    "Russian",
    "Scottish",
    "Singaporean",
    "Slavic",
    "South African",
    "Southern African",
    "Swedish",
    "Swiss",
    "United States English",
    "West Indies",
    "french",
    "polish",
    "serbian",
]
def preprocess_labels(label: str) -> str:
"""Apply pre-processing formatting to the accent labels"""
if "_" in label:
......@@ -96,7 +237,10 @@ def preprocess_labels(label: str) -> str:
language_code = label.split("_")[-1]
label = LANGUAGES[language_code]
# VCTK labels for two words are concatenated into one (NewZeleand-> New Zealand)
label = re.sub(r"(\w)([A-Z])", r"\1 \2", label)
label = re.sub(r"(\w)([A-Z])", r"\1 \2", label).strip()
for prefix in STARTS_WITH:
if label.startswith(prefix):
label = prefix
# convert Whisper language code (polish) to capitalised (Polish)
label = label.capitalize()
if label in ACCENT_MAPPING:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment