Commit c5de50bd authored by sanchit-gandhi's avatar sanchit-gandhi
Browse files

up

parent 1a72a0de
...@@ -36,5 +36,6 @@ python run_audio_classification.py \ ...@@ -36,5 +36,6 @@ python run_audio_classification.py \
--save_steps 5000 \ --save_steps 5000 \
--filter_threshold 0.01 \ --filter_threshold 0.01 \
--freeze_base_model False \ --freeze_base_model False \
--gradient_checkpointing \
--push_to_hub False \ --push_to_hub False \
--trust_remote_code --trust_remote_code
...@@ -66,11 +66,55 @@ def deterministic_subsample(wav: np.ndarray, max_length: float, sample_rate: int ...@@ -66,11 +66,55 @@ def deterministic_subsample(wav: np.ndarray, max_length: float, sample_rate: int
return wav[0:sample_length] return wav[0:sample_length]
# Accent prefixes used to strip the extra free-form detail from CV accent labels,
# keeping only the leading accent name,
# e.g. England, southern accent, slight west-country expression -> England
# NOTE: keep exactly one string literal per line, each followed by a comma.
# Python silently concatenates adjacent string literals, so a missing comma
# fuses two prefixes into a single entry that matches neither accent.
# TODO(YL): update this with any CV test prefixes not present in the train set
STARTS_WITH = [
    "Afrikaans",
    "American",
    "Australian",
    "Bangladeshi",
    "Canadian",
    "Chinese",
    "Dutch",
    "Eastern European",
    "European",
    "England",
    "English",
    "German",
    "Filipino",
    "India",
    "Irish",
    "Israeli",
    "Italian",
    "Japanese",
    "Kenyan",
    "Northern Irish",
    "New Zealand",
    "Nigerian",
    "Malaysian",
    "Russian",
    "Scottish",
    "Singaporean",
    "Slavic",
    "South African",
    "Southern African",
    "Swedish",
    "Swiss",
    "United States English",
    "West Indies",
    "french",
    "polish",
    "serbian",
]
# This dictionary is used to map the un-normalised accent names to normalised ones
# TODO(YL): update this with any CV test mappings not present in the train set
ACCENT_MAPPING = { ACCENT_MAPPING = {
"British": "English", "British": "English",
# "Canadian": "American", # "Canadian": "American", TODO(SG): decide whether to normalize these to closely related accents
"Northern irish": "Irish",
# "New zealand": "Australian", # "New zealand": "Australian",
"Northern irish": "Irish",
"Pakistani": "Indian", "Pakistani": "Indian",
"Mainstream u s english": "American", "Mainstream u s english": "American",
"Southern british english": "English", "Southern british english": "English",
...@@ -191,45 +235,6 @@ ACCENT_MAPPING = { ...@@ -191,45 +235,6 @@ ACCENT_MAPPING = {
} }
# Accent-name prefixes, one string literal per entry.
# NOTE: every entry must be followed by a comma. Python silently concatenates
# adjacent string literals, so a missing comma fuses two prefixes into a
# single entry that matches neither accent.
STARTS_WITH = [
    "Afrikaans",
    "American",
    "Australian",
    "Bangladeshi",
    "Canadian",
    "Chinese",
    "Dutch",
    "Eastern European",
    "European",
    "England",
    "English",
    "German",
    "Filipino",
    "India",
    "Irish",
    "Israeli",
    "Italian",
    "Japanese",
    "Kenyan",
    "Northern Irish",
    "New Zealand",
    "Nigerian",
    "Malaysian",
    "Russian",
    "Scottish",
    "Singaporean",
    "Slavic",
    "South African",
    "Southern African",
    "Swedish",
    "Swiss",
    "United States English",
    "West Indies",
    "french",
    "polish",
    "serbian",
]
def preprocess_labels(label: str) -> str: def preprocess_labels(label: str) -> str:
"""Apply pre-processing formatting to the accent labels""" """Apply pre-processing formatting to the accent labels"""
if "_" in label: if "_" in label:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment