up

c5de50bd · sanchit-gandhi · 1a72a0de · c5de50bd · c5de50bd
Commit c5de50bd authored Mar 12, 2024 by sanchit-gandhi
Hide whitespace changes
Inline Side-by-side

Showing with 47 additions and 41 deletions

audio_classification_scripts/run_mms_lid_with_cv.sh audio_classification_scripts/run_mms_lid_with_cv.sh +1 -0

run_audio_classification.py run_audio_classification.py +46 -41

No files found.
--- a/audio_classification_scripts/run_mms_lid_with_cv.sh
+++ b/audio_classification_scripts/run_mms_lid_with_cv.sh
@@ -36,5 +36,6 @@ python run_audio_classification.py \
    --save_steps 5000 \
    --filter_threshold 0.01 \
    --freeze_base_model False \
+    --gradient_checkpointing \
    --push_to_hub False \
    --trust_remote_code
--- a/run_audio_classification.py
+++ b/run_audio_classification.py
@@ -66,11 +66,55 @@ def deterministic_subsample(wav: np.ndarray, max_length: float, sample_rate: int
    return wav[0:sample_length]
+# This list defines the accent prefixes, which we use to strip the free-form detail from CV accent labels
+# e.g. England, southern accent, slight west-country expression -> England
+# NOTE(review): presumably each entry is matched against the start of a label - confirm against the caller.
+# Every entry needs its own trailing comma: Python silently concatenates adjacent string literals, so
+# `"Irish" "Israeli"` would become the single, never-matching prefix "IrishIsraeli".
+# TODO(YL): update this with any CV test prefixes not present in the train set
+STARTS_WITH = [
+    "Afrikaans",
+    "American",
+    "Australian",
+    "Bangladeshi",
+    "Canadian",
+    "Chinese",
+    "Dutch",
+    "Eastern European",
+    "European",
+    "England",
+    "English",
+    "German",
+    "Filipino",
+    "India",
+    "Irish",
+    "Israeli",
+    "Italian",
+    "Japanese",
+    "Kenyan",
+    "Northern Irish",
+    "New Zealand",
+    "Nigerian",
+    "Malaysian",
+    "Russian",
+    "Scottish",
+    "Singaporean",
+    "Slavic",
+    "South African",
+    "Southern African",
+    "Swedish",
+    "Swiss",
+    "United States English",
+    "West Indies",
+    "french",
+    "polish",
+    "serbian",
+]
+# This dictionary is used to map the un-normalised accent names to normalised ones
+# TODO(YL): update this with any CV test mappings not present in the train set
 ACCENT_MAPPING = {
    "British": "English",
-    # "Canadian": "American",
+    # "Canadian": "American",  TODO(SG): decide whether to normalize these to closely related accents
-    "Northern irish": "Irish",
    # "New zealand": "Australian",
+    "Northern irish": "Irish",
    "Pakistani": "Indian",
    "Mainstream u s english": "American",
    "Southern british english": "English",
@@ -191,45 +235,6 @@ ACCENT_MAPPING = {
 }
-STARTS_WITH = [
-    "Afrikaans",
-    "American",
-    "Australian",
-    "Bangladeshi",
-    "Canadian",
-    "Chinese",
-    "Dutch",
-    "Eastern European",
-    "European",
-    "England",
-    "English",
-    "German",
-    "Filipino",
-    "India",
-    "Irish" "Israeli",
-    "Italian",
-    "Japanese",
-    "Kenyan",
-    "Northern Irish",
-    "New Zealand",
-    "Nigerian",
-    "Malaysian",
-    "Russian",
-    "Scottish",
-    "Singaporean",
-    "Slavic",
-    "South African",
-    "Southern African",
-    "Swedish",
-    "Swiss",
-    "United States English",
-    "West Indies",
-    "french",
-    "polish",
-    "serbian",
-]
 def preprocess_labels(label: str) -> str:
    """Apply pre-processing formatting to the accent labels"""
    if "_" in label: