finish edacc and push

733ea787 · sanchit-gandhi · 00f621dd · 733ea787 · 733ea787
Commit 733ea787 authored Feb 14, 2024 by sanchit-gandhi
Hide whitespace changes
Inline Side-by-side

Showing with 93 additions and 7 deletions

prepare_edacc.py prepare_edacc.py +90 -6

run_edacc.sh run_edacc.sh +3 -1

No files found.
--- a/prepare_edacc.py
+++ b/prepare_edacc.py
 import csv
 import os
+import re
+import shutil
 import sys
 from dataclasses import dataclass, field
+
+from datasets import DatasetDict, Dataset, Audio
+from tqdm import tqdm
 from transformers import HfArgumentParser
+import soundfile as sf


 @dataclass
@@ -24,14 +30,26 @@ class DataTrainingArguments:
            "original dataset name. E.g. 'facebook/voxpopuli' will be saved under 'voxpopuli'."
        },
    )
+    overwrite_output_dir: bool = field(
+        default=True,
+        metadata={"help": "Overwrite the content of the output directory."},
+    )
    push_to_hub: bool = field(
        default=False,
        metadata={"help": "Whether or not to push the processed dataset to the Hub."},
    )
+    hub_dataset_id: str = field(
+        default=False,
+        metadata={"help": "Repository namespace if pushing to the Hugging Face Hub."},
+    )
    private_repo: bool = field(
        default=True,
        metadata={"help": "Whether or not to push the processed dataset to a private repository on the Hub"},
    )
+    max_samples: int = field(
+        default=None,
+        metadata={"help": "Maximum number of samples per split. Useful for debugging purposes."},
+    )

 ACCENT_MAPPING = {
    'Italian': 'Italian',
@@ -136,24 +154,90 @@ def main():
    # 2. Clean accents for each speaker
    linguistic_background_clean = {participant: ACCENT_MAPPING[accent.strip()] for participant, accent in linguistic_background.items()}

-    # 3. Iterate over dev/test files
-    for split in ["dev", "test"]:
+    # 3. Initialize dataset dict
+    raw_datasets = DatasetDict()
+
+    if data_args.overwrite_output_dir and os.path.exists(data_args.output_dir) and os.path.isdir(data_args.output_dir):
+        shutil.rmtree(data_args.output_dir)
+    output_dir_processed = os.path.join(data_args.output_dir, "processed")
+
+    # 4. Iterate over dev/test files
+    for split, split_formatted in zip(["dev", "test"], ["validation", "test"]):
        data_dir = os.path.join(data_args.dataset_dir, split)
        metadata = os.path.join(data_dir, "stm")
+        output_dir_split = os.path.join(output_dir_processed, split)
+        os.makedirs(output_dir_split, exist_ok=True)
+
+        all_speakers = []
+        all_genders = []
+        all_l1s = []
+        all_texts = []
+        all_audio_paths = []
+        all_normalized_accents = []
+        all_raw_accents = []
+        
+        current_audio = None
+        current_audio_array = None
+        current_sampling_rate = None
+        current_counter = 1
+
+        gender_pat = r'.*?\<(.*),.*'
+        l1_pat = r'.*?\,(.*)>.*'

        with open(metadata, "r") as file:
-            for line in file:
+            for idx, line in tqdm(enumerate(file), desc=split):
                # example line is: 'EDACC-C06 1 EDACC-C06-A 0.00 5.27 <male,l1> C ELEVEN DASH P ONE\n
                 # the transcription always comes to the right of the closing angle bracket ('>') of the <gender,l1> tag
-                text_idx = line.rfind(">") + 1
-                text = line[text_idx:-1]
+                text_idx = line.find(">") + 1
+                all_texts.append(line[text_idx + 1:-1])
                 # the metadata immediately precedes this
                line = line[:text_idx]
-                file, channel, speaker, start, end, gender = line.split(" ")
+                file, channel, speaker, start, end, gender_l1 = line.split(" ")
+
+                # add speaker information to cumulative lists
+                all_raw_accents.append(linguistic_background[speaker])
+                all_normalized_accents.append(linguistic_background_clean[speaker])
+                all_speakers.append(speaker)
+
+                # add gender/l1 information
+                all_genders.append(re.search(gender_pat, gender_l1).group(1))
+                all_l1s.append(re.search(l1_pat, gender_l1).group(1))
+                
+                # read audio file if different from previous
+                if file != current_audio:
+                    current_audio_array, current_sampling_rate = sf.read(os.path.join(data_args.dataset_dir, "data", file + ".wav"))
+                    current_audio = file
+                    current_counter = 1
+                else:
+                    current_counter += 1
+
+                # chunk audio file according to start/end times
+                start = int(float(start) * current_sampling_rate)
+                end = int(float(end) * current_sampling_rate)
+                end = min(end, len(current_audio_array))
+                chunked_audio = current_audio_array[start: end]
+                save_path = os.path.join(output_dir_split, f"{file}-{current_counter}.wav")
+                sf.write(save_path, chunked_audio, current_sampling_rate)
+                all_audio_paths.append(save_path)

+                if data_args.max_samples is not None and (data_args.max_samples - 1) == idx:
+                    break

+        raw_datasets[split_formatted] = Dataset.from_dict(
+            {"speaker": all_speakers,
+             "text": all_texts,
+             "accent": all_normalized_accents,
+             "raw_accent": all_raw_accents,
+             "gender": all_genders,
+             "language": all_l1s,
+             "audio": all_audio_paths,
+             }
+        ).cast_column("audio", Audio())

+    if data_args.push_to_hub:
+        raw_datasets.push_to_hub(data_args.hub_dataset_id, token=True)

+    raw_datasets.save_to_disk(data_args.output_dir)

 if __name__ == "__main__":
    main()
--- a/run_edacc.sh
+++ b/run_edacc.sh
@@ -2,4 +2,6 @@

 python prepare_edacc.py \
    --dataset_dir "/fsx/sanchit/edacc/edacc_v1.0" \
-    --output_dir "/fsx/sanchit/edacc_processed"
\ No newline at end of file
+    --output_dir "/fsx/sanchit/edacc_processed" \
+    --hub_dataset_id "sanchit-gandhi/edacc" \
+    --push_to_hub True