Commit 733ea787 authored by sanchit-gandhi's avatar sanchit-gandhi
Browse files

finish edacc and push

parent 00f621dd
import csv
import os
import re
import shutil
import sys
from dataclasses import dataclass, field
from datasets import DatasetDict, Dataset, Audio
from tqdm import tqdm
from transformers import HfArgumentParser
import soundfile as sf
@dataclass
......@@ -24,14 +30,26 @@ class DataTrainingArguments:
"original dataset name. E.g. 'facebook/voxpopuli' will be saved under 'voxpopuli'."
},
)
overwrite_output_dir: bool = field(
default=True,
metadata={"help": "Overwrite the content of the output directory."},
)
push_to_hub: bool = field(
default=False,
metadata={"help": "Whether or not to push the processed dataset to the Hub."},
)
hub_dataset_id: str = field(
default=False,
metadata={"help": "Repository namespace if pushing to the Hugging Face Hub."},
)
private_repo: bool = field(
default=True,
metadata={"help": "Whether or not to push the processed dataset to a private repository on the Hub"},
)
max_samples: int = field(
default=None,
metadata={"help": "Maximum number of samples per split. Useful for debugging purposes."},
)
ACCENT_MAPPING = {
'Italian': 'Italian',
......@@ -136,24 +154,90 @@ def main():
# 2. Clean accents for each speaker
linguistic_background_clean = {participant: ACCENT_MAPPING[accent.strip()] for participant, accent in linguistic_background.items()}
# 3. Iterate over dev/test files
for split in ["dev", "test"]:
# 3. Initialize dataset dict
raw_datasets = DatasetDict()
if data_args.overwrite_output_dir and os.path.exists(data_args.output_dir) and os.path.isdir(data_args.output_dir):
shutil.rmtree(data_args.output_dir)
output_dir_processed = os.path.join(data_args.output_dir, "processed")
# 4. Iterate over dev/test files
for split, split_formatted in zip(["dev", "test"], ["validation", "test"]):
data_dir = os.path.join(data_args.dataset_dir, split)
metadata = os.path.join(data_dir, "stm")
output_dir_split = os.path.join(output_dir_processed, split)
os.makedirs(output_dir_split, exist_ok=True)
all_speakers = []
all_genders = []
all_l1s = []
all_texts = []
all_audio_paths = []
all_normalized_accents = []
all_raw_accents = []
current_audio = None
current_audio_array = None
current_sampling_rate = None
current_counter = 1
gender_pat = r'.*?\<(.*),.*'
l1_pat = r'.*?\,(.*)>.*'
with open(metadata, "r") as file:
for line in file:
for idx, line in tqdm(enumerate(file), desc=split):
# example line is: 'EDACC-C06 1 EDACC-C06-A 0.00 5.27 <male,l1> C ELEVEN DASH P ONE\n'
# the transcription always comes to the right of the last right angle bracket ('>')
text_idx = line.rfind(">") + 1
text = line[text_idx:-1]
text_idx = line.find(">") + 1
all_texts.append(line[text_idx + 1:-1])
# the metadata immediately precedes this
line = line[:text_idx]
file, channel, speaker, start, end, gender = line.split(" ")
file, channel, speaker, start, end, gender_l1 = line.split(" ")
# add speaker information to cumulative lists
all_raw_accents.append(linguistic_background[speaker])
all_normalized_accents.append(linguistic_background_clean[speaker])
all_speakers.append(speaker)
# add gender/l1 information
all_genders.append(re.search(gender_pat, gender_l1).group(1))
all_l1s.append(re.search(l1_pat, gender_l1).group(1))
# read audio file if different from previous
if file != current_audio:
current_audio_array, current_sampling_rate = sf.read(os.path.join(data_args.dataset_dir, "data", file + ".wav"))
current_audio = file
current_counter = 1
else:
current_counter += 1
# chunk audio file according to start/end times
start = int(float(start) * current_sampling_rate)
end = int(float(end) * current_sampling_rate)
end = min(end, len(current_audio_array))
chunked_audio = current_audio_array[start: end]
save_path = os.path.join(output_dir_split, f"{file}-{current_counter}.wav")
sf.write(save_path, chunked_audio, current_sampling_rate)
all_audio_paths.append(save_path)
if data_args.max_samples is not None and (data_args.max_samples - 1) == idx:
break
raw_datasets[split_formatted] = Dataset.from_dict(
{"speaker": all_speakers,
"text": all_texts,
"accent": all_normalized_accents,
"raw_accent": all_raw_accents,
"gender": all_genders,
"language": all_l1s,
"audio": all_audio_paths,
}
).cast_column("audio", Audio())
if data_args.push_to_hub:
raw_datasets.push_to_hub(data_args.hub_dataset_id, token=True)
raw_datasets.save_to_disk(data_args.output_dir)
if __name__ == "__main__":
main()
......@@ -2,4 +2,6 @@
python prepare_edacc.py \
--dataset_dir "/fsx/sanchit/edacc/edacc_v1.0" \
--output_dir "/fsx/sanchit/edacc_processed"
\ No newline at end of file
--output_dir "/fsx/sanchit/edacc_processed" \
--hub_dataset_id "sanchit-gandhi/edacc" \
--push_to_hub True
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment