Commit 9b7b518e authored by sanchit-gandhi

style

parent 733ea787
@@ -5,10 +5,10 @@ import shutil
 import sys
 from dataclasses import dataclass, field
 
-from datasets import DatasetDict, Dataset, Audio
+import soundfile as sf
+from datasets import Audio, Dataset, DatasetDict
 from tqdm import tqdm
 from transformers import HfArgumentParser
-import soundfile as sf
 
 
 @dataclass
@@ -16,12 +16,13 @@ class DataTrainingArguments:
     """
     Arguments pertaining to what data we are going to input our data for prepareation
     """
+
     dataset_dir: str = field(
         default=None,
         metadata={
             "help": "Path where the EdAcc tar.gz archive is extracted. Leave in it's raw format: the script will "
             "assume it's unchanged from the download and use relative paths to load the relevant audio files."
-        }
+        },
     )
     output_dir: str = field(
         default=None,
@@ -51,85 +52,86 @@ class DataTrainingArguments:
         metadata={"help": "Maximum number of samples per split. Useful for debugging purposes."},
     )
 
 
 ACCENT_MAPPING = {
-    'Italian': 'Italian',
-    'International': 'Unknown',
-    'American': 'American',
-    'English': 'English',
-    'Latin American': 'Latin American',
-    'British': 'English',
-    'Romanian': 'Romanian',
-    'Standard Indian English': 'Indian',
-    'Trans-Atlantic': 'Unknown',
-    'Slightly American': 'American',
-    'European': 'Unknown',
-    'Scottish (Fife)': 'Scottish',
-    'English with Scottish inflections': 'Scottish',
-    'Indian': 'Indian',
-    'Asian': 'Asian',
-    'NA': 'Unknown',
-    'German': 'German',
-    'South London': 'English',
-    'Dutch': 'Dutch',
-    'Mostly West Coast American with some Australian Intonation': 'American',
-    'Japanese': 'Japanese',
-    'Chinese': 'Chinese',
-    'Generic middle class white person': 'English',
-    'French': 'French',
-    'Chinese accent or mixed accent(US, UK, China..) perhaps': 'Chinese',
-    'American accent': 'American',
-    'Catalan': 'Catalan',
-    'American, I guess.': 'American',
-    'Spanish American': 'Latin American',
-    'Spanish': 'Spanish',
-    'Standard American,Scottish': 'American',
-    'Bulgarian': 'Bulgarian',
-    'Latin': 'Latin American',
-    'Latín American': 'Latin American',
-    'Mexican': 'Latin American',  # TODO: un-generalise latin american accents?
-    'North American': 'American',
-    'Afrian': 'African',
-    'Nigerian': 'African',  # TODO: un-generalise african accents?
-    'East-European': 'Eastern European',
-    'Eastern European': 'Eastern European',
-    'Southern London': 'English',
-    'American with a slight accent': 'American',
-    'American-ish': 'American',
-    'Indian / Pakistani accent': 'Indian',
-    'Pakistani/American': 'Pakistani',
-    'African accent': 'African',
-    'Kenyan': 'African',  # TODO: un-generalise african accents?
-    'Ghanaian': 'African',  # TODO: un-generalise african accents?
-    'Spanish accent': 'Spanish',
-    'Lithuanian': 'Lithuanian',
-    'Lithuanian (eastern European)': 'Lithuanian',
-    'Indonesian': 'Indonesian',
-    'Egyptian': 'Egyptian',
-    'South African English': 'South African',
+    "Italian": "Italian",
+    "International": "Unknown",
+    "American": "American",
+    "English": "English",
+    "Latin American": "Latin American",
+    "British": "English",
+    "Romanian": "Romanian",
+    "Standard Indian English": "Indian",
+    "Trans-Atlantic": "Unknown",
+    "Slightly American": "American",
+    "European": "Unknown",
+    "Scottish (Fife)": "Scottish",
+    "English with Scottish inflections": "Scottish",
+    "Indian": "Indian",
+    "Asian": "Asian",
+    "NA": "Unknown",
+    "German": "German",
+    "South London": "English",
+    "Dutch": "Dutch",
+    "Mostly West Coast American with some Australian Intonation": "American",
+    "Japanese": "Japanese",
+    "Chinese": "Chinese",
+    "Generic middle class white person": "English",
+    "French": "French",
+    "Chinese accent or mixed accent(US, UK, China..) perhaps": "Chinese",
+    "American accent": "American",
+    "Catalan": "Catalan",
+    "American, I guess.": "American",
+    "Spanish American": "Latin American",
+    "Spanish": "Spanish",
+    "Standard American,Scottish": "American",
+    "Bulgarian": "Bulgarian",
+    "Latin": "Latin American",
+    "Latín American": "Latin American",
+    "Mexican": "Latin American",  # TODO: un-generalise latin american accents?
+    "North American": "American",
+    "Afrian": "African",
+    "Nigerian": "African",  # TODO: un-generalise african accents?
+    "East-European": "Eastern European",
+    "Eastern European": "Eastern European",
+    "Southern London": "English",
+    "American with a slight accent": "American",
+    "American-ish": "American",
+    "Indian / Pakistani accent": "Indian",
+    "Pakistani/American": "Pakistani",
+    "African accent": "African",
+    "Kenyan": "African",  # TODO: un-generalise african accents?
+    "Ghanaian": "African",  # TODO: un-generalise african accents?
+    "Spanish accent": "Spanish",
+    "Lithuanian": "Lithuanian",
+    "Lithuanian (eastern European)": "Lithuanian",
+    "Indonesian": "Indonesian",
+    "Egyptian": "Egyptian",
+    "South African English": "South African",
     "Neutral": "English",
-    'Neutral accent': 'English',
-    'Neutral English, Italian': 'English',
-    'Fluent': 'Unknown',
-    'Glaswegian': 'Scottish',
-    'Glaswegian (not slang)': 'Scottish',
-    'Irish': 'Irish',
-    'Jamaican': 'Jamaican',
-    'Jamaican accent': 'Jamaican',
-    'Irish/ Dublin': 'Irish',
-    'South Dublin Irish': 'Irish',
-    'italian': 'Italian',
-    'italian mixed with American and British English': 'Italian',
-    'Italian mixed with American accent': 'Italian',
-    'South American': 'Latin American',
-    'Brazilian accent': 'Latin American',  # TODO: un-generalise latin american accents?
-    'Israeli': 'Israeli',
-    'Vietnamese accent': 'Vietnamese',
-    'Southern Irish': 'Irish',
-    'Slight Vietnamese accent': 'Vietnamese',
-    'Midwestern United States': 'American',
-    'Vietnamese English': 'Vietnamese',
+    "Neutral accent": "English",
+    "Neutral English, Italian": "English",
+    "Fluent": "Unknown",
+    "Glaswegian": "Scottish",
+    "Glaswegian (not slang)": "Scottish",
+    "Irish": "Irish",
+    "Jamaican": "Jamaican",
+    "Jamaican accent": "Jamaican",
+    "Irish/ Dublin": "Irish",
+    "South Dublin Irish": "Irish",
+    "italian": "Italian",
+    "italian mixed with American and British English": "Italian",
+    "Italian mixed with American accent": "Italian",
+    "South American": "Latin American",
+    "Brazilian accent": "Latin American",  # TODO: un-generalise latin american accents?
+    "Israeli": "Israeli",
+    "Vietnamese accent": "Vietnamese",
+    "Southern Irish": "Irish",
+    "Slight Vietnamese accent": "Vietnamese",
+    "Midwestern United States": "American",
+    "Vietnamese English": "Vietnamese",
     "Vietnamese": "Vietnamese",
-    "": "Unknown"
+    "": "Unknown",
 }
+
@@ -144,15 +146,19 @@ def main():
     data_args = parser.parse_args_into_dataclasses()[0]
 
     # 1. Load accents for each speaker
-    linguistic_background = dict()
+    linguistic_background = {}
     linguistic_background_csv = os.path.join(data_args.dataset_dir, "linguistic_background.csv")
     with open(linguistic_background_csv, encoding="utf-8") as file:
         reader = csv.DictReader(file, delimiter=",")
         for line in reader:
-            linguistic_background[line["PARTICIPANT_ID"]] = line["How would you describe your accent in English? (e.g. Italian, Glaswegian)"]
+            linguistic_background[line["PARTICIPANT_ID"]] = line[
+                "How would you describe your accent in English? (e.g. Italian, Glaswegian)"
+            ]
 
     # 2. Clean accents for each speaker
-    linguistic_background_clean = {participant: ACCENT_MAPPING[accent.strip()] for participant, accent in linguistic_background.items()}
+    linguistic_background_clean = {
+        participant: ACCENT_MAPPING[accent.strip()] for participant, accent in linguistic_background.items()
+    }
 
     # 3. Initialize dataset dict
     raw_datasets = DatasetDict()
@@ -175,21 +181,21 @@ def main():
         all_audio_paths = []
         all_normalized_accents = []
         all_raw_accents = []
 
         current_audio = None
         current_audio_array = None
         current_sampling_rate = None
         current_counter = 1
-        gender_pat = r'.*?\<(.*),.*'
-        l1_pat = r'.*?\,(.*)>.*'
+        gender_pat = r".*?\<(.*),.*"
+        l1_pat = r".*?\,(.*)>.*"
 
         with open(metadata, "r") as file:
             for idx, line in tqdm(enumerate(file), desc=split):
                 # example line is: 'EDACC-C06 1 EDACC-C06-A 0.00 5.27 <male,l1> C ELEVEN DASH P ONE\n
                 # the transcription always comes to the right of the last rangle bracket
                 text_idx = line.find(">") + 1
-                all_texts.append(line[text_idx + 1:-1])
+                all_texts.append(line[text_idx + 1 : -1])
 
                 # the metadata immediately proceeds this
                 line = line[:text_idx]
                 file, channel, speaker, start, end, gender_l1 = line.split(" ")
@@ -202,10 +208,12 @@ def main():
                 # add gender/l1 information
                 all_genders.append(re.search(gender_pat, gender_l1).group(1))
                 all_l1s.append(re.search(l1_pat, gender_l1).group(1))
 
                 # read audio file if different from previous
                 if file != current_audio:
-                    current_audio_array, current_sampling_rate = sf.read(os.path.join(data_args.dataset_dir, "data", file + ".wav"))
+                    current_audio_array, current_sampling_rate = sf.read(
+                        os.path.join(data_args.dataset_dir, "data", file + ".wav")
+                    )
                     current_audio = file
                     current_counter = 1
                 else:
@@ -215,7 +223,7 @@ def main():
                 start = int(float(start) * current_sampling_rate)
                 end = int(float(end) * current_sampling_rate)
                 end = min(end, len(current_audio_array))
-                chunked_audio = current_audio_array[start: end]
+                chunked_audio = current_audio_array[start:end]
                 save_path = os.path.join(output_dir_split, f"{file}-{current_counter}.wav")
                 sf.write(save_path, chunked_audio, current_sampling_rate)
                 all_audio_paths.append(save_path)
@@ -224,14 +232,15 @@ def main():
                     break
 
         raw_datasets[split_formatted] = Dataset.from_dict(
-            {"speaker": all_speakers,
-             "text": all_texts,
-             "accent": all_normalized_accents,
-             "raw_accent": all_raw_accents,
-             "gender": all_genders,
-             "language": all_l1s,
-             "audio": all_audio_paths,
-             }
+            {
+                "speaker": all_speakers,
+                "text": all_texts,
+                "accent": all_normalized_accents,
+                "raw_accent": all_raw_accents,
+                "gender": all_genders,
+                "language": all_l1s,
+                "audio": all_audio_paths,
+            }
         ).cast_column("audio", Audio())
 
     if data_args.push_to_hub:
@@ -239,5 +248,6 @@ def main():
 
     raw_datasets.save_to_disk(data_args.output_dir)
 
+
 if __name__ == "__main__":
     main()
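
For reference, a minimal standalone sketch of the metadata-line parsing this script performs, using the example line from its own comment. Variable names mirror the script; none of this is part of the commit:

```python
import re

# Example metadata line, taken from the comment in the script above.
line = "EDACC-C06 1 EDACC-C06-A 0.00 5.27 <male,l1> C ELEVEN DASH P ONE\n"

# The transcription sits to the right of the closing angle bracket.
text_idx = line.find(">") + 1
text = line[text_idx + 1 : -1]  # -> "C ELEVEN DASH P ONE"

# Everything before it is metadata: file id, channel, speaker, start/end times, <gender,l1>.
file, channel, speaker, start, end, gender_l1 = line[:text_idx].split(" ")

# The same regexes as in the script pull gender and L1 out of "<male,l1>".
gender = re.search(r".*?\<(.*),.*", gender_l1).group(1)  # -> "male"
l1 = re.search(r".*?\,(.*)>.*", gender_l1).group(1)  # -> "l1"
```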
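Likewise, a sketch of the timestamp-to-sample arithmetic used when each utterance is sliced out of the full recording and written to its own file. The path and timestamps below are illustrative placeholders, not values from the dataset:

```python
import soundfile as sf

# Hypothetical conversation file; real paths come from data_args.dataset_dir.
audio_array, sampling_rate = sf.read("EDACC-C06.wav")

start_s, end_s = 0.00, 5.27  # start/end in seconds, as given by the metadata line

# Convert seconds to sample indices, clamping the end to the file length.
start = int(start_s * sampling_rate)
end = min(int(end_s * sampling_rate), len(audio_array))

# Slice the utterance and save it as its own wav file.
sf.write("EDACC-C06-1.wav", audio_array[start:end], sampling_rate)
```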