Commit 9b7b518e authored by sanchit-gandhi's avatar sanchit-gandhi
Browse files

style

parent 733ea787
......@@ -5,10 +5,10 @@ import shutil
import sys
from dataclasses import dataclass, field
from datasets import DatasetDict, Dataset, Audio
import soundfile as sf
from datasets import Audio, Dataset, DatasetDict
from tqdm import tqdm
from transformers import HfArgumentParser
import soundfile as sf
@dataclass
......@@ -16,12 +16,13 @@ class DataTrainingArguments:
"""
Arguments pertaining to what data we are going to input our data for prepareation
"""
dataset_dir: str = field(
default=None,
metadata={
"help": "Path where the EdAcc tar.gz archive is extracted. Leave in it's raw format: the script will "
"assume it's unchanged from the download and use relative paths to load the relevant audio files."
}
"assume it's unchanged from the download and use relative paths to load the relevant audio files."
},
)
output_dir: str = field(
default=None,
......@@ -51,85 +52,86 @@ class DataTrainingArguments:
metadata={"help": "Maximum number of samples per split. Useful for debugging purposes."},
)
ACCENT_MAPPING = {
'Italian': 'Italian',
'International': 'Unknown',
'American': 'American',
'English': 'English',
'Latin American': 'Latin American',
'British': 'English',
'Romanian': 'Romanian',
'Standard Indian English': 'Indian',
'Trans-Atlantic': 'Unknown',
'Slightly American': 'American',
'European': 'Unknown',
'Scottish (Fife)': 'Scottish',
'English with Scottish inflections': 'Scottish',
'Indian': 'Indian',
'Asian': 'Asian',
'NA': 'Unknown',
'German': 'German',
'South London': 'English',
'Dutch': 'Dutch',
'Mostly West Coast American with some Australian Intonation': 'American',
'Japanese': 'Japanese',
'Chinese': 'Chinese',
'Generic middle class white person': 'English',
'French': 'French',
'Chinese accent or mixed accent(US, UK, China..) perhaps': 'Chinese',
'American accent': 'American',
'Catalan': 'Catalan',
'American, I guess.': 'American',
'Spanish American': 'Latin American',
'Spanish': 'Spanish',
'Standard American,Scottish': 'American',
'Bulgarian': 'Bulgarian',
'Latin': 'Latin American',
'Latín American': 'Latin American',
'Mexican': 'Latin American', # TODO: un-generalise latin american accents?
'North American': 'American',
'Afrian': 'African',
'Nigerian': 'African', # TODO: un-generalise african accents?
'East-European': 'Eastern European',
'Eastern European': 'Eastern European',
'Southern London': 'English',
'American with a slight accent': 'American',
'American-ish': 'American',
'Indian / Pakistani accent': 'Indian',
'Pakistani/American': 'Pakistani',
'African accent': 'African',
'Kenyan': 'African', # TODO: un-generalise african accents?
'Ghanaian': 'African', # TODO: un-generalise african accents?
'Spanish accent': 'Spanish',
'Lithuanian': 'Lithuanian',
'Lithuanian (eastern European)': 'Lithuanian',
'Indonesian': 'Indonesian',
'Egyptian': 'Egyptian',
'South African English': 'South African',
"Italian": "Italian",
"International": "Unknown",
"American": "American",
"English": "English",
"Latin American": "Latin American",
"British": "English",
"Romanian": "Romanian",
"Standard Indian English": "Indian",
"Trans-Atlantic": "Unknown",
"Slightly American": "American",
"European": "Unknown",
"Scottish (Fife)": "Scottish",
"English with Scottish inflections": "Scottish",
"Indian": "Indian",
"Asian": "Asian",
"NA": "Unknown",
"German": "German",
"South London": "English",
"Dutch": "Dutch",
"Mostly West Coast American with some Australian Intonation": "American",
"Japanese": "Japanese",
"Chinese": "Chinese",
"Generic middle class white person": "English",
"French": "French",
"Chinese accent or mixed accent(US, UK, China..) perhaps": "Chinese",
"American accent": "American",
"Catalan": "Catalan",
"American, I guess.": "American",
"Spanish American": "Latin American",
"Spanish": "Spanish",
"Standard American,Scottish": "American",
"Bulgarian": "Bulgarian",
"Latin": "Latin American",
"Latín American": "Latin American",
"Mexican": "Latin American", # TODO: un-generalise latin american accents?
"North American": "American",
"Afrian": "African",
"Nigerian": "African", # TODO: un-generalise african accents?
"East-European": "Eastern European",
"Eastern European": "Eastern European",
"Southern London": "English",
"American with a slight accent": "American",
"American-ish": "American",
"Indian / Pakistani accent": "Indian",
"Pakistani/American": "Pakistani",
"African accent": "African",
"Kenyan": "African", # TODO: un-generalise african accents?
"Ghanaian": "African", # TODO: un-generalise african accents?
"Spanish accent": "Spanish",
"Lithuanian": "Lithuanian",
"Lithuanian (eastern European)": "Lithuanian",
"Indonesian": "Indonesian",
"Egyptian": "Egyptian",
"South African English": "South African",
"Neutral": "English",
'Neutral accent': 'English',
'Neutral English, Italian': 'English',
'Fluent': 'Unknown',
'Glaswegian': 'Scottish',
'Glaswegian (not slang)': 'Scottish',
'Irish': 'Irish',
'Jamaican': 'Jamaican',
'Jamaican accent': 'Jamaican',
'Irish/ Dublin': 'Irish',
'South Dublin Irish': 'Irish',
'italian': 'Italian',
'italian mixed with American and British English': 'Italian',
'Italian mixed with American accent': 'Italian',
'South American': 'Latin American',
'Brazilian accent': 'Latin American', # TODO: un-generalise latin american accents?
'Israeli': 'Israeli',
'Vietnamese accent': 'Vietnamese',
'Southern Irish': 'Irish',
'Slight Vietnamese accent': 'Vietnamese',
'Midwestern United States': 'American',
'Vietnamese English': 'Vietnamese',
"Neutral accent": "English",
"Neutral English, Italian": "English",
"Fluent": "Unknown",
"Glaswegian": "Scottish",
"Glaswegian (not slang)": "Scottish",
"Irish": "Irish",
"Jamaican": "Jamaican",
"Jamaican accent": "Jamaican",
"Irish/ Dublin": "Irish",
"South Dublin Irish": "Irish",
"italian": "Italian",
"italian mixed with American and British English": "Italian",
"Italian mixed with American accent": "Italian",
"South American": "Latin American",
"Brazilian accent": "Latin American", # TODO: un-generalise latin american accents?
"Israeli": "Israeli",
"Vietnamese accent": "Vietnamese",
"Southern Irish": "Irish",
"Slight Vietnamese accent": "Vietnamese",
"Midwestern United States": "American",
"Vietnamese English": "Vietnamese",
"Vietnamese": "Vietnamese",
"": "Unknown"
"": "Unknown",
}
......@@ -144,15 +146,19 @@ def main():
data_args = parser.parse_args_into_dataclasses()[0]
# 1. Load accents for each speaker
linguistic_background = dict()
linguistic_background = {}
linguistic_background_csv = os.path.join(data_args.dataset_dir, "linguistic_background.csv")
with open(linguistic_background_csv, encoding="utf-8") as file:
reader = csv.DictReader(file, delimiter=",")
for line in reader:
linguistic_background[line["PARTICIPANT_ID"]] = line["How would you describe your accent in English? (e.g. Italian, Glaswegian)"]
linguistic_background[line["PARTICIPANT_ID"]] = line[
"How would you describe your accent in English? (e.g. Italian, Glaswegian)"
]
# 2. Clean accents for each speaker
linguistic_background_clean = {participant: ACCENT_MAPPING[accent.strip()] for participant, accent in linguistic_background.items()}
linguistic_background_clean = {
participant: ACCENT_MAPPING[accent.strip()] for participant, accent in linguistic_background.items()
}
# 3. Initialize dataset dict
raw_datasets = DatasetDict()
......@@ -175,21 +181,21 @@ def main():
all_audio_paths = []
all_normalized_accents = []
all_raw_accents = []
current_audio = None
current_audio_array = None
current_sampling_rate = None
current_counter = 1
gender_pat = r'.*?\<(.*),.*'
l1_pat = r'.*?\,(.*)>.*'
gender_pat = r".*?\<(.*),.*"
l1_pat = r".*?\,(.*)>.*"
with open(metadata, "r") as file:
for idx, line in tqdm(enumerate(file), desc=split):
# example line is: 'EDACC-C06 1 EDACC-C06-A 0.00 5.27 <male,l1> C ELEVEN DASH P ONE\n
# the transcription always comes to the right of the last rangle bracket
text_idx = line.find(">") + 1
all_texts.append(line[text_idx + 1:-1])
all_texts.append(line[text_idx + 1 : -1])
# the metadata immediately proceeds this
line = line[:text_idx]
file, channel, speaker, start, end, gender_l1 = line.split(" ")
......@@ -202,10 +208,12 @@ def main():
# add gender/l1 information
all_genders.append(re.search(gender_pat, gender_l1).group(1))
all_l1s.append(re.search(l1_pat, gender_l1).group(1))
# read audio file if different from previous
if file != current_audio:
current_audio_array, current_sampling_rate = sf.read(os.path.join(data_args.dataset_dir, "data", file + ".wav"))
current_audio_array, current_sampling_rate = sf.read(
os.path.join(data_args.dataset_dir, "data", file + ".wav")
)
current_audio = file
current_counter = 1
else:
......@@ -215,7 +223,7 @@ def main():
start = int(float(start) * current_sampling_rate)
end = int(float(end) * current_sampling_rate)
end = min(end, len(current_audio_array))
chunked_audio = current_audio_array[start: end]
chunked_audio = current_audio_array[start:end]
save_path = os.path.join(output_dir_split, f"{file}-{current_counter}.wav")
sf.write(save_path, chunked_audio, current_sampling_rate)
all_audio_paths.append(save_path)
......@@ -224,14 +232,15 @@ def main():
break
raw_datasets[split_formatted] = Dataset.from_dict(
{"speaker": all_speakers,
"text": all_texts,
"accent": all_normalized_accents,
"raw_accent": all_raw_accents,
"gender": all_genders,
"language": all_l1s,
"audio": all_audio_paths,
}
{
"speaker": all_speakers,
"text": all_texts,
"accent": all_normalized_accents,
"raw_accent": all_raw_accents,
"gender": all_genders,
"language": all_l1s,
"audio": all_audio_paths,
}
).cast_column("audio", Audio())
if data_args.push_to_hub:
......@@ -239,5 +248,6 @@ def main():
raw_datasets.save_to_disk(data_args.output_dir)
if __name__ == "__main__":
main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment