prepare_edacc.py 5.86 KB
Newer Older
sanchit-gandhi's avatar
sanchit-gandhi committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import csv
import os
import sys
from dataclasses import dataclass, field
from transformers import HfArgumentParser


@dataclass
class DataTrainingArguments:
    """
    Options describing the EdAcc data to prepare: where the extracted archive lives, where the
    processed dataset should be saved, and whether (and how) it is pushed to the Hub.
    """

    # Root directory of the extracted EdAcc archive.
    dataset_dir: str = field(
        default=None,
        metadata=dict(
            help=(
                "Path where the EdAcc tar.gz archive is extracted. Leave in it's raw format: "
                "the script will assume it's unchanged from the download and use relative paths "
                "to load the relevant audio files."
            )
        ),
    )
    # Destination for the processed dataset on disk.
    output_dir: str = field(
        default=None,
        metadata=dict(
            help=(
                "Where to save the processed dataset to disk. If unspecified, uses a 'pretty' "
                "version of the original dataset name. E.g. 'facebook/voxpopuli' will be saved "
                "under 'voxpopuli'."
            )
        ),
    )
    # Upload switch; off by default.
    push_to_hub: bool = field(
        default=False,
        metadata=dict(help="Whether or not to push the processed dataset to the Hub."),
    )
    # Visibility of the Hub repository; private by default.
    private_repo: bool = field(
        default=True,
        metadata=dict(help="Whether or not to push the processed dataset to a private repository on the Hub"),
    )

# Lookup table from the free-text accent description a participant gave (key, matched exactly
# after surrounding whitespace is stripped) to the normalised accent label used by this script
# (value). Descriptions that cannot be attributed to one accent map to "Unknown".
ACCENT_MAPPING = {
    "Italian": "Italian",
    "International": "Unknown",
    "American": "American",
    "English": "English",
    "Latin American": "Latin American",
    "British": "English",
    "Romanian": "Romanian",
    "Standard Indian English": "Indian",
    "Trans-Atlantic": "Unknown",
    "Slightly American": "American",
    "European": "Unknown",
    "Scottish (Fife)": "Scottish",
    "English with Scottish inflections": "Scottish",
    "Indian": "Indian",
    "Asian": "Asian",
    "NA": "Unknown",
    "German": "German",
    "South London": "English",
    "Dutch": "Dutch",
    "Mostly West Coast American with some Australian Intonation": "American",
    "Japanese": "Japanese",
    "Chinese": "Chinese",
    "Generic middle class white person": "English",
    "French": "French",
    "Chinese accent or mixed accent(US, UK, China..) perhaps": "Chinese",
    "American accent": "American",
    "Catalan": "Catalan",
    "American, I guess.": "American",
    "Spanish American": "Latin American",
    "Spanish": "Spanish",
    "Standard American,Scottish": "American",
    "Bulgarian": "Bulgarian",
    "Latin": "Latin American",
    "Latín American": "Latin American",
    "Mexican": "Latin American",  # TODO: un-generalise latin american accents?
    "North American": "American",
    "Afrian": "African",
    "Nigerian": "African",  # TODO: un-generalise african accents?
    "East-European": "Eastern European",
    "Eastern European": "Eastern European",
    "Southern London": "English",
    "American with a slight accent": "American",
    "American-ish": "American",
    "Indian / Pakistani accent": "Indian",
    "Pakistani/American": "Pakistani",
    "African accent": "African",
    "Kenyan": "African",  # TODO: un-generalise african accents?
    "Ghanaian": "African",  # TODO: un-generalise african accents?
    "Spanish accent": "Spanish",
    "Lithuanian": "Lithuanian",
    "Lithuanian (eastern European)": "Lithuanian",
    "Indonesian": "Indonesian",
    "Egyptian": "Egyptian",
    "South African English": "South African",
    "Neutral": "English",
    "Neutral accent": "English",
    "Neutral English, Italian": "English",
    "Fluent": "Unknown",
    "Glaswegian": "Scottish",
    "Glaswegian (not slang)": "Scottish",
    "Irish": "Irish",
    "Jamaican": "Jamaican",
    "Jamaican accent": "Jamaican",
    "Irish/ Dublin": "Irish",
    "South Dublin Irish": "Irish",
    "italian": "Italian",
    "italian mixed with American and British English": "Italian",
    "Italian mixed with American accent": "Italian",
    "South American": "Latin American",
    "Brazilian accent": "Latin American",  # TODO: un-generalise latin american accents?
    "Israeli": "Israeli",
    "Vietnamese accent": "Vietnamese",
    "Southern Irish": "Irish",
    "Slight Vietnamese accent": "Vietnamese",
    "Midwestern United States": "American",
    "Vietnamese English": "Vietnamese",
    "Vietnamese": "Vietnamese",
    "": "Unknown",
}


def _parse_stm_line(line):
    """
    Split one line of an `stm` metadata file into its metadata fields and transcription.

    An example line is:
        'EDACC-C06 1 EDACC-C06-A 0.00 5.27 <male,l1> C ELEVEN DASH P ONE\\n'

    Args:
        line: a single raw line, with or without a trailing newline.

    Returns:
        A tuple `(audio_file, channel, speaker, start, end, gender, text)` of strings, where
        `gender` is the whole `<...>` label and `text` is the transcription with surrounding
        whitespace removed.

    Raises:
        ValueError: if the line has no `<...>` label or fewer than six metadata fields.
    """
    # only remove the line terminator: slicing off the last character unconditionally would
    # truncate the transcription of a final line that has no trailing newline
    line = line.rstrip("\r\n")
    # the transcription always comes to the right of the last right-angle bracket
    text_idx = line.rfind(">") + 1
    if text_idx == 0:
        raise ValueError(f"Malformed stm line (no '<...>' label found): {line!r}")
    text = line[text_idx:].strip()
    # the metadata immediately precedes the transcription; split on any run of whitespace and
    # keep everything after the fifth field together as the label
    parts = line[:text_idx].split(maxsplit=5)
    if len(parts) != 6:
        raise ValueError(f"Malformed stm line (expected 6 metadata fields, got {len(parts)}): {line!r}")
    audio_file, channel, speaker, start, end, gender = parts
    return audio_file, channel, speaker, start, end, gender, text


def main():
    """
    Entry point: parse the script arguments, load and normalise each participant's accent, then
    walk the `dev` and `test` `stm` files line by line.

    Arguments come either from the command line or, when the only argument is a path ending in
    `.json`, from that JSON file.

    Raises:
        ValueError: if `dataset_dir` is not provided, or an `stm` line cannot be parsed.
        KeyError: if a participant's accent description is missing from `ACCENT_MAPPING`.
    """
    # 1. Parse input arguments
    parser = HfArgumentParser(DataTrainingArguments)
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        data_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))[0]
    else:
        data_args = parser.parse_args_into_dataclasses()[0]

    if data_args.dataset_dir is None:
        # fail early with a clear message instead of a TypeError from os.path.join below
        raise ValueError("`dataset_dir` must be set to the directory of the extracted EdAcc archive.")

    # 2. Load accents for each speaker
    accent_column = "How would you describe your accent in English? (e.g. Italian, Glaswegian)"
    linguistic_background = dict()
    linguistic_background_csv = os.path.join(data_args.dataset_dir, "linguistic_background.csv")
    with open(linguistic_background_csv, encoding="utf-8") as csv_file:
        reader = csv.DictReader(csv_file, delimiter=",")
        for row in reader:
            linguistic_background[row["PARTICIPANT_ID"]] = row[accent_column]

    # 3. Clean accents for each speaker
    linguistic_background_clean = {
        participant: ACCENT_MAPPING[accent.strip()] for participant, accent in linguistic_background.items()
    }

    # 4. Iterate over dev/test files
    for split in ["dev", "test"]:
        data_dir = os.path.join(data_args.dataset_dir, split)
        metadata = os.path.join(data_dir, "stm")

        with open(metadata, "r", encoding="utf-8") as stm_file:
            for line in stm_file:
                if not line.strip():
                    # tolerate blank lines (e.g. a trailing empty line at the end of the file)
                    continue
                # TODO: the parsed fields and `linguistic_background_clean` are not consumed yet
                audio_file, channel, speaker, start, end, gender, text = _parse_stm_line(line)





# Run the preparation only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()