chenpangpang / parler-tts · Commits

Commit 9b7b518e
Authored Feb 14, 2024 by sanchit-gandhi

    style

Parent: 733ea787

Changes: 1 changed file, 108 additions and 98 deletions.

prepare_edacc.py (+108, -98), view file @ 9b7b518e
@@ -5,10 +5,10 @@ import shutil
 import sys

 from dataclasses import dataclass, field
-from datasets import DatasetDict, Dataset, Audio
+import soundfile as sf
+from datasets import Audio, Dataset, DatasetDict
 from tqdm import tqdm
 from transformers import HfArgumentParser
-import soundfile as sf


 @dataclass
@@ -16,12 +16,13 @@ class DataTrainingArguments:
     """
     Arguments pertaining to what data we are going to input our data for prepareation
     """

     dataset_dir: str = field(
         default=None,
         metadata={
             "help": "Path where the EdAcc tar.gz archive is extracted. Leave in it's raw format: the script will "
             "assume it's unchanged from the download and use relative paths to load the relevant audio files."
-        }
+        },
     )
     output_dir: str = field(
         default=None,
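For orientation, a minimal sketch (not part of the commit) of how a trimmed-down DataTrainingArguments like the one above is parsed from the command line; main() further down uses the same HfArgumentParser call. The fields and help strings here are illustrative only:

    # Illustrative only: two of the fields shown above, parsed the way main() does below.
    from dataclasses import dataclass, field

    from transformers import HfArgumentParser


    @dataclass
    class DataTrainingArguments:
        dataset_dir: str = field(default=None, metadata={"help": "Path where the EdAcc archive is extracted."})
        output_dir: str = field(default=None, metadata={"help": "Where to save the processed dataset."})


    parser = HfArgumentParser(DataTrainingArguments)
    # e.g. python prepare_edacc.py --dataset_dir /path/to/edacc --output_dir /path/to/out
    data_args = parser.parse_args_into_dataclasses()[0]
    print(data_args.dataset_dir, data_args.output_dir)

Running it with no flags simply leaves both fields at their None defaults.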
@@ -51,85 +52,86 @@ class DataTrainingArguments:
        metadata={
            "help": "Maximum number of samples per split. Useful for debugging purposes."
        },
    )


This hunk re-quotes the ACCENT_MAPPING literal from single to double quotes and adjusts trailing punctuation and line wrapping; the mapping entries themselves are unchanged. Resulting code:

ACCENT_MAPPING = {
    "Italian": "Italian",
    "International": "Unknown",
    "American": "American",
    "English": "English",
    "Latin American": "Latin American",
    "British": "English",
    "Romanian": "Romanian",
    "Standard Indian English": "Indian",
    "Trans-Atlantic": "Unknown",
    "Slightly American": "American",
    "European": "Unknown",
    "Scottish (Fife)": "Scottish",
    "English with Scottish inflections": "Scottish",
    "Indian": "Indian",
    "Asian": "Asian",
    "NA": "Unknown",
    "German": "German",
    "South London": "English",
    "Dutch": "Dutch",
    "Mostly West Coast American with some Australian Intonation": "American",
    "Japanese": "Japanese",
    "Chinese": "Chinese",
    "Generic middle class white person": "English",
    "French": "French",
    "Chinese accent or mixed accent(US, UK, China..) perhaps": "Chinese",
    "American accent": "American",
    "Catalan": "Catalan",
    "American, I guess.": "American",
    "Spanish American": "Latin American",
    "Spanish": "Spanish",
    "Standard American,Scottish": "American",
    "Bulgarian": "Bulgarian",
    "Latin": "Latin American",
    "Latín American": "Latin American",
    "Mexican": "Latin American",  # TODO: un-generalise latin american accents?
    "North American": "American",
    "Afrian": "African",
    "Nigerian": "African",  # TODO: un-generalise african accents?
    "East-European": "Eastern European",
    "Eastern European": "Eastern European",
    "Southern London": "English",
    "American with a slight accent": "American",
    "American-ish": "American",
    "Indian / Pakistani accent": "Indian",
    "Pakistani/American": "Pakistani",
    "African accent": "African",
    "Kenyan": "African",  # TODO: un-generalise african accents?
    "Ghanaian": "African",  # TODO: un-generalise african accents?
    "Spanish accent": "Spanish",
    "Lithuanian": "Lithuanian",
    "Lithuanian (eastern European)": "Lithuanian",
    "Indonesian": "Indonesian",
    "Egyptian": "Egyptian",
    "South African English": "South African",
    "Neutral": "English",
    "Neutral accent": "English",
    "Neutral English, Italian": "English",
    "Fluent": "Unknown",
    "Glaswegian": "Scottish",
    "Glaswegian (not slang)": "Scottish",
    "Irish": "Irish",
    "Jamaican": "Jamaican",
    "Jamaican accent": "Jamaican",
    "Irish/ Dublin": "Irish",
    "South Dublin Irish": "Irish",
    "italian": "Italian",
    "italian mixed with American and British English": "Italian",
    "Italian mixed with American accent": "Italian",
    "South American": "Latin American",
    "Brazilian accent": "Latin American",  # TODO: un-generalise latin american accents?
    "Israeli": "Israeli",
    "Vietnamese accent": "Vietnamese",
    "Southern Irish": "Irish",
    "Slight Vietnamese accent": "Vietnamese",
    "Midwestern United States": "American",
    "Vietnamese English": "Vietnamese",
    "Vietnamese": "Vietnamese",
    "": "Unknown",
}
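To make the normalisation concrete, a tiny illustration (the sample inputs are hypothetical, chosen from the keys above); lookups are done on the stripped raw string, exactly as in the cleaning step later in the script:

    # Assumes ACCENT_MAPPING from above is in scope.
    for raw in ["British", "  Glaswegian (not slang) ", "NA", ""]:
        print(repr(raw), "->", ACCENT_MAPPING[raw.strip()])
    # 'British' -> 'English'
    # '  Glaswegian (not slang) ' -> 'Scottish'
    # 'NA' -> 'Unknown'
    # '' -> 'Unknown'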
@@ -144,15 +146,19 @@ def main():
     data_args = parser.parse_args_into_dataclasses()[0]

     # 1. Load accents for each speaker
-    linguistic_background = dict()
+    linguistic_background = {}
     linguistic_background_csv = os.path.join(data_args.dataset_dir, "linguistic_background.csv")
     with open(linguistic_background_csv, encoding="utf-8") as file:
         reader = csv.DictReader(file, delimiter=",")
         for line in reader:
-            linguistic_background[line["PARTICIPANT_ID"]] = line["How would you describe your accent in English? (e.g. Italian, Glaswegian)"]
+            linguistic_background[line["PARTICIPANT_ID"]] = line[
+                "How would you describe your accent in English? (e.g. Italian, Glaswegian)"
+            ]

     # 2. Clean accents for each speaker
-    linguistic_background_clean = {participant: ACCENT_MAPPING[accent.strip()] for participant, accent in linguistic_background.items()}
+    linguistic_background_clean = {
+        participant: ACCENT_MAPPING[accent.strip()] for participant, accent in linguistic_background.items()
+    }

     # 3. Initialize dataset dict
     raw_datasets = DatasetDict()
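A self-contained sketch of the CSV step in this hunk; the two rows and participant IDs below are made up, but the header matches the column name the script looks up, and the long header has to be quoted because it contains a comma:

    import csv
    import io

    # Hypothetical miniature of linguistic_background.csv (the real file ships with EdAcc).
    CSV_TEXT = (
        'PARTICIPANT_ID,"How would you describe your accent in English? (e.g. Italian, Glaswegian)"\n'
        "EDACC-C06-A,British\n"
        "EDACC-C06-B,Standard Indian English\n"
    )

    linguistic_background = {}
    reader = csv.DictReader(io.StringIO(CSV_TEXT), delimiter=",")
    for line in reader:
        linguistic_background[line["PARTICIPANT_ID"]] = line[
            "How would you describe your accent in English? (e.g. Italian, Glaswegian)"
        ]
    print(linguistic_background)
    # {'EDACC-C06-A': 'British', 'EDACC-C06-B': 'Standard Indian English'}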
@@ -175,21 +181,21 @@ def main():
         all_audio_paths = []
         all_normalized_accents = []
         all_raw_accents = []

         current_audio = None
         current_audio_array = None
         current_sampling_rate = None
         current_counter = 1

-        gender_pat = r'.*?\<(.*),.*'
-        l1_pat = r'.*?\,(.*)>.*'
+        gender_pat = r".*?\<(.*),.*"
+        l1_pat = r".*?\,(.*)>.*"

         with open(metadata, "r") as file:
             for idx, line in tqdm(enumerate(file), desc=split):
                 # example line is: 'EDACC-C06 1 EDACC-C06-A 0.00 5.27 <male,l1> C ELEVEN DASH P ONE\n
                 # the transcription always comes to the right of the last rangle bracket
                 text_idx = line.find(">") + 1
-                all_texts.append(line[text_idx + 1: -1])
+                all_texts.append(line[text_idx + 1 : -1])
                 # the metadata immediately proceeds this
                 line = line[:text_idx]
                 file, channel, speaker, start, end, gender_l1 = line.split(" ")
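As a quick check on the two regular expressions above, applied to the <male,l1> token from the example line quoted in the comments (group(1) yields the gender and the L1 tag respectively):

    import re

    gender_pat = r".*?\<(.*),.*"
    l1_pat = r".*?\,(.*)>.*"

    gender_l1 = "<male,l1>"  # the sixth whitespace-separated field of the example line
    print(re.search(gender_pat, gender_l1).group(1))  # male
    print(re.search(l1_pat, gender_l1).group(1))  # l1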
@@ -202,10 +208,12 @@ def main():
                 # add gender/l1 information
                 all_genders.append(re.search(gender_pat, gender_l1).group(1))
                 all_l1s.append(re.search(l1_pat, gender_l1).group(1))

                 # read audio file if different from previous
                 if file != current_audio:
-                    current_audio_array, current_sampling_rate = sf.read(os.path.join(data_args.dataset_dir, "data", file + ".wav"))
+                    current_audio_array, current_sampling_rate = sf.read(
+                        os.path.join(data_args.dataset_dir, "data", file + ".wav")
+                    )
                     current_audio = file
                     current_counter = 1
                 else:
@@ -215,7 +223,7 @@ def main():
                 start = int(float(start) * current_sampling_rate)
                 end = int(float(end) * current_sampling_rate)
                 end = min(end, len(current_audio_array))
-                chunked_audio = current_audio_array[start: end]
+                chunked_audio = current_audio_array[start:end]

                 save_path = os.path.join(output_dir_split, f"{file}-{current_counter}.wav")
                 sf.write(save_path, chunked_audio, current_sampling_rate)
                 all_audio_paths.append(save_path)
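A self-contained sketch of the segmentation arithmetic in this hunk, using a synthetic sine wave in place of an EdAcc recording: the start/end timestamps arrive as strings in seconds, are scaled to sample indices, clipped to the array length, and the slice is written out with soundfile:

    import numpy as np
    import soundfile as sf

    # Synthetic 3-second mono clip at 16 kHz standing in for one EdAcc recording.
    sampling_rate = 16000
    audio = np.sin(2 * np.pi * 440 * np.arange(3 * sampling_rate) / sampling_rate).astype("float32")

    start_s, end_s = "0.00", "5.27"  # timestamps from the example transcript line
    start = int(float(start_s) * sampling_rate)
    end = min(int(float(end_s) * sampling_rate), len(audio))  # clip segments that overrun the file

    chunked_audio = audio[start:end]
    sf.write("example-1.wav", chunked_audio, sampling_rate)
    print(len(chunked_audio) / sampling_rate, "seconds written")  # 3.0 seconds written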
@@ -224,14 +232,15 @@ def main():
This hunk reflows the dictionary literal passed to Dataset.from_dict (whitespace only; the keys and values are unchanged). Resulting code:

                    break

        raw_datasets[split_formatted] = Dataset.from_dict(
            {
                "speaker": all_speakers,
                "text": all_texts,
                "accent": all_normalized_accents,
                "raw_accent": all_raw_accents,
                "gender": all_genders,
                "language": all_l1s,
                "audio": all_audio_paths,
            }
        ).cast_column("audio", Audio())

        if data_args.push_to_hub:
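A minimal sketch of the Dataset construction above, with a single dummy WAV written first so the audio path actually exists; cast_column("audio", Audio()) turns the path column into one that decodes to audio on access:

    import numpy as np
    import soundfile as sf
    from datasets import Audio, Dataset

    # One silent dummy clip so the path in the table below is real.
    sf.write("dummy.wav", np.zeros(16000, dtype="float32"), 16000)

    ds = Dataset.from_dict(
        {
            "speaker": ["EDACC-C06-A"],
            "text": ["C ELEVEN DASH P ONE"],
            "accent": ["English"],
            "audio": ["dummy.wav"],
        }
    ).cast_column("audio", Audio())

    print(ds)
    print(ds.features["audio"])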
@@ -239,5 +248,6 @@ def main():
        raw_datasets.save_to_disk(data_args.output_dir)


if __name__ == "__main__":
    main()
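After save_to_disk, the prepared splits can presumably be reloaded with load_from_disk, pointing at the same directory that was passed as the output directory (the path below is hypothetical):

    from datasets import load_from_disk

    raw_datasets = load_from_disk("/path/to/output_dir")  # i.e. the value of data_args.output_dir
    print(raw_datasets)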