chenpangpang / parler-tts / Commits / 9b7b518e

Commit 9b7b518e authored Feb 14, 2024 by sanchit-gandhi

style

parent 733ea787

Changes 1: showing 1 changed file with 108 additions and 98 deletions

prepare_edacc.py  +108  -98  (view file @ 9b7b518e)
@@ -5,10 +5,10 @@ import shutil
 import sys
 from dataclasses import dataclass, field
-from datasets import DatasetDict, Dataset, Audio
-import soundfile as sf
+from datasets import Audio, Dataset, DatasetDict
 from tqdm import tqdm
 from transformers import HfArgumentParser
+import soundfile as sf


 @dataclass
 class DataTrainingArguments:
@@ -16,12 +16,13 @@ class DataTrainingArguments:
     """
     Arguments pertaining to what data we are going to input our data for prepareation
     """
+
     dataset_dir: str = field(
         default=None,
         metadata={
             "help": "Path where the EdAcc tar.gz archive is extracted. Leave in it's raw format: the script will "
             "assume it's unchanged from the download and use relative paths to load the relevant audio files."
-        }
+        },
     )
     output_dir: str = field(
         default=None,
@@ -51,85 +52,86 @@ class DataTrainingArguments:
         metadata={"help": "Maximum number of samples per split. Useful for debugging purposes."},
     )


 ACCENT_MAPPING = {
-    'Italian': 'Italian',
+    "Italian": "Italian",
-    'International': 'Unknown',
+    "International": "Unknown",
-    'American': 'American',
+    "American": "American",
-    'English': 'English',
+    "English": "English",
-    'Latin American': 'Latin American',
+    "Latin American": "Latin American",
-    'British': 'English',
+    "British": "English",
-    'Romanian': 'Romanian',
+    "Romanian": "Romanian",
-    'Standard Indian English': 'Indian',
+    "Standard Indian English": "Indian",
-    'Trans-Atlantic': 'Unknown',
+    "Trans-Atlantic": "Unknown",
-    'Slightly American': 'American',
+    "Slightly American": "American",
-    'European': 'Unknown',
+    "European": "Unknown",
-    'Scottish (Fife)': 'Scottish',
+    "Scottish (Fife)": "Scottish",
-    'English with Scottish inflections': 'Scottish',
+    "English with Scottish inflections": "Scottish",
-    'Indian': 'Indian',
+    "Indian": "Indian",
-    'Asian': 'Asian',
+    "Asian": "Asian",
-    'NA': 'Unknown',
+    "NA": "Unknown",
-    'German': 'German',
+    "German": "German",
-    'South London': 'English',
+    "South London": "English",
-    'Dutch': 'Dutch',
+    "Dutch": "Dutch",
-    'Mostly West Coast American with some Australian Intonation': 'American',
+    "Mostly West Coast American with some Australian Intonation": "American",
-    'Japanese': 'Japanese',
+    "Japanese": "Japanese",
-    'Chinese': 'Chinese',
+    "Chinese": "Chinese",
-    'Generic middle class white person': 'English',
+    "Generic middle class white person": "English",
-    'French': 'French',
+    "French": "French",
-    'Chinese accent or mixed accent(US, UK, China..) perhaps': 'Chinese',
+    "Chinese accent or mixed accent(US, UK, China..) perhaps": "Chinese",
-    'American accent': 'American',
+    "American accent": "American",
-    'Catalan': 'Catalan',
+    "Catalan": "Catalan",
-    'American, I guess.': 'American',
+    "American, I guess.": "American",
-    'Spanish American': 'Latin American',
+    "Spanish American": "Latin American",
-    'Spanish': 'Spanish',
+    "Spanish": "Spanish",
-    'Standard American,Scottish': 'American',
+    "Standard American,Scottish": "American",
-    'Bulgarian': 'Bulgarian',
+    "Bulgarian": "Bulgarian",
-    'Latin': 'Latin American',
+    "Latin": "Latin American",
-    'Latín American': 'Latin American',
+    "Latín American": "Latin American",
-    'Mexican': 'Latin American',  # TODO: un-generalise latin american accents?
+    "Mexican": "Latin American",  # TODO: un-generalise latin american accents?
-    'North American': 'American',
+    "North American": "American",
-    'Afrian': 'African',
+    "Afrian": "African",
-    'Nigerian': 'African',  # TODO: un-generalise african accents?
+    "Nigerian": "African",  # TODO: un-generalise african accents?
-    'East-European': 'Eastern European',
+    "East-European": "Eastern European",
-    'Eastern European': 'Eastern European',
+    "Eastern European": "Eastern European",
-    'Southern London': 'English',
+    "Southern London": "English",
-    'American with a slight accent': 'American',
+    "American with a slight accent": "American",
-    'American-ish': 'American',
+    "American-ish": "American",
-    'Indian / Pakistani accent': 'Indian',
+    "Indian / Pakistani accent": "Indian",
-    'Pakistani/American': 'Pakistani',
+    "Pakistani/American": "Pakistani",
-    'African accent': 'African',
+    "African accent": "African",
-    'Kenyan': 'African',  # TODO: un-generalise african accents?
+    "Kenyan": "African",  # TODO: un-generalise african accents?
-    'Ghanaian': 'African',  # TODO: un-generalise african accents?
+    "Ghanaian": "African",  # TODO: un-generalise african accents?
-    'Spanish accent': 'Spanish',
+    "Spanish accent": "Spanish",
-    'Lithuanian': 'Lithuanian',
+    "Lithuanian": "Lithuanian",
-    'Lithuanian (eastern European)': 'Lithuanian',
+    "Lithuanian (eastern European)": "Lithuanian",
-    'Indonesian': 'Indonesian',
+    "Indonesian": "Indonesian",
-    'Egyptian': 'Egyptian',
+    "Egyptian": "Egyptian",
-    'South African English': 'South African',
+    "South African English": "South African",
     "Neutral": "English",
-    'Neutral accent': 'English',
+    "Neutral accent": "English",
-    'Neutral English, Italian': 'English',
+    "Neutral English, Italian": "English",
-    'Fluent': 'Unknown',
+    "Fluent": "Unknown",
-    'Glaswegian': 'Scottish',
+    "Glaswegian": "Scottish",
-    'Glaswegian (not slang)': 'Scottish',
+    "Glaswegian (not slang)": "Scottish",
-    'Irish': 'Irish',
+    "Irish": "Irish",
-    'Jamaican': 'Jamaican',
+    "Jamaican": "Jamaican",
-    'Jamaican accent': 'Jamaican',
+    "Jamaican accent": "Jamaican",
-    'Irish/ Dublin': 'Irish',
+    "Irish/ Dublin": "Irish",
-    'South Dublin Irish': 'Irish',
+    "South Dublin Irish": "Irish",
-    'italian': 'Italian',
+    "italian": "Italian",
-    'italian mixed with American and British English': 'Italian',
+    "italian mixed with American and British English": "Italian",
-    'Italian mixed with American accent': 'Italian',
+    "Italian mixed with American accent": "Italian",
-    'South American': 'Latin American',
+    "South American": "Latin American",
-    'Brazilian accent': 'Latin American',  # TODO: un-generalise latin american accents?
+    "Brazilian accent": "Latin American",  # TODO: un-generalise latin american accents?
-    'Israeli': 'Israeli',
+    "Israeli": "Israeli",
-    'Vietnamese accent': 'Vietnamese',
+    "Vietnamese accent": "Vietnamese",
-    'Southern Irish': 'Irish',
+    "Southern Irish": "Irish",
-    'Slight Vietnamese accent': 'Vietnamese',
+    "Slight Vietnamese accent": "Vietnamese",
-    'Midwestern United States': 'American',
+    "Midwestern United States": "American",
-    'Vietnamese English': 'Vietnamese',
+    "Vietnamese English": "Vietnamese",
     "Vietnamese": "Vietnamese",
-    "": "Unknown"
+    "": "Unknown",
 }
@@ -144,15 +146,19 @@ def main():
     data_args = parser.parse_args_into_dataclasses()[0]

     # 1. Load accents for each speaker
-    linguistic_background = dict()
+    linguistic_background = {}
     linguistic_background_csv = os.path.join(data_args.dataset_dir, "linguistic_background.csv")
     with open(linguistic_background_csv, encoding="utf-8") as file:
         reader = csv.DictReader(file, delimiter=",")
         for line in reader:
-            linguistic_background[line["PARTICIPANT_ID"]] = line["How would you describe your accent in English? (e.g. Italian, Glaswegian)"]
+            linguistic_background[line["PARTICIPANT_ID"]] = line[
+                "How would you describe your accent in English? (e.g. Italian, Glaswegian)"
+            ]

     # 2. Clean accents for each speaker
-    linguistic_background_clean = {participant: ACCENT_MAPPING[accent.strip()] for participant, accent in linguistic_background.items()}
+    linguistic_background_clean = {
+        participant: ACCENT_MAPPING[accent.strip()] for participant, accent in linguistic_background.items()
+    }

     # 3. Initialize dataset dict
     raw_datasets = DatasetDict()
@@ -175,21 +181,21 @@ def main():
         all_audio_paths = []
         all_normalized_accents = []
         all_raw_accents = []
         current_audio = None
         current_audio_array = None
         current_sampling_rate = None
         current_counter = 1
-        gender_pat = r'.*?\<(.*),.*'
+        gender_pat = r".*?\<(.*),.*"
-        l1_pat = r'.*?\,(.*)>.*'
+        l1_pat = r".*?\,(.*)>.*"
         with open(metadata, "r") as file:
             for idx, line in tqdm(enumerate(file), desc=split):
                 # example line is: 'EDACC-C06 1 EDACC-C06-A 0.00 5.27 <male,l1> C ELEVEN DASH P ONE\n
                 # the transcription always comes to the right of the last rangle bracket
                 text_idx = line.find(">") + 1
                 all_texts.append(line[text_idx + 1 : -1])
                 # the metadata immediately proceeds this
                 line = line[:text_idx]
                 file, channel, speaker, start, end, gender_l1 = line.split(" ")
@@ -202,10 +208,12 @@ def main():
                 # add gender/l1 information
                 all_genders.append(re.search(gender_pat, gender_l1).group(1))
                 all_l1s.append(re.search(l1_pat, gender_l1).group(1))
                 # read audio file if different from previous
                 if file != current_audio:
-                    current_audio_array, current_sampling_rate = sf.read(os.path.join(data_args.dataset_dir, "data", file + ".wav"))
+                    current_audio_array, current_sampling_rate = sf.read(
+                        os.path.join(data_args.dataset_dir, "data", file + ".wav")
+                    )
                     current_audio = file
                     current_counter = 1
                 else:
@@ -215,7 +223,7 @@ def main():
                 start = int(float(start) * current_sampling_rate)
                 end = int(float(end) * current_sampling_rate)
                 end = min(end, len(current_audio_array))
-                chunked_audio = current_audio_array[start: end]
+                chunked_audio = current_audio_array[start:end]
                 save_path = os.path.join(output_dir_split, f"{file}-{current_counter}.wav")
                 sf.write(save_path, chunked_audio, current_sampling_rate)
                 all_audio_paths.append(save_path)
@@ -224,14 +232,15 @@ def main():
                 break

         raw_datasets[split_formatted] = Dataset.from_dict(
-            {"speaker": all_speakers,
-             "text": all_texts,
-             "accent": all_normalized_accents,
-             "raw_accent": all_raw_accents,
-             "gender": all_genders,
-             "language": all_l1s,
-             "audio": all_audio_paths,
-             }
+            {
+                "speaker": all_speakers,
+                "text": all_texts,
+                "accent": all_normalized_accents,
+                "raw_accent": all_raw_accents,
+                "gender": all_genders,
+                "language": all_l1s,
+                "audio": all_audio_paths,
+            }
         ).cast_column("audio", Audio())

     if data_args.push_to_hub:
@@ -239,5 +248,6 @@ def main():
         raw_datasets.save_to_disk(data_args.output_dir)


+
 if __name__ == "__main__":
     main()
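
For context (this note is not part of the commit): the step the diff reformats in hunk @@ -144,15 +146,19 @@ strips whitespace from each speaker's free-text accent description and maps it to a normalized label via ACCENT_MAPPING. A minimal runnable sketch of that cleaning step, using a toy linguistic_background dict in place of the parsed linguistic_background.csv (the participant IDs below are illustrative):

    # Toy subset of the ACCENT_MAPPING defined in the diff above.
    ACCENT_MAPPING = {
        "Italian": "Italian",
        "South London": "English",
        "": "Unknown",
    }

    # Stand-in for the dict built from linguistic_background.csv.
    linguistic_background = {
        "EDACC-C06-A": "Italian ",      # trailing whitespace is stripped before lookup
        "EDACC-C06-B": "South London",  # regional descriptions collapse to a broad label
        "EDACC-C07-A": "",              # empty descriptions normalize to "Unknown"
    }

    # The dict comprehension reformatted by the commit, unchanged in behavior.
    linguistic_background_clean = {
        participant: ACCENT_MAPPING[accent.strip()] for participant, accent in linguistic_background.items()
    }

    print(linguistic_background_clean)
    # {'EDACC-C06-A': 'Italian', 'EDACC-C06-B': 'English', 'EDACC-C07-A': 'Unknown'}

Note the lookup raises KeyError for any description missing from ACCENT_MAPPING, which is why the mapping in the diff enumerates every raw string observed in the EdAcc metadata, including the empty string.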