chenpangpang / parler-tts

Commit 733ea787 authored Feb 14, 2024 by sanchit-gandhi

finish edacc and push

parent 00f621dd
Showing 2 changed files with 93 additions and 7 deletions (+93, -7):

    prepare_edacc.py    +90 -6
    run_edacc.sh        +3  -1
prepare_edacc.py
import csv
import os
import re
import shutil
import sys
from dataclasses import dataclass, field

from datasets import DatasetDict, Dataset, Audio
from tqdm import tqdm
from transformers import HfArgumentParser
import soundfile as sf


@dataclass
...
@@ -24,14 +30,26 @@ class DataTrainingArguments:
"original dataset name. E.g. 'facebook/voxpopuli' will be saved under 'voxpopuli'."
},
)
overwrite_output_dir
:
bool
=
field
(
default
=
True
,
metadata
=
{
"help"
:
"Overwrite the content of the output directory."
},
)
push_to_hub
:
bool
=
field
(
default
=
False
,
metadata
=
{
"help"
:
"Whether or not to push the processed dataset to the Hub."
},
)
hub_dataset_id
:
str
=
field
(
default
=
False
,
metadata
=
{
"help"
:
"Repository namespace if pushing to the Hugging Face Hub."
},
)
private_repo
:
bool
=
field
(
default
=
True
,
metadata
=
{
"help"
:
"Whether or not to push the processed dataset to a private repository on the Hub"
},
)
max_samples
:
int
=
field
(
default
=
None
,
metadata
=
{
"help"
:
"Maximum number of samples per split. Useful for debugging purposes."
},
)
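For reference, fields like these are normally filled from the command line through the HfArgumentParser imported at the top of the script. The parsing call itself sits outside the visible hunks, so the following is only a minimal sketch of how the flags passed in run_edacc.sh below would map onto DataTrainingArguments:

# Hedged sketch, not part of the commit: standard HfArgumentParser usage.
from transformers import HfArgumentParser

parser = HfArgumentParser(DataTrainingArguments)
(data_args,) = parser.parse_args_into_dataclasses()

# e.g. python prepare_edacc.py --dataset_dir /path/to/edacc --output_dir /tmp/edacc --push_to_hub True
print(data_args.push_to_hub, data_args.hub_dataset_id)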
ACCENT_MAPPING = {
    'Italian': 'Italian',
    ...
@@ -136,24 +154,90 @@ def main():
    # 2. Clean accents for each speaker
    linguistic_background_clean = {participant: ACCENT_MAPPING[accent.strip()] for participant, accent in linguistic_background.items()}
-   # 3. Iterate over dev/test files
-   for split in ["dev", "test"]:

    # 3. Initialize dataset dict
    raw_datasets = DatasetDict()

    if data_args.overwrite_output_dir and os.path.exists(data_args.output_dir) and os.path.isdir(data_args.output_dir):
        shutil.rmtree(data_args.output_dir)

    output_dir_processed = os.path.join(data_args.output_dir, "processed")

    # 4. Iterate over dev/test files
    for split, split_formatted in zip(["dev", "test"], ["validation", "test"]):
        data_dir = os.path.join(data_args.dataset_dir, split)
        metadata = os.path.join(data_dir, "stm")
        output_dir_split = os.path.join(output_dir_processed, split)
        os.makedirs(output_dir_split, exist_ok=True)

        all_speakers = []
        all_genders = []
        all_l1s = []
        all_texts = []
        all_audio_paths = []
        all_normalized_accents = []
        all_raw_accents = []

        current_audio = None
        current_audio_array = None
        current_sampling_rate = None
        current_counter = 1

        gender_pat = r'.*?\<(.*),.*'
        l1_pat = r'.*?\,(.*)>.*'

        with open(metadata, "r") as file:
-           for line in file:
            for idx, line in tqdm(enumerate(file), desc=split):
                # example line is: 'EDACC-C06 1 EDACC-C06-A 0.00 5.27 <male,l1> C ELEVEN DASH P ONE\n'
                # the transcription always comes to the right of the last rangle bracket
-               text_idx = line.rfind(">") + 1
-               text = line[text_idx:-1]
                text_idx = line.find(">") + 1
                all_texts.append(line[text_idx + 1:-1])

                # the metadata immediately precedes this
                line = line[:text_idx]
-               file, channel, speaker, start, end, gender = line.split(" ")
                file, channel, speaker, start, end, gender_l1 = line.split(" ")
                # add speaker information to cumulative lists
                all_raw_accents.append(linguistic_background[speaker])
                all_normalized_accents.append(linguistic_background_clean[speaker])
                all_speakers.append(speaker)

                # add gender/l1 information
                all_genders.append(re.search(gender_pat, gender_l1).group(1))
                all_l1s.append(re.search(l1_pat, gender_l1).group(1))
                # read audio file if different from previous
                if file != current_audio:
                    current_audio_array, current_sampling_rate = sf.read(os.path.join(data_args.dataset_dir, "data", file + ".wav"))
                    current_audio = file
                    current_counter = 1
                else:
                    current_counter += 1

                # chunk audio file according to start/end times
                start = int(float(start) * current_sampling_rate)
                end = int(float(end) * current_sampling_rate)
                end = min(end, len(current_audio_array))
                chunked_audio = current_audio_array[start:end]
                save_path = os.path.join(output_dir_split, f"{file}-{current_counter}.wav")
                sf.write(save_path, chunked_audio, current_sampling_rate)
                all_audio_paths.append(save_path)

                if data_args.max_samples is not None and (data_args.max_samples - 1) == idx:
                    break
        raw_datasets[split_formatted] = Dataset.from_dict(
            {
                "speaker": all_speakers,
                "text": all_texts,
                "accent": all_normalized_accents,
                "raw_accent": all_raw_accents,
                "gender": all_genders,
                "language": all_l1s,
                "audio": all_audio_paths,
            }
        ).cast_column("audio", Audio())

    if data_args.push_to_hub:
        raw_datasets.push_to_hub(data_args.hub_dataset_id, token=True)

    raw_datasets.save_to_disk(data_args.output_dir)


if __name__ == "__main__":
    main()
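To make the STM parsing in main() concrete, here is a small self-contained sketch (not part of the commit) that applies the same extraction logic to the example line quoted in the code comment; names follow the script, except file_id is used in place of file to avoid shadowing:

# Hedged sketch: the STM parsing logic from main(), applied to the quoted example line.
import re

line = "EDACC-C06 1 EDACC-C06-A 0.00 5.27 <male,l1> C ELEVEN DASH P ONE\n"

gender_pat = r'.*?\<(.*),.*'
l1_pat = r'.*?\,(.*)>.*'

# the transcription sits to the right of the rangle bracket
text_idx = line.find(">") + 1
text = line[text_idx + 1:-1]        # 'C ELEVEN DASH P ONE'

# the metadata immediately precedes it
meta = line[:text_idx]              # 'EDACC-C06 1 EDACC-C06-A 0.00 5.27 <male,l1>'
file_id, channel, speaker, start, end, gender_l1 = meta.split(" ")

gender = re.search(gender_pat, gender_l1).group(1)   # 'male'
l1 = re.search(l1_pat, gender_l1).group(1)           # 'l1'

# start/end (in seconds) are what the script multiplies by the sampling rate to chunk audio
print(file_id, speaker, gender, l1, float(start), float(end), text)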
run_edacc.sh

@@ -2,4 +2,6 @@
python prepare_edacc.py \
    --dataset_dir "/fsx/sanchit/edacc/edacc_v1.0" \
-   --output_dir "/fsx/sanchit/edacc_processed"
\ No newline at end of file
    --output_dir "/fsx/sanchit/edacc_processed" \
    --hub_dataset_id "sanchit-gandhi/edacc" \
    --push_to_hub True
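Assuming the run completes, the processed splits (exposed as validation and test by prepare_edacc.py) should be loadable back with the datasets library. A minimal sketch, with the repo id and output path taken from run_edacc.sh above; note that private_repo defaults to True, so loading from the Hub may require authentication:

from datasets import load_dataset, load_from_disk

# if pushed to the Hub with --push_to_hub True
edacc = load_dataset("sanchit-gandhi/edacc")

# or from the local copy written by save_to_disk
edacc = load_from_disk("/fsx/sanchit/edacc_processed")

print(edacc["validation"][0]["accent"], edacc["validation"][0]["text"])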