Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
chenpangpang
parler-tts
Commits
7df8eb56
Unverified
Commit
7df8eb56
authored
Apr 10, 2024
by
Yoach Lacombe
Committed by
GitHub
Apr 10, 2024
Browse files
Merge pull request #3 from ylacombe/main
Remove artifacts
parents
a53577f2
260807c4
Changes
10
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
0 additions
and
590 deletions
+0
-590
audio_classification_scripts/run_dropout_sweep.yaml
audio_classification_scripts/run_dropout_sweep.yaml
+0
-128
audio_classification_scripts/run_mms_lid.sh
audio_classification_scripts/run_mms_lid.sh
+0
-38
audio_classification_scripts/run_mms_lid_with_cv.sh
audio_classification_scripts/run_mms_lid_with_cv.sh
+0
-41
audio_classification_scripts/run_sweep.yaml
audio_classification_scripts/run_sweep.yaml
+0
-86
dataset_concatenation_scripts/run_dataset_concatenation.sh
dataset_concatenation_scripts/run_dataset_concatenation.sh
+0
-21
dataset_concatenation_scripts/run_dataset_concatenation_cv.sh
...set_concatenation_scripts/run_dataset_concatenation_cv.sh
+0
-23
edacc/prepare_edacc.py
edacc/prepare_edacc.py
+0
-185
edacc/run_edacc.sh
edacc/run_edacc.sh
+0
-7
prompt_creation_scripts/run_prompt_creation_10k.sh
prompt_creation_scripts/run_prompt_creation_10k.sh
+0
-36
prompt_creation_scripts/run_prompt_creation_1k.sh
prompt_creation_scripts/run_prompt_creation_1k.sh
+0
-25
No files found.
audio_classification_scripts/run_dropout_sweep.yaml
deleted
100644 → 0
View file @
a53577f2
# Weights & Biases sweep config: random search over dropout / masking
# hyper-parameters for MMS-LID accent classification.
# Launch with: wandb sweep run_dropout_sweep.yaml
command:
  - python3
  - ${program}
  - --fp16
  - --fp16_full_eval
  - --do_train
  - --do_eval
  - --trust_remote_code
  - --overwrite_output_dir
  - --ignore_mismatched_sizes
  - --gradient_checkpointing
  - ${args}
method: random
metric:
  goal: maximize
  name: eval/accuracy
parameters:
  model_name_or_path:
    value: facebook/mms-lid-126
  train_dataset_name:
    value: parler-tts/concatenated-normalized-accent-dataset
  train_dataset_config_name:
    value: default
  train_split_name:
    value: train
  train_label_column_name:
    value: labels
  eval_dataset_name:
    value: parler-tts/concatenated-normalized-accent-dataset
  eval_dataset_config_name:
    value: default
  eval_split_name:
    value: test
  eval_label_column_name:
    value: labels
  output_dir:
    value: ./
  remove_unused_columns:
    value: false
  learning_rate:
    value: 1e-4
  lr_scheduler_type:
    value: constant_with_warmup
  max_length_seconds:
    value: 20
  min_length_seconds:
    value: 5
  attention_mask:
    value: true
  warmup_steps:
    value: 50
  max_steps:
    value: 1000
  per_device_train_batch_size:
    value: 32
  per_device_eval_batch_size:
    value: 32
  preprocessing_num_workers:
    value: 4
  dataloader_num_workers:
    value: 4
  logging_strategy:
    value: steps
  logging_steps:
    value: 10
  evaluation_strategy:
    value: steps
  eval_steps:
    value: 1000
  save_strategy:
    value: steps
  save_steps:
    value: 1000
  freeze_base_model:
    values:
      - false
      - true
  push_to_hub:
    value: false
  filter_threshold:
    value: 1
  feat_proj_dropout:
    values:
      - 0.0
      - 0.1
      - 0.2
  attention_dropout:
    values:
      - 0.0
      - 0.1
      - 0.2
  activation_dropout:
    values:
      - 0.0
      - 0.1
      - 0.2
  hidden_dropout:
    values:
      - 0.0
      - 0.1
      - 0.2
  final_dropout:
    values:
      - 0.0
      - 0.1
      - 0.2
  mask_time_prob:
    values:
      - 0.0
      - 0.1
      - 0.2
  mask_time_length:
    values:
      - 10
      - 15
      - 20
  mask_feature_prob:
    values:
      - 0.0
      - 0.1
      - 0.2
  mask_feature_length:
    values:
      - 10
      - 15
      - 20
program: run_audio_classification.py
project: mms-lid-accent-classification
\ No newline at end of file
audio_classification_scripts/run_mms_lid.sh
deleted
100644 → 0
View file @
a53577f2
#!/usr/bin/env bash
# Fine-tune facebook/mms-lid-126 as an accent classifier on the
# concatenated-normalized-accent-dataset (train split -> test split eval).
# Fix: fail fast on any error / unset variable instead of continuing silently.
set -euo pipefail

python run_audio_classification.py \
  --model_name_or_path "facebook/mms-lid-126" \
  --train_dataset_name "parler-tts/concatenated-normalized-accent-dataset" \
  --train_dataset_config_name "default" \
  --train_split_name "train" \
  --train_label_column_name "labels" \
  --eval_dataset_name "parler-tts/concatenated-normalized-accent-dataset" \
  --eval_dataset_config_name "default" \
  --eval_split_name "test" \
  --eval_label_column_name "labels" \
  --output_dir "./" \
  --do_train \
  --do_eval \
  --overwrite_output_dir \
  --remove_unused_columns False \
  --fp16 \
  --fp16_full_eval \
  --learning_rate 1e-4 \
  --max_length_seconds 20 \
  --min_length_seconds 5 \
  --attention_mask \
  --warmup_steps 100 \
  --max_steps 2000 \
  --per_device_train_batch_size 32 \
  --per_device_eval_batch_size 32 \
  --preprocessing_num_workers 4 \
  --dataloader_num_workers 4 \
  --logging_strategy "steps" \
  --logging_steps 10 \
  --evaluation_strategy "steps" \
  --eval_steps 500 \
  --save_strategy "no" \
  --save_steps 2000 \
  --freeze_base_model True \
  --push_to_hub False \
  --trust_remote_code
audio_classification_scripts/run_mms_lid_with_cv.sh
deleted
100644 → 0
View file @
a53577f2
#!/usr/bin/env bash
# Fine-tune facebook/mms-lid-126 on two interleaved train sets ('+'-separated
# dataset/config/split/column args), evaluating on the normalized accent test set.
# Fix: fail fast on any error / unset variable instead of continuing silently.
set -euo pipefail

python run_audio_classification.py \
  --model_name_or_path "facebook/mms-lid-126" \
  --train_dataset_name "parler-tts/concatenated-normalized-accent-dataset+parler-tts/concatenated-common-voice-15-accented" \
  --train_dataset_config_name "default+default" \
  --train_split_name "train+train" \
  --train_label_column_name "labels+labels" \
  --eval_dataset_name "parler-tts/concatenated-normalized-accent-dataset" \
  --eval_dataset_config_name "default" \
  --eval_split_name "test" \
  --eval_label_column_name "labels" \
  --output_dir "./" \
  --do_train \
  --do_eval \
  --overwrite_output_dir \
  --remove_unused_columns False \
  --fp16 \
  --fp16_full_eval \
  --learning_rate 1e-4 \
  --lr_scheduler_type "constant_with_warmup" \
  --max_length_seconds 20 \
  --min_length_seconds 5 \
  --attention_mask \
  --warmup_steps 100 \
  --max_steps 5000 \
  --per_device_train_batch_size 32 \
  --per_device_eval_batch_size 32 \
  --preprocessing_num_workers 4 \
  --dataloader_num_workers 4 \
  --logging_strategy "steps" \
  --logging_steps 10 \
  --evaluation_strategy "steps" \
  --eval_steps 1000 \
  --save_strategy "no" \
  --save_steps 5000 \
  --filter_threshold 0.01 \
  --freeze_base_model False \
  --gradient_checkpointing \
  --push_to_hub False \
  --trust_remote_code
audio_classification_scripts/run_sweep.yaml
deleted
100644 → 0
View file @
a53577f2
# Weights & Biases sweep config: grid search over freeze_base_model for
# MMS-LID accent classification on the concatenated accent dataset.
# Launch with: wandb sweep run_sweep.yaml
command:
  - python3
  - ${program}
  - --load_best_model_at_end
  - --fp16
  - --fp16_full_eval
  - --do_train
  - --do_eval
  - --trust_remote_code
  - --overwrite_output_dir
  - --ignore_mismatched_sizes
  - --gradient_checkpointing
  - ${args}
method: grid
metric:
  goal: maximize
  name: eval/accuracy
parameters:
  model_name_or_path:
    value: facebook/mms-lid-126
  train_dataset_name:
    value: parler-tts/concatenated-accent-dataset
  train_dataset_config_name:
    value: default
  train_split_name:
    value: train
  train_label_column_name:
    value: labels
  eval_dataset_name:
    value: parler-tts/concatenated-accent-dataset
  eval_dataset_config_name:
    value: default
  eval_split_name:
    value: test
  eval_label_column_name:
    value: labels
  output_dir:
    value: ./
  remove_unused_columns:
    value: false
  learning_rate:
    value: 1e-4
  lr_scheduler_type:
    value: constant_with_warmup
  max_length_seconds:
    value: 20
  # give some data diversity for longer audio samples
  min_length_seconds:
    value: 7
  attention_mask:
    value: true
  warmup_steps:
    value: 100
  max_steps:
    value: 2000
  per_device_train_batch_size:
    value: 32
  per_device_eval_batch_size:
    value: 16
  preprocessing_num_workers:
    value: 4
  dataloader_num_workers:
    value: 4
  logging_strategy:
    value: steps
  logging_steps:
    value: 10
  evaluation_strategy:
    value: steps
  eval_steps:
    value: 1000
  save_strategy:
    value: steps
  save_steps:
    value: 2000
  metric_for_best_model:
    value: accuracy
  freeze_base_model:
    values:
      - false
      - true
  group_by_length:
    value: false  # TODO(SG): batch by length
  push_to_hub:
    value: false
program: run_audio_classification.py
project: mms-lid-accent-classification
\ No newline at end of file
dataset_concatenation_scripts/run_dataset_concatenation.sh
deleted
100644 → 0
View file @
a53577f2
#!/usr/bin/env bash
# Concatenate accent datasets: first build the train set from three sources
# ('+'-separated args are parallel per-dataset values), then the EdAcc test set.
# Fix: without strict mode the second command ran even if the first failed.
set -euo pipefail

python run_dataset_concatenation.py \
  --dataset_name "sanchit-gandhi/vctk+facebook/voxpopuli+edinburghcstr/edacc-normalized" \
  --dataset_config_name "default+en_accented+default" \
  --dataset_split_name "train+test+validation" \
  --label_column_name "accent+accent+accent" \
  --text_column_name "text+normalized_text+text" \
  --speaker_column_name "speaker_id+speaker_id+speaker" \
  --batch_size 500 \
  --output_dir "./concatenated-dataset"

python run_dataset_concatenation.py \
  --dataset_name "edinburghcstr/edacc-normalized" \
  --dataset_config_name "default" \
  --dataset_split_name "test" \
  --label_column_name "accent" \
  --text_column_name "text" \
  --speaker_column_name "speaker" \
  --batch_size 500 \
  --output_dir "./concatenated-dataset-test"
dataset_concatenation_scripts/run_dataset_concatenation_cv.sh
deleted
100644 → 0
View file @
a53577f2
#!/usr/bin/env bash
# Concatenate the accented Common Voice 15 dataset: train split then test split.
# Fix: without strict mode the second command ran even if the first failed.
set -euo pipefail

python run_dataset_concatenation.py \
  --dataset_name "parler-tts/common_voice_15_0_accented" \
  --dataset_config_name "en" \
  --dataset_split_name "train" \
  --label_column_name "accent" \
  --text_column_name "sentence" \
  --speaker_column_name "client_id" \
  --batch_size 250 \
  --preprocessing_num_workers 4 \
  --output_dir "./concatenated-dataset-cv"

python run_dataset_concatenation.py \
  --dataset_name "parler-tts/common_voice_15_0_accented" \
  --dataset_config_name "en" \
  --dataset_split_name "test" \
  --label_column_name "accent" \
  --text_column_name "sentence" \
  --speaker_column_name "client_id" \
  --batch_size 250 \
  --preprocessing_num_workers 4 \
  --output_dir "./concatenated-dataset-cv-test"
edacc/prepare_edacc.py
deleted
100644 → 0
View file @
a53577f2
import
csv
import
os
import
re
import
shutil
import
sys
from
dataclasses
import
dataclass
,
field
import
soundfile
as
sf
from
datasets
import
Audio
,
Dataset
,
DatasetDict
,
load_dataset
from
tqdm
import
tqdm
from
transformers
import
HfArgumentParser
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input for preparation.
    """

    # Root of the extracted EdAcc archive (expects linguistic_background.csv,
    # dev/, test/ and data/ underneath it).
    dataset_dir: str = field(
        default=None,
        metadata={
            "help": "Path where the EdAcc tar.gz archive is extracted. Leave in it's raw format: the script will "
            "assume it's unchanged from the download and use relative paths to load the relevant audio files."
        },
    )
    output_dir: str = field(
        default=None,
        metadata={
            "help": "Where to save the processed dataset to disk. If unspecified, uses a 'pretty' version of the "
            "original dataset name. E.g. 'facebook/voxpopuli' will be saved under 'voxpopuli'."
        },
    )
    overwrite_output_dir: bool = field(
        default=True,
        metadata={"help": "Overwrite the content of the output directory."},
    )
    push_to_hub: bool = field(
        default=False,
        metadata={"help": "Whether or not to push the processed dataset to the Hub."},
    )
    # Fix: default was the bool `False` for a str-typed field; use None so an
    # unset id is distinguishable and consistent with the other optional fields.
    hub_dataset_id: str = field(
        default=None,
        metadata={"help": "Repository namespace if pushing to the Hugging Face Hub."},
    )
    private_repo: bool = field(
        default=True,
        metadata={"help": "Whether or not to push the processed dataset to a private repository on the Hub"},
    )
    max_samples: int = field(
        default=None,
        metadata={"help": "Maximum number of samples per split. Useful for debugging purposes."},
    )
def main():
    """Prepare the EdAcc corpus: segment the raw recordings using the stm
    metadata, attach per-speaker accent/gender/L1 info, and save (and
    optionally push) the result as a DatasetDict."""
    # 1. Parse input arguments
    parser = HfArgumentParser(DataTrainingArguments)

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        data_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))[0]
    else:
        data_args = parser.parse_args_into_dataclasses()[0]

    # 1. Load accents for each speaker
    # Maps PARTICIPANT_ID -> free-text self-described accent from the
    # linguistic background survey shipped with the corpus.
    linguistic_background = {}
    linguistic_background_csv = os.path.join(data_args.dataset_dir, "linguistic_background.csv")
    with open(linguistic_background_csv, encoding="utf-8") as file:
        reader = csv.DictReader(file, delimiter=",")
        for line in reader:
            linguistic_background[line["PARTICIPANT_ID"]] = line[
                "How would you describe your accent in English? (e.g. Italian, Glaswegian)"
            ]

    accent_dataset = load_dataset("edinburghcstr/edacc_accents", split="train")

    def format_dataset(batch):
        # Rewrite participant ids to the stm naming scheme:
        # EAEC-...P1 / P2 -> EDACC-...-A / -B
        batch["speaker_id"] = (
            batch["Final-Participant_ID"].replace("EAEC", "EDACC").replace("P1", "-A").replace("P2", "-B")
        )
        return batch

    accent_dataset = accent_dataset.map(format_dataset, remove_columns=["Final-Participant_ID"])

    # 2. Clean accents for each speaker
    linguistic_background_clean = {
        participant: accent.strip()
        for participant, accent in zip(accent_dataset["speaker_id"], accent_dataset["English_Variety"])
    }
    linguistic_variety = {
        participant: l1.strip()
        for participant, l1 in zip(accent_dataset["speaker_id"], accent_dataset["L1_Variety"])
    }

    # 3. Initialize dataset dict
    raw_datasets = DatasetDict()

    # Wipe a pre-existing output directory when overwriting is requested.
    if (
        data_args.overwrite_output_dir
        and os.path.exists(data_args.output_dir)
        and os.path.isdir(data_args.output_dir)
    ):
        shutil.rmtree(data_args.output_dir)

    output_dir_processed = os.path.join(data_args.output_dir, "processed")

    # 4. Iterate over dev/test files
    # The corpus "dev" split is exposed as "validation" in the final dataset.
    for split, split_formatted in zip(["dev", "test"], ["validation", "test"]):
        data_dir = os.path.join(data_args.dataset_dir, split)
        metadata = os.path.join(data_dir, "stm")
        output_dir_split = os.path.join(output_dir_processed, split)
        os.makedirs(output_dir_split, exist_ok=True)

        # Cumulative per-segment columns for this split.
        all_speakers = []
        all_genders = []
        all_l1s = []
        all_texts = []
        all_audio_paths = []
        all_normalized_accents = []
        all_raw_accents = []

        # Cache of the most recently loaded recording, so consecutive stm lines
        # for the same file don't re-read the wav from disk.
        current_audio = None
        current_audio_array = None
        current_sampling_rate = None
        current_counter = 1

        # Regexes over the '<gender,l1>' metadata token.
        gender_pat = r".*?\<(.*),.*"
        # NOTE(review): l1_pat is defined but never used below — L1 comes from
        # linguistic_variety instead.
        l1_pat = r".*?\,(.*)>.*"

        with open(metadata, "r") as file:
            for idx, line in tqdm(enumerate(file), desc=split):
                # example line is: 'EDACC-C06 1 EDACC-C06-A 0.00 5.27 <male,l1> C ELEVEN DASH P ONE\n
                # the transcription always comes to the right of the last right-angle bracket
                text_idx = line.find(">") + 1
                all_texts.append(line[text_idx + 1 : -1])
                # the metadata immediately precedes this
                line = line[:text_idx]
                # NOTE(review): this rebinds `file` (the open handle) to the
                # recording id string; iteration is unaffected because
                # enumerate(file) already holds the original file object.
                file, channel, speaker, start, end, gender_l1 = line.split(" ")
                # add speaker information to cumulative lists
                all_raw_accents.append(linguistic_background[speaker])
                all_normalized_accents.append(linguistic_background_clean[speaker])
                all_speakers.append(speaker)
                # add gender/l1 information
                all_genders.append(re.search(gender_pat, gender_l1).group(1))
                all_l1s.append(linguistic_variety[speaker])
                # read audio file if different from previous
                if file != current_audio:
                    current_audio_array, current_sampling_rate = sf.read(
                        os.path.join(data_args.dataset_dir, "data", file + ".wav")
                    )
                    current_audio = file
                    current_counter = 1
                else:
                    current_counter += 1
                # chunk audio file according to start/end times
                start = int(float(start) * current_sampling_rate)
                end = int(float(end) * current_sampling_rate)
                # clamp to the recording length in case the stm end time overshoots
                end = min(end, len(current_audio_array))
                chunked_audio = current_audio_array[start:end]
                save_path = os.path.join(output_dir_split, f"{file}-{current_counter}.wav")
                sf.write(save_path, chunked_audio, current_sampling_rate)
                all_audio_paths.append(save_path)

                # Early exit for debugging runs capped via --max_samples.
                if data_args.max_samples is not None and (data_args.max_samples - 1) == idx:
                    break

        raw_datasets[split_formatted] = Dataset.from_dict(
            {
                "speaker": all_speakers,
                "text": all_texts,
                "accent": all_normalized_accents,
                "raw_accent": all_raw_accents,
                "gender": all_genders,
                "l1": all_l1s,
                "audio": all_audio_paths,
            }
        ).cast_column("audio", Audio())

    if data_args.push_to_hub:
        raw_datasets.push_to_hub(data_args.hub_dataset_id, token=True)

    raw_datasets.save_to_disk(data_args.output_dir)


if __name__ == "__main__":
    main()
edacc/run_edacc.sh
deleted
100644 → 0
View file @
a53577f2
#!/usr/bin/env bash
# Run the EdAcc preparation script and push the processed dataset to the Hub.
# Fix: fail fast on any error / unset variable instead of continuing silently.
set -euo pipefail

python prepare_edacc.py \
  --dataset_dir "/fsx/sanchit/edacc/edacc_v1.0" \
  --output_dir "/fsx/sanchit/edacc_processed" \
  --hub_dataset_id "edinburghcstr/edacc-normalized" \
  --push_to_hub
prompt_creation_scripts/run_prompt_creation_10k.sh
deleted
100644 → 0
View file @
a53577f2
#!/usr/bin/env bash
# Generate prompt annotations with Mistral-7B-Instruct (4-bit, 8 GPUs) for the
# 10k tagged LibriTTS-R configs and the 10k tagged MLS-English set.
# Fix: without strict mode, later runs proceeded even after an earlier failure.
set -euo pipefail

accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=8 run_prompt_creation.py \
  --dataset_name "ylacombe/libritts_r_tags_tagged_10k" \
  --dataset_config_name "clean" \
  --model_name_or_path "mistralai/Mistral-7B-Instruct-v0.2" \
  --per_device_eval_batch_size 64 \
  --attn_implementation "sdpa" \
  --dataloader_num_workers 4 \
  --output_dir "./libritts_r_tags_tagged_10k_generated" \
  --load_in_4bit \
  --push_to_hub \
  --hub_dataset_id "parler-tts/libritts_r_tags_tagged_10k_generated"

accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=8 run_prompt_creation.py \
  --dataset_name "ylacombe/libritts_r_tags_tagged_10k" \
  --dataset_config_name "other" \
  --model_name_or_path "mistralai/Mistral-7B-Instruct-v0.2" \
  --per_device_eval_batch_size 64 \
  --attn_implementation "sdpa" \
  --dataloader_num_workers 4 \
  --output_dir "./libritts_r_tags_tagged_10k_generated" \
  --load_in_4bit \
  --push_to_hub \
  --hub_dataset_id "parler-tts/libritts_r_tags_tagged_10k_generated"

accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=8 run_prompt_creation.py \
  --dataset_name "ylacombe/mls-eng-10k-tags_tagged_10k" \
  --model_name_or_path "mistralai/Mistral-7B-Instruct-v0.2" \
  --per_device_eval_batch_size 64 \
  --attn_implementation "sdpa" \
  --dataloader_num_workers 4 \
  --output_dir "./mls-eng-10k-tags_tagged_10k_generated" \
  --load_in_4bit \
  --push_to_hub \
  --hub_dataset_id "parler-tts/mls-eng-10k-tags_tagged_10k_generated"
prompt_creation_scripts/run_prompt_creation_1k.sh
deleted
100644 → 0
View file @
a53577f2
#!/usr/bin/env bash
# Generate prompt annotations with Mistral-7B-Instruct (4-bit, 8 GPUs) for the
# 'clean' and 'other' configs of libritts-r-tags-and-text.
# Fix: without strict mode, the second run proceeded even after a failure.
set -euo pipefail

accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=8 run_prompt_creation.py \
  --dataset_name "parler-tts/libritts-r-tags-and-text" \
  --dataset_config_name "clean" \
  --model_name_or_path "mistralai/Mistral-7B-Instruct-v0.2" \
  --per_device_eval_batch_size 64 \
  --attn_implementation "sdpa" \
  --dataloader_num_workers 4 \
  --output_dir "./" \
  --load_in_4bit \
  --push_to_hub \
  --hub_dataset_id "parler-tts/libritts-r-tags-and-text-generated"

accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=8 run_prompt_creation.py \
  --dataset_name "parler-tts/libritts-r-tags-and-text" \
  --dataset_config_name "other" \
  --model_name_or_path "mistralai/Mistral-7B-Instruct-v0.2" \
  --per_device_eval_batch_size 64 \
  --attn_implementation "sdpa" \
  --dataloader_num_workers 4 \
  --output_dir "./" \
  --load_in_4bit \
  --push_to_hub \
  --hub_dataset_id "parler-tts/libritts-r-tags-and-text-generated"
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment