Unverified Commit 7df8eb56 authored by Yoach Lacombe, committed by GitHub

Merge pull request #3 from ylacombe/main

Remove artifacts
parents a53577f2 260807c4
command:
  - python3
  - ${program}
  - --fp16
  - --fp16_full_eval
  - --do_train
  - --do_eval
  - --trust_remote_code
  - --overwrite_output_dir
  - --ignore_mismatched_sizes
  - --gradient_checkpointing
  - ${args}
method: random
metric:
  goal: maximize
  name: eval/accuracy
parameters:
  model_name_or_path:
    value: facebook/mms-lid-126
  train_dataset_name:
    value: parler-tts/concatenated-normalized-accent-dataset
  train_dataset_config_name:
    value: default
  train_split_name:
    value: train
  train_label_column_name:
    value: labels
  eval_dataset_name:
    value: parler-tts/concatenated-normalized-accent-dataset
  eval_dataset_config_name:
    value: default
  eval_split_name:
    value: test
  eval_label_column_name:
    value: labels
  output_dir:
    value: ./
  remove_unused_columns:
    value: false
  learning_rate:
    value: 1e-4
  lr_scheduler_type:
    value: constant_with_warmup
  max_length_seconds:
    value: 20
  min_length_seconds:
    value: 5
  attention_mask:
    value: true
  warmup_steps:
    value: 50
  max_steps:
    value: 1000
  per_device_train_batch_size:
    value: 32
  per_device_eval_batch_size:
    value: 32
  preprocessing_num_workers:
    value: 4
  dataloader_num_workers:
    value: 4
  logging_strategy:
    value: steps
  logging_steps:
    value: 10
  evaluation_strategy:
    value: steps
  eval_steps:
    value: 1000
  save_strategy:
    value: steps
  save_steps:
    value: 1000
  freeze_base_model:
    values:
      - false
      - true
  push_to_hub:
    value: false
  filter_threshold:
    value: 1
  feat_proj_dropout:
    values:
      - 0.0
      - 0.1
      - 0.2
  attention_dropout:
    values:
      - 0.0
      - 0.1
      - 0.2
  activation_dropout:
    values:
      - 0.0
      - 0.1
      - 0.2
  hidden_dropout:
    values:
      - 0.0
      - 0.1
      - 0.2
  final_dropout:
    values:
      - 0.0
      - 0.1
      - 0.2
  mask_time_prob:
    values:
      - 0.0
      - 0.1
      - 0.2
  mask_time_length:
    values:
      - 10
      - 15
      - 20
  mask_feature_prob:
    values:
      - 0.0
      - 0.1
      - 0.2
  mask_feature_length:
    values:
      - 10
      - 15
      - 20
program: run_audio_classification.py
project: mms-lid-accent-classification
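A sweep defined by a config like the one above is typically launched with the wandb CLI; a minimal sketch, assuming the YAML is saved as sweep.yaml (the filename, <entity> and <sweep_id> are placeholders, not values from this repository):

# Register the sweep; wandb prints the sweep ID on success.
wandb sweep sweep.yaml
# Start an agent that samples hyperparameters (method: random) and runs the command block above.
wandb agent <entity>/mms-lid-accent-classification/<sweep_id>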
#!/usr/bin/env bash
python run_audio_classification.py \
  --model_name_or_path "facebook/mms-lid-126" \
  --train_dataset_name "parler-tts/concatenated-normalized-accent-dataset" \
  --train_dataset_config_name "default" \
  --train_split_name "train" \
  --train_label_column_name "labels" \
  --eval_dataset_name "parler-tts/concatenated-normalized-accent-dataset" \
  --eval_dataset_config_name "default" \
  --eval_split_name "test" \
  --eval_label_column_name "labels" \
  --output_dir "./" \
  --do_train \
  --do_eval \
  --overwrite_output_dir \
  --remove_unused_columns False \
  --fp16 \
  --fp16_full_eval \
  --learning_rate 1e-4 \
  --max_length_seconds 20 \
  --min_length_seconds 5 \
  --attention_mask \
  --warmup_steps 100 \
  --max_steps 2000 \
  --per_device_train_batch_size 32 \
  --per_device_eval_batch_size 32 \
  --preprocessing_num_workers 4 \
  --dataloader_num_workers 4 \
  --logging_strategy "steps" \
  --logging_steps 10 \
  --evaluation_strategy "steps" \
  --eval_steps 500 \
  --save_strategy "no" \
  --save_steps 2000 \
  --freeze_base_model True \
  --push_to_hub False \
  --trust_remote_code
#!/usr/bin/env bash
python run_audio_classification.py \
  --model_name_or_path "facebook/mms-lid-126" \
  --train_dataset_name "parler-tts/concatenated-normalized-accent-dataset+parler-tts/concatenated-common-voice-15-accented" \
  --train_dataset_config_name "default+default" \
  --train_split_name "train+train" \
  --train_label_column_name "labels+labels" \
  --eval_dataset_name "parler-tts/concatenated-normalized-accent-dataset" \
  --eval_dataset_config_name "default" \
  --eval_split_name "test" \
  --eval_label_column_name "labels" \
  --output_dir "./" \
  --do_train \
  --do_eval \
  --overwrite_output_dir \
  --remove_unused_columns False \
  --fp16 \
  --fp16_full_eval \
  --learning_rate 1e-4 \
  --lr_scheduler_type "constant_with_warmup" \
  --max_length_seconds 20 \
  --min_length_seconds 5 \
  --attention_mask \
  --warmup_steps 100 \
  --max_steps 5000 \
  --per_device_train_batch_size 32 \
  --per_device_eval_batch_size 32 \
  --preprocessing_num_workers 4 \
  --dataloader_num_workers 4 \
  --logging_strategy "steps" \
  --logging_steps 10 \
  --evaluation_strategy "steps" \
  --eval_steps 1000 \
  --save_strategy "no" \
  --save_steps 5000 \
  --filter_threshold 0.01 \
  --freeze_base_model False \
  --gradient_checkpointing \
  --push_to_hub False \
  --trust_remote_code
command:
  - python3
  - ${program}
  - --load_best_model_at_end
  - --fp16
  - --fp16_full_eval
  - --do_train
  - --do_eval
  - --trust_remote_code
  - --overwrite_output_dir
  - --ignore_mismatched_sizes
  - --gradient_checkpointing
  - ${args}
method: grid
metric:
  goal: maximize
  name: eval/accuracy
parameters:
  model_name_or_path:
    value: facebook/mms-lid-126
  train_dataset_name:
    value: parler-tts/concatenated-accent-dataset
  train_dataset_config_name:
    value: default
  train_split_name:
    value: train
  train_label_column_name:
    value: labels
  eval_dataset_name:
    value: parler-tts/concatenated-accent-dataset
  eval_dataset_config_name:
    value: default
  eval_split_name:
    value: test
  eval_label_column_name:
    value: labels
  output_dir:
    value: ./
  remove_unused_columns:
    value: false
  learning_rate:
    value: 1e-4
  lr_scheduler_type:
    value: constant_with_warmup
  max_length_seconds:
    value: 20  # give some data diversity for longer audio samples
  min_length_seconds:
    value: 7
  attention_mask:
    value: true
  warmup_steps:
    value: 100
  max_steps:
    value: 2000
  per_device_train_batch_size:
    value: 32
  per_device_eval_batch_size:
    value: 16
  preprocessing_num_workers:
    value: 4
  dataloader_num_workers:
    value: 4
  logging_strategy:
    value: steps
  logging_steps:
    value: 10
  evaluation_strategy:
    value: steps
  eval_steps:
    value: 1000
  save_strategy:
    value: steps
  save_steps:
    value: 2000
  metric_for_best_model:
    value: accuracy
  freeze_base_model:
    values:
      - false
      - true
  group_by_length:
    value: false  # TODO(SG): batch by length
  push_to_hub:
    value: false
program: run_audio_classification.py
project: mms-lid-accent-classification
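Since method: grid exhausts every combination of the listed values, and freeze_base_model is the only multi-valued parameter here, this sweep amounts to exactly two runs (frozen vs. unfrozen base model). Launching follows the same wandb sweep / wandb agent pattern shown after the first config; a hedged one-liner, with placeholders as before:

# --count 2 stops the agent once both grid combinations have run.
wandb agent --count 2 <entity>/mms-lid-accent-classification/<sweep_id>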
#!/usr/bin/env bash
python run_dataset_concatenation.py \
--dataset_name "sanchit-gandhi/vctk+facebook/voxpopuli+edinburghcstr/edacc-normalized" \
--dataset_config_name "default+en_accented+default" \
--dataset_split_name "train+test+validation" \
--label_column_name "accent+accent+accent" \
--text_column_name "text+normalized_text+text" \
--speaker_column_name "speaker_id+speaker_id+speaker" \
--batch_size 500 \
--output_dir "./concatenated-dataset"
python run_dataset_concatenation.py \
--dataset_name "edinburghcstr/edacc-normalized" \
--dataset_config_name "default" \
--dataset_split_name "test" \
--label_column_name "accent" \
--text_column_name "text" \
--speaker_column_name "speaker" \
--batch_size 500 \
--output_dir "./concatenated-dataset-test"
#!/usr/bin/env bash
python run_dataset_concatenation.py \
--dataset_name "parler-tts/common_voice_15_0_accented" \
--dataset_config_name "en" \
--dataset_split_name "train" \
--label_column_name "accent" \
--text_column_name "sentence" \
--speaker_column_name "client_id" \
--batch_size 250 \
--preprocessing_num_workers 4 \
--output_dir "./concatenated-dataset-cv"
python run_dataset_concatenation.py \
--dataset_name "parler-tts/common_voice_15_0_accented" \
--dataset_config_name "en" \
--dataset_split_name "test" \
--label_column_name "accent" \
--text_column_name "sentence" \
--speaker_column_name "client_id" \
--batch_size 250 \
--preprocessing_num_workers 4 \
--output_dir "./concatenated-dataset-cv-test"
import csv
import os
import re
import shutil
import sys
from dataclasses import dataclass, field

import soundfile as sf
from datasets import Audio, Dataset, DatasetDict, load_dataset
from tqdm import tqdm
from transformers import HfArgumentParser


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to use for dataset preparation.
    """

    dataset_dir: str = field(
        default=None,
        metadata={
            "help": "Path where the EdAcc tar.gz archive is extracted. Leave it in its raw format: the script will "
            "assume it's unchanged from the download and use relative paths to load the relevant audio files."
        },
    )
    output_dir: str = field(
        default=None,
        metadata={
            "help": "Where to save the processed dataset to disk. If unspecified, uses a 'pretty' version of the "
            "original dataset name. E.g. 'facebook/voxpopuli' will be saved under 'voxpopuli'."
        },
    )
    overwrite_output_dir: bool = field(
        default=True,
        metadata={"help": "Overwrite the content of the output directory."},
    )
    push_to_hub: bool = field(
        default=False,
        metadata={"help": "Whether or not to push the processed dataset to the Hub."},
    )
    hub_dataset_id: str = field(
        default=None,
        metadata={"help": "Repository namespace if pushing to the Hugging Face Hub."},
    )
    private_repo: bool = field(
        default=True,
        metadata={"help": "Whether or not to push the processed dataset to a private repository on the Hub."},
    )
    max_samples: int = field(
        default=None,
        metadata={"help": "Maximum number of samples per split. Useful for debugging purposes."},
    )


def main():
    # 1. Parse input arguments
    parser = HfArgumentParser(DataTrainingArguments)
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        data_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))[0]
    else:
        data_args = parser.parse_args_into_dataclasses()[0]

    # 2. Load accents for each speaker
    linguistic_background = {}
    linguistic_background_csv = os.path.join(data_args.dataset_dir, "linguistic_background.csv")
    with open(linguistic_background_csv, encoding="utf-8") as file:
        reader = csv.DictReader(file, delimiter=",")
        for line in reader:
            linguistic_background[line["PARTICIPANT_ID"]] = line[
                "How would you describe your accent in English? (e.g. Italian, Glaswegian)"
            ]

    accent_dataset = load_dataset("edinburghcstr/edacc_accents", split="train")

    def format_dataset(batch):
        batch["speaker_id"] = (
            batch["Final-Participant_ID"].replace("EAEC", "EDACC").replace("P1", "-A").replace("P2", "-B")
        )
        return batch

    accent_dataset = accent_dataset.map(format_dataset, remove_columns=["Final-Participant_ID"])

    # 3. Clean accents for each speaker
    linguistic_background_clean = {
        participant: accent.strip()
        for participant, accent in zip(accent_dataset["speaker_id"], accent_dataset["English_Variety"])
    }
    linguistic_variety = {
        participant: l1.strip() for participant, l1 in zip(accent_dataset["speaker_id"], accent_dataset["L1_Variety"])
    }

    # 4. Initialize dataset dict
    raw_datasets = DatasetDict()
    if data_args.overwrite_output_dir and os.path.exists(data_args.output_dir) and os.path.isdir(data_args.output_dir):
        shutil.rmtree(data_args.output_dir)
    output_dir_processed = os.path.join(data_args.output_dir, "processed")

    # 5. Iterate over dev/test files
    for split, split_formatted in zip(["dev", "test"], ["validation", "test"]):
        data_dir = os.path.join(data_args.dataset_dir, split)
        metadata = os.path.join(data_dir, "stm")
        output_dir_split = os.path.join(output_dir_processed, split)
        os.makedirs(output_dir_split, exist_ok=True)

        all_speakers = []
        all_genders = []
        all_l1s = []
        all_texts = []
        all_audio_paths = []
        all_normalized_accents = []
        all_raw_accents = []

        current_audio = None
        current_audio_array = None
        current_sampling_rate = None
        current_counter = 1
        gender_pat = r".*?\<(.*),.*"
        l1_pat = r".*?\,(.*)>.*"

        with open(metadata, "r") as file:
            for idx, line in tqdm(enumerate(file), desc=split):
                # example line is: 'EDACC-C06 1 EDACC-C06-A 0.00 5.27 <male,l1> C ELEVEN DASH P ONE\n'
                # the transcription always comes to the right of the metadata's closing angle bracket
                text_idx = line.find(">") + 1
                all_texts.append(line[text_idx + 1 : -1])
                # the metadata immediately precedes this
                line = line[:text_idx]
                file_id, channel, speaker, start, end, gender_l1 = line.split(" ")
                # add speaker information to cumulative lists
                all_raw_accents.append(linguistic_background[speaker])
                all_normalized_accents.append(linguistic_background_clean[speaker])
                all_speakers.append(speaker)
                # add gender/l1 information
                all_genders.append(re.search(gender_pat, gender_l1).group(1))
                all_l1s.append(linguistic_variety[speaker])
                # read audio file if different from previous
                if file_id != current_audio:
                    current_audio_array, current_sampling_rate = sf.read(
                        os.path.join(data_args.dataset_dir, "data", file_id + ".wav")
                    )
                    current_audio = file_id
                    current_counter = 1
                else:
                    current_counter += 1
                # chunk audio file according to start/end times
                start = int(float(start) * current_sampling_rate)
                end = int(float(end) * current_sampling_rate)
                end = min(end, len(current_audio_array))
                chunked_audio = current_audio_array[start:end]
                save_path = os.path.join(output_dir_split, f"{file_id}-{current_counter}.wav")
                sf.write(save_path, chunked_audio, current_sampling_rate)
                all_audio_paths.append(save_path)

                if data_args.max_samples is not None and (data_args.max_samples - 1) == idx:
                    break

        raw_datasets[split_formatted] = Dataset.from_dict(
            {
                "speaker": all_speakers,
                "text": all_texts,
                "accent": all_normalized_accents,
                "raw_accent": all_raw_accents,
                "gender": all_genders,
                "l1": all_l1s,
                "audio": all_audio_paths,
            }
        ).cast_column("audio", Audio())

    if data_args.push_to_hub:
        raw_datasets.push_to_hub(data_args.hub_dataset_id, private=data_args.private_repo, token=True)

    raw_datasets.save_to_disk(data_args.output_dir)


if __name__ == "__main__":
    main()
#!/usr/bin/env bash
python prepare_edacc.py \
  --dataset_dir "/fsx/sanchit/edacc/edacc_v1.0" \
  --output_dir "/fsx/sanchit/edacc_processed" \
  --hub_dataset_id "edinburghcstr/edacc-normalized" \
  --push_to_hub
#!/usr/bin/env bash
accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=8 run_prompt_creation.py \
--dataset_name "ylacombe/libritts_r_tags_tagged_10k" \
--dataset_config_name "clean" \
--model_name_or_path "mistralai/Mistral-7B-Instruct-v0.2" \
--per_device_eval_batch_size 64 \
--attn_implementation "sdpa" \
--dataloader_num_workers 4 \
--output_dir "./libritts_r_tags_tagged_10k_generated" \
--load_in_4bit \
--push_to_hub \
--hub_dataset_id "parler-tts/libritts_r_tags_tagged_10k_generated"
accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=8 run_prompt_creation.py \
--dataset_name "ylacombe/libritts_r_tags_tagged_10k" \
--dataset_config_name "other" \
--model_name_or_path "mistralai/Mistral-7B-Instruct-v0.2" \
--per_device_eval_batch_size 64 \
--attn_implementation "sdpa" \
--dataloader_num_workers 4 \
--output_dir "./libritts_r_tags_tagged_10k_generated" \
--load_in_4bit \
--push_to_hub \
--hub_dataset_id "parler-tts/libritts_r_tags_tagged_10k_generated"
accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=8 run_prompt_creation.py \
--dataset_name "ylacombe/mls-eng-10k-tags_tagged_10k" \
--model_name_or_path "mistralai/Mistral-7B-Instruct-v0.2" \
--per_device_eval_batch_size 64 \
--attn_implementation "sdpa" \
--dataloader_num_workers 4 \
--output_dir "./mls-eng-10k-tags_tagged_10k_generated" \
--load_in_4bit \
--push_to_hub \
--hub_dataset_id "parler-tts/mls-eng-10k-tags_tagged_10k_generated"
#!/usr/bin/env bash
accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=8 run_prompt_creation.py \
--dataset_name "parler-tts/libritts-r-tags-and-text" \
--dataset_config_name "clean" \
--model_name_or_path "mistralai/Mistral-7B-Instruct-v0.2" \
--per_device_eval_batch_size 64 \
--attn_implementation "sdpa" \
--dataloader_num_workers 4 \
--output_dir "./" \
--load_in_4bit \
--push_to_hub \
--hub_dataset_id "parler-tts/libritts-r-tags-and-text-generated"
accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=8 run_prompt_creation.py \
--dataset_name "parler-tts/libritts-r-tags-and-text" \
--dataset_config_name "other" \
--model_name_or_path "mistralai/Mistral-7B-Instruct-v0.2" \
--per_device_eval_batch_size 64 \
--attn_implementation "sdpa" \
--dataloader_num_workers 4 \
--output_dir "./" \
--load_in_4bit \
--push_to_hub \
--hub_dataset_id "parler-tts/libritts-r-tags-and-text-generated"