Unverified Commit 9ef35aa6 authored by Yoach Lacombe, committed by GitHub

Merge branch 'sanchit-gandhi:main' into add-training

parents 75ae54a8 a7231794
#!/usr/bin/env bash
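# Generate text descriptions for the "clean" config of ylacombe/libritts_r_tags_tagged_10k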
accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=8 run_prompt_creation.py \
--dataset_name "ylacombe/libritts_r_tags_tagged_10k" \
--dataset_config_name "clean" \
--model_name_or_path "mistralai/Mistral-7B-Instruct-v0.2" \
--per_device_eval_batch_size 64 \
--attn_implementation "sdpa" \
--dataloader_num_workers 4 \
--output_dir "./libritts_r_tags_tagged_10k_generated" \
--load_in_4bit \
--push_to_hub \
--hub_dataset_id "stable-speech/libritts_r_tags_tagged_10k_generated"
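
# Generate text descriptions for the "other" config of ylacombe/libritts_r_tags_tagged_10k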
accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=8 run_prompt_creation.py \
--dataset_name "ylacombe/libritts_r_tags_tagged_10k" \
--dataset_config_name "other" \
--model_name_or_path "mistralai/Mistral-7B-Instruct-v0.2" \
--per_device_eval_batch_size 64 \
--attn_implementation "sdpa" \
--dataloader_num_workers 4 \
--output_dir "./libritts_r_tags_tagged_10k_generated" \
--load_in_4bit \
--push_to_hub \
--hub_dataset_id "stable-speech/libritts_r_tags_tagged_10k_generated"
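
# Generate text descriptions for ylacombe/mls-eng-10k-tags_tagged_10k (MLS English 10k subset)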
accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=8 run_prompt_creation.py \
--dataset_name "ylacombe/mls-eng-10k-tags_tagged_10k" \
--model_name_or_path "mistralai/Mistral-7B-Instruct-v0.2" \
--per_device_eval_batch_size 64 \
--attn_implementation "sdpa" \
--dataloader_num_workers 4 \
--output_dir "./mls-eng-10k-tags_tagged_10k_generated" \
--load_in_4bit \
--push_to_hub \
--hub_dataset_id "stable-speech/mls-eng-10k-tags_tagged_10k_generated"
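
For a quick local check before launching the full 8-GPU jobs above, a single-process variant of the same command can be used. The sketch below is illustrative only: the reduced batch size, test output directory, and omission of --push_to_hub are editorial assumptions, not part of the original script.

# Hypothetical single-GPU smoke test (illustrative values, results kept local)
accelerate launch --mixed_precision=fp16 --num_processes=1 run_prompt_creation.py \
  --dataset_name "ylacombe/libritts_r_tags_tagged_10k" \
  --dataset_config_name "clean" \
  --model_name_or_path "mistralai/Mistral-7B-Instruct-v0.2" \
  --per_device_eval_batch_size 8 \
  --attn_implementation "sdpa" \
  --dataloader_num_workers 4 \
  --output_dir "./libritts_r_tags_tagged_10k_generated_smoke_test" \
  --load_in_4bit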
@@ -7,6 +7,7 @@ from typing import Any, Dict, List, Optional, Union
import torch
from accelerate import Accelerator
from accelerate.logging import get_logger
from datasets import DatasetDict, load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
@@ -18,7 +19,7 @@ from transformers import (
)
logger = logging.getLogger(__name__)
logger = get_logger(__name__, log_level="INFO")
@dataclass
@@ -207,6 +208,23 @@ class DataCollatorWithPadding:
return batch
# TODO(SG): add accent keyword
PROMPT = """You will be given six descriptive keywords related to an audio sample of a person's speech. These keywords include:
1. The gender (e.g., male, female)
2. The level of reverberation (e.g., very roomy sounding, quite roomy sounding, slightly roomy sounding, moderate reverberation, slightly confined sounding, quite confined sounding, very confined sounding)
3. The amount of noise in the sample (e.g., very noisy, quite noisy, slightly noisy, moderate ambient sound, slightly clear, quite clear, very clear)
4. The tone of the speaker's voice (e.g., very monotone, quite monotone, slightly monotone, moderate intonation, slightly expressive, quite expressive, very expressive)
5. The pace of the speaker's delivery (e.g., very slowly, quite slowly, slightly slowly, moderate speed, slightly fast, quite fast, very fast)
6. The pitch of the speaker's voice (e.g., very low pitch, quite low pitch, slightly low pitch, moderate pitch, slightly high pitch, quite high pitch, very high pitch)
Your task is to create a text description using these keywords that accurately describes the speech sample while ensuring the description remains grammatically correct and easy to understand. You should rearrange the keyword order as necessary, and substitute synonymous terms where appropriate. If the amount of noise is 'very noisy' and the level of reverberation is 'very roomy sounding', include terms like 'very bad recording' in the description. Likewise, if the amount of noise is 'very clear' and the level of reverberation is 'very confined sounding', include terms like 'very good recording' in the description. Otherwise, do not add extra details beyond what has been provided, and only return the generated description.
For example, given the following keywords: 'female', 'slightly roomy sounding', 'slightly noisy', 'very expressive', 'slightly low pitch', 'very slowly', a valid description would be: 'a woman with a deep voice speaks slowly but has an animated delivery in an echoey room with some background noise'.
For the keywords: '[gender]', '[reverberation]', '[noise]', '[speech_monotony]', '[pitch]', '[speaking_rate]', the corresponding description is:
"""
def main():
# 1. Parse input arguments
parser = HfArgumentParser((ModelArguments, DataArguments))
@@ -219,7 +237,6 @@ def main():
# 2. Setup logging
# Make one log on every process with the configuration for debugging.
logger.setLevel(logging.INFO)
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
@@ -323,20 +340,6 @@ def main():
tokenizer.pad_token_id = tokenizer.bos_token_id
model.generation_config.pad_token_id = model.generation_config.eos_token_id
# TODO(SG): add accent keyword
PROMPT = (
"You will be given six descriptive keywords related to an audio sample of a person's speech. These keywords include:\n"
"1. The gender (e.g., male, female)\n"
"2. The level of reverberation (e.g., very roomy sounding, quite roomy sounding, slightly roomy sounding, moderate reverberation, slightly confined sounding, quite confined sounding, very confined sounding)\n"
"3. The amount of noise the sample (e.g., very noisy, quite noisy, slightly noisy, moderate ambient sound, slightly clear, quite clear, very clear)\n"
"4. The tone of the speaker's voice (e.g., very monotone, quite monotone, slightly monotone, moderate intonation, slightly expressive, quite expressive, very expressive)\n"
"5. The pace of the speaker's delivery (e.g., very slowly, quite slowly, slightly slowly, moderate speed, slightly fast, quite fast, very fast)\n"
"6. The pitch of the speaker's voice (e.g., very low pitch, quite low pitch, slightly low pitch, moderate pitch, slightly high pitch, quite high pitch, very high pitch)\n"
"Your task is to create a text description using these keywords that accurately describes the speech sample while ensuring the description remains grammatically correct and easy to understand. You can rearrange the keyword order as necessary, and substitute synonymous terms where appropriate. If the amount of noise is 'very noisy' and the level of reverberation is 'very roomy sounding', include the term 'very bad recording' in the description. Likewise, if the amount of noise is 'very clear' and the level of reverberation is 'very confined sounding', include the term 'very good recording' in the description. Otherwise, do not add extra details beyond what has been provided, and only return the generated description.\n"
"For example, given the following keywords: 'female', 'slightly roomy sounding', 'slightly noisy', 'quite monotone', 'slightly low pitch', 'very slowly', a valid description would be: 'a woman with a deep voice speaking slowly and somewhat monotonously in an echoey room with background noise'.\n"
"For the keywords: '[gender]', '[reverberation]', '[noise]', '[speech_monotony]', '[pitch]', '[speaking_rate]', the corresponding description is:"
)
def prepare_dataset(sample):
sample_prompt = PROMPT
for key in EXPECTED_COLUMNS:
@@ -366,6 +369,12 @@ def main():
output_ids = accelerator.pad_across_processes(output_ids, dim=1, pad_index=tokenizer.pad_token_id)
return output_ids
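# Editorial note: unlike the closure-based postprocess_dataset removed further down,
# this version reads the generation from a "generated_ids" column (added to the
# dataset after the generation loop), so it can run under datasets.map on the main
# process without indexing into a Python list.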
def postprocess_dataset(sample):
prompt_text = tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
generated_text = tokenizer.decode(sample["generated_ids"], skip_special_tokens=True)
sample["text_description"] = generated_text[len(prompt_text) :]
return sample
for split in vectorized_datasets:
data_loader = DataLoader(
vectorized_datasets[split],
@@ -380,21 +389,16 @@ def main():
for batch in tqdm(data_loader, disable=not accelerator.is_local_main_process):
generated_ids = generate_step(batch)
generated_ids = accelerator.gather_for_metrics(generated_ids)
all_generated_ids.extend(generated_ids.cpu())
all_generated_ids.extend(generated_ids.cpu().numpy())
def postprocess_dataset(sample, idx):
prompt_text = tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
generated_text = tokenizer.decode(all_generated_ids[idx], skip_special_tokens=True)
sample["text_description"] = generated_text[len(prompt_text) :]
return sample
vectorized_datasets[split] = vectorized_datasets[split].add_column("generated_ids", all_generated_ids)
if accelerator.is_main_process:
vectorized_datasets[split] = vectorized_datasets[split].map(
postprocess_dataset,
num_proc=data_args.preprocessing_num_workers,
desc="Postprocessing dataset",
remove_columns=["input_ids"],
with_indices=True,
remove_columns=["input_ids", "generated_ids"],
)
if accelerator.is_main_process: