"magic_pdf/model/vscode:/vscode.git/clone" did not exist on "d0558abb43844102ba4e7d7b56c7953531b33d67"
Commit 32092d53 authored by sanchit-gandhi

finish prompt creation

parent b4768a87
#!/usr/bin/env bash
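# Full run: annotate the dev.clean split of ylacombe/libritts_r_test_tag with Mistral-7B-Instruct-v0.2
# loaded in 4-bit, and push the generated descriptions to the Hub.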
python run_prompt_creation.py \
--dataset_name "ylacombe/libritts_r_test_tag" \
--dataset_config_name "default" \
--dataset_split_name "dev.clean" \
--model_name_or_path "mistralai/Mistral-7B-Instruct-v0.2" \
--per_device_eval_batch_size 512 \
--dataloader_num_workers 4 \
--output_dir "./" \
--load_in_4bit \
--push_to_hub \
--hub_dataset_id "sanchit-gandhi/libritts_r_test_tag_generated"
#!/usr/bin/env bash
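# Debug run: cap the split at 32 samples and use the small TinyLlama chat model with a per-device batch size of 2.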
python run_prompt_creation.py \
--dataset_name "ylacombe/libritts_r_test_tag" \
--dataset_config_name "default" \
--dataset_split_name "dev.clean" \
--max_eval_samples 32 \
--model_name_or_path "TinyLlama/TinyLlama-1.1B-Chat-v1.0" \
--per_device_eval_batch_size 2 \
--output_dir "./" \
--load_in_4bit \
--push_to_hub \
--hub_dataset_id "sanchit-gandhi/libritts_r_test_tag_generated"
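
# The script below is run_prompt_creation.py, invoked by the two launch commands above.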
import logging
import os
import shutil
import sys
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import torch
from accelerate import Accelerator
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
)

logger = logging.getLogger(__name__)

@dataclass
class ModelArguments:
    """
    Arguments pertaining to the model and tokenizer we are going to use for the prompt annotation.
    """

    model_name_or_path: str = field(
        metadata={"help": "The name of the model to use (via the transformers library) for the prompt annotation."},
    )
    per_device_eval_batch_size: int = field(
        metadata={"help": "The per-device batch size to use for inference."},
    )
    model_variant: str = field(
        default=None,
        metadata={"help": "If specified load weights from `variant` filename, *e.g.* pytorch_model.<variant>.bin."},
    )
@@ -45,13 +58,17 @@ class ModelArguments:
        default="sdpa",
        metadata={"help": "Which attn type to use: ['eager', 'sdpa', 'flash_attention_2']"},
    )
    load_in_8bit: Optional[bool] = field(
        default=False, metadata={"help": "Whether to use 8-bit precision for inference."}
    )
    load_in_4bit: Optional[bool] = field(
        default=False, metadata={"help": "Whether to use 4-bit precision for inference."}
    )
    bnb_4bit_quant_type: Optional[str] = field(
        default="nf4", metadata={"help": "Specify the quantization type (fp4 or nf4)."}
    )
    use_bnb_nested_quant: Optional[bool] = field(default=False, metadata={"help": "Use nested quantization."})
    trust_remote_code: Optional[bool] = field(
        default=False,
        metadata={
            "help": (
@@ -61,7 +78,23 @@ class ModelArguments:
            )
        },
    )
    use_fast_tokenizer: Optional[bool] = field(
        default=True, metadata={"help": "Use fast tokenizer for encoding/decoding input ids"}
    )
    token: str = field(
        default=None,
        metadata={
            "help": (
                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
            )
        },
    )
    do_sample: Optional[bool] = field(default=True, metadata={"help": "Whether to use sampling mode for generation"})
    temperature: Optional[float] = field(default=0.6, metadata={"help": "Temperature for sampling-based generation"})
    max_new_tokens: Optional[int] = field(
        default=256, metadata={"help": "Maximum number of new tokens during generation"}
    )

@dataclass
class DataArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    output_dir: str = field(
        metadata={
            "help": "Where to save the processed dataset to disk. If unspecified, uses a 'pretty' version of the "
            "original dataset name. E.g. 'facebook/voxpopuli' will be saved under 'voxpopuli'."
        },
    )
    dataset_name: str = field(
        default=None,
        metadata={"help": "The name of the dataset to use (via the datasets library)"},
    )
    dataset_config_name: Optional[str] = field(
        default=None,
        metadata={"help": "The configuration name of the dataset to use (via the datasets library)."},
    )
@@ -88,9 +125,9 @@ class DataArguments:
        default=None,
        metadata={"help": "Path to cache directory for saving and loading datasets"},
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={"help": "Maximum number of samples for generation - use for debugging purposes."},
    )
    overwrite_cache: bool = field(
        default=False,
@@ -100,8 +137,29 @@ class DataArguments:
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    dataloader_num_workers: Optional[int] = field(
        default=0,
        metadata={"help": "The number of processes to use for the dataloader."},
    )
    push_to_hub: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether or not to push the processed dataset to the Hub."},
    )
    hub_dataset_id: Optional[str] = field(
        default=None,
        metadata={"help": "Repository namespace if pushing to the Hugging Face Hub."},
    )
    overwrite_output_dir: Optional[bool] = field(
        default=False,
        metadata={"help": "Overwrite the content of the output directory each time the script is run."},
    )

    def __post_init__(self):
        if self.push_to_hub and self.hub_dataset_id is None:
            raise ValueError("You must specify the `hub_dataset_id` when setting `--push_to_hub=True`")

def get_quantization_config(model_args: ModelArguments) -> Union[BitsAndBytesConfig, None]:
    if model_args.load_in_4bit:
        compute_dtype = torch.float16
        if model_args.torch_dtype not in {"auto", None}:
@@ -122,14 +180,33 @@ def get_quantization_config(model_args: ModelArguments)
    return quantization_config
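

# Device placement helpers: when CUDA is available, each process maps the whole (possibly quantized)
# model onto its own local GPU via `device_map={"": local_process_index}`; on CPU no device map is used.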
def get_current_device() -> Union[int, str]:
    """Get the current device. For GPU we return the local process index to enable multiple GPU training."""
    return Accelerator().local_process_index if torch.cuda.is_available() else "cpu"


def get_kbit_device_map() -> Union[Dict[str, int], None]:
    """Useful for running inference with quantized models by setting `device_map=get_kbit_device_map()`."""
    return {"": get_current_device()} if torch.cuda.is_available() else None

@dataclass
class DataCollatorWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    """

    tokenizer: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # pad the prompt token ids to the longest sequence in the batch and return the attention mask
        input_ids = {"input_ids": [feature["input_ids"] for feature in features]}
        batch = self.tokenizer.pad(input_ids, return_tensors="pt", padding="longest", return_attention_mask=True)
        return batch
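
# Note: the tokenizer below is loaded with padding_side="left", so padding tokens are added before the
# prompt and generation continues directly from the final prompt token in every row of the batch.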

def main():
    # 1. Parse input arguments
    parser = HfArgumentParser((ModelArguments, DataArguments))
@@ -149,6 +226,12 @@ def main():
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    accelerator = Accelerator()

    if data_args.overwrite_output_dir and os.path.isdir(data_args.output_dir):
        logger.info("Cleaning output dir from previous run...")
        shutil.rmtree(data_args.output_dir)

    # 3. Load pre-trained model
    logger.info("*** Load pretrained model ***")
    torch_dtype = (
@@ -166,13 +249,125 @@ def main():
        device_map=get_kbit_device_map() if quantization_config is not None else None,
        quantization_config=quantization_config,
        low_cpu_mem_usage=True,
        token=model_args.token,
    ).eval()
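
    # Load the tokenizer with left padding for batched generation; if the checkpoint defines no pad token,
    # fall back to the BOS id for tokenizer padding and to the EOS id for the generation pad id.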
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.model_name_or_path,
        revision=model_args.model_revision,
        trust_remote_code=model_args.trust_remote_code,
        use_fast=model_args.use_fast_tokenizer,
        padding_side="left",
    )
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.bos_token_id
        model.generation_config.pad_token_id = model.generation_config.eos_token_id

    # 4. Load annotation dataset
    raw_datasets = load_dataset(
        data_args.dataset_name,
        data_args.dataset_config_name,
        split=data_args.dataset_split_name,
        cache_dir=model_args.cache_dir,
        token=model_args.token,
        trust_remote_code=model_args.trust_remote_code,
        num_proc=data_args.preprocessing_num_workers,
    )
    raw_datasets_features = set(raw_datasets.features.keys())

    if data_args.max_eval_samples:
        raw_datasets = raw_datasets.select(range(data_args.max_eval_samples))

    EXPECTED_COLUMNS = {"speaking_rate", "noise", "reverberation", "speech_monotony"}
    if not EXPECTED_COLUMNS.issubset(raw_datasets_features):
        missing_columns = EXPECTED_COLUMNS - raw_datasets_features
        raise ValueError(
            f"Missing columns {missing_columns} from the dataset features. Got dataset features {raw_datasets_features}"
        )
PROMPT = """ We have seven keywords that describe different attributes of an audio sample spoken by a given speaker: the speaker's gender, the speaker's accent, the amount of reverberation in the sample (high or low reverberation), the amount of noise in the sample (how clear or noisy), how monotone or animated the sample is, the speaker's pitch (high or low voice), the speaker's speed (how fast or slow the speaker is speaking).
Given these keywords, form a coherent sentence that summarises the seven attributes in a meaningful way. You can change the order of the keywords in the sentence and use common synonyms for these words, provided that the sentence summarises the attributes clearly. Keep the sentence simple - don't introduce additional information other than the keywords provided. Only return the generated sentence, not any other assistant remarks.
For example, given the following descriptors: 'female', 'Hungarian', 'slightly roomy sounding', 'fairly noisy', 'quite monotone', 'fairly low pitch', 'very slowly', a valid sentence would be: 'a woman with a deep voice speaking slowly and somewhat monotonously with a Hungarian accent in an echoey room with background noise'. Note how the seven attributes have been combined together in a simple sentence, with the ordering changed but no additional information added.
For the descriptors: {gender}, {accent}, {reverberation}, {noise}, {monotony}, {pitch}, {speaking_rate}, the corresponding sentence is:"""
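
    # The prompt asks the LLM to merge the seven attribute keywords into a single natural-language
    # description and includes one worked example to anchor the expected output format.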

    def prepare_dataset(sample):
        sample_prompt = PROMPT.replace("{gender}", sample["gender"])
        sample_prompt = sample_prompt.replace("{accent}", sample["accent"])
        sample_prompt = sample_prompt.replace("{reverberation}", sample["reverberation"])
        sample_prompt = sample_prompt.replace("{noise}", sample["noise"])
        sample_prompt = sample_prompt.replace("{monotony}", sample["monotony"])
        sample_prompt = sample_prompt.replace("{pitch}", sample["pitch"])
        sample_prompt = sample_prompt.replace("{speaking_rate}", sample["speaking_rate"])
        sample_prompt = [{"role": "user", "content": sample_prompt}]
        token_ids = tokenizer.apply_chat_template(sample_prompt)
        sample["prompt_ids"] = token_ids
        return sample
DUMMY_PROMPT = """ We have seven keywords that describe different attributes of an audio sample spoken by a given speaker: the speaker's gender, the speaker's accent, the amount of reverberation in the sample (high or low reverberation), the amount of noise in the sample (how clear or noisy), how monotone or animated the sample is, the speaker's pitch (high or low voice), the speaker's speed (how fast or slow the speaker is speaking).
Given these keywords, form a coherent sentence that summarises the seven attributes in a meaningful way. You can change the order of the keywords in the sentence and use common synonyms for these words, provided that the sentence summarises the attributes clearly. Keep the sentence simple - don't introduce additional information other than the keywords provided. Only return the generated sentence, not any other assistant remarks.
For example, given the following descriptors: 'female', 'Hungarian', 'slightly roomy sounding', 'fairly noisy', 'quite monotone', 'fairly low pitch', 'very slowly', a valid sentence would be: 'a woman with a deep voice speaking slowly and somewhat monotonously with a Hungarian accent in an echoey room with background noise'. Note how the seven attributes have been combined together in a simple sentence, with the ordering changed but no additional information added.
For the descriptors: [gender], [accent], [reverberation], [noise], [monotony], [pitch], [speaking_rate], the corresponding sentence is:"""
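
    # Debug variant of the prompt: the square-bracket placeholders are filled in from the keys in EXPECTED_COLUMNS.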

    def prepare_dummy_dataset(sample):
        sample_prompt = DUMMY_PROMPT
        for key in EXPECTED_COLUMNS:
            sample_prompt = sample_prompt.replace(f"[{key}]", sample[key])
        sample_prompt = [{"role": "user", "content": sample_prompt}]
        token_ids = tokenizer.apply_chat_template(sample_prompt)
        sample["input_ids"] = token_ids
        return sample

    with accelerator.main_process_first():
        vectorized_datasets = raw_datasets.map(
            prepare_dummy_dataset, num_proc=data_args.preprocessing_num_workers, desc="Preparing prompts"
        )

    data_collator = DataCollatorWithPadding(tokenizer)
    data_loader = DataLoader(
        vectorized_datasets,
        batch_size=model_args.per_device_eval_batch_size,
        collate_fn=data_collator,
        num_workers=data_args.dataloader_num_workers,
        pin_memory=True,
    )

    # Prepare everything with our `accelerator`
    model, data_loader = accelerator.prepare(model, data_loader)

    def generate_step(batch):
        output_ids = accelerator.unwrap_model(model).generate(
            batch["input_ids"],
            attention_mask=batch["attention_mask"],
            do_sample=model_args.do_sample,
            temperature=model_args.temperature,
            max_new_tokens=model_args.max_new_tokens,
        )
        # pad the generations to a common length across processes so they can be decoded consistently later
        output_ids = accelerator.pad_across_processes(output_ids, dim=1, pad_index=tokenizer.pad_token_id)
        return output_ids

    all_generated_ids = []
    for batch in tqdm(data_loader, disable=not accelerator.is_local_main_process):
        generated_ids = generate_step(batch)
        all_generated_ids.extend(generated_ids.cpu())

    accelerator.end_training()

    def postprocess_dataset(sample, idx):
        prompt_text = tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
        generated_text = tokenizer.decode(all_generated_ids[idx], skip_special_tokens=True)
        # strip the decoded prompt from the front of the generation to keep only the new text description
        sample["text_description"] = generated_text[len(prompt_text) :]
        return sample

    if accelerator.is_main_process:
        vectorized_datasets = vectorized_datasets.map(
            postprocess_dataset,
            num_proc=data_args.preprocessing_num_workers,
            desc="Postprocessing dataset",
            remove_columns=["input_ids"],
            with_indices=True,
        )
        vectorized_datasets.save_to_disk(data_args.output_dir)

        if data_args.push_to_hub:
            vectorized_datasets.push_to_hub(data_args.hub_dataset_id)


if __name__ == "__main__":
    main()