Commit f75058c7 authored by Rayyyyy

First add.

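# ===========================================================================
# First script added in this commit: sliding-window perplexity evaluation on
# long documents (PG19 by default).
# ===========================================================================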
import os
import datasets
import time
import torch
from datetime import timedelta
from typing import Optional
from collections import defaultdict
from dataclasses import dataclass, field, asdict
from accelerate import Accelerator, InitProcessGroupKwargs
from transformers import HfArgumentParser
from torch.utils.data import DataLoader
from src import ModelArgs, DatasetProcessFn, DefaultDataCollator, FileLogger, get_model_and_tokenizer, makedirs, split_file_dir_name_ext, evaluate_perplexity


@dataclass
class Args(ModelArgs):
    eval_data: str = field(
        default="activation-beacon:lm/pg19.json",
        metadata={'help': 'The evaluation json data path.'}
    )
    output_dir: str = field(
        default="data/results/lm/",
        metadata={'help': 'Output directory for results and logs.'}
    )
    retokenize: bool = field(
        default=False,
        metadata={'help': 'Retokenize the corpus?'}
    )
    tokenize_max_char: Optional[int] = field(
        default=None,
        metadata={'help': 'Truncate each text to this number of characters before tokenization.'}
    )
    batch_size: int = field(
        default=1,
        metadata={'help': 'Evaluation batch size.'}
    )
    padding_side: str = field(
        default="right",
        metadata={'help': 'Which side to pad?'}
    )
    stride: int = field(
        default=2048,
        metadata={'help': 'Sliding-window stride when evaluating perplexity.'}
    )
    max_sample_num: int = field(
        default=100,
        metadata={'help': 'How many samples to evaluate in eval_data?'}
    )
    min_length: Optional[int] = field(
        default=None,
        metadata={'help': 'Minimum length for input_ids.'}
    )


def process_lm_pre(tokenizer, tokenize_max_char=None):
    @DatasetProcessFn()
    def _process(text, **kwds):
        if tokenize_max_char is not None:
            text = text[:tokenize_max_char]
        output = {"input_ids": tokenizer.encode(text, add_special_tokens=False)}
        return output
    return _process


def process_lm(tokenizer, max_length=4096, stride=1024, min_length=None):
    # stride=0 indicates a single forward pass with max_length tokens for each text
    if stride == 0:
        stride = max_length
        jump = True
    else:
        jump = False

    test = tokenizer.encode("test")
    has_bos = False
    if test[0] == tokenizer.bos_token_id:
        # NOTE: subtract 1 because one position will be occupied by the bos token
        max_length -= 1
        has_bos = True

    @DatasetProcessFn(augment=True)
    def _process(input_ids, _index, **kwds):
        outputs = defaultdict(list)
        seq_len = len(input_ids)
        prev_end_loc = 0

        if min_length is not None and seq_len < min_length:
            return

        for start_loc in range(0, seq_len, stride):
            end_loc = min(start_loc + max_length, seq_len)
            sub_seq_len = end_loc - start_loc
            sub_trg_len = end_loc - prev_end_loc  # may differ from stride on the last loop
            sub_input_ids = input_ids[start_loc: end_loc]
            sub_attention_mask = [1 for _ in range(sub_seq_len)]
            if has_bos:
                sub_input_ids.insert(0, tokenizer.bos_token_id)
                sub_attention_mask.insert(0, 1)
                sub_seq_len += 1
            # only the last sub_trg_len tokens of the window are scored; the rest are masked with -100
            sub_labels = sub_input_ids.copy()
            sub_labels[:-sub_trg_len] = [-100 for _ in range(sub_seq_len - sub_trg_len)]
            sub_inputs = {
                "index": _index,
                "input_ids": sub_input_ids,
                "attention_mask": sub_attention_mask,
                "labels": sub_labels,
            }
            for k, v in sub_inputs.items():
                outputs[k].append(v)
            prev_end_loc = end_loc
            # NOTE: stop once end_loc reaches seq_len (or after the first window when stride=0)
            if end_loc == seq_len or jump:
                break
        return outputs
    return _process
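

# ---------------------------------------------------------------------------
# NOTE: `evaluate_perplexity` is imported from `src` and its implementation is
# not part of this commit. The function below is only an illustrative sketch
# (never called here) of how the windows produced by `process_lm` translate
# into a perplexity number: only positions whose label is not -100 contribute
# to the negative log-likelihood, so overlapping windows are not double-counted.
@torch.no_grad()
def _sketch_sliding_window_perplexity(model, dataloader, device="cuda"):
    import math
    total_nll, total_tokens = 0.0, 0
    for batch in dataloader:
        # "index" only records which document a window came from; the model does not need it
        batch = {k: v.to(device) for k, v in batch.items() if k != "index"}
        logits = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]).logits
        # shift so that position t predicts token t + 1
        shift_logits = logits[:, :-1]
        shift_labels = batch["labels"][:, 1:]
        nll = torch.nn.functional.cross_entropy(
            shift_logits.reshape(-1, shift_logits.size(-1)),
            shift_labels.reshape(-1),
            ignore_index=-100,
            reduction="sum",
        )
        total_nll += nll.item()
        total_tokens += (shift_labels != -100).sum().item()
    # perplexity = exp of the mean negative log-likelihood over scored tokens
    return math.exp(total_nll / total_tokens)
# ---------------------------------------------------------------------------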


def main():
    parser = HfArgumentParser([Args])
    args: Args = parser.parse_args_into_dataclasses()[0]

    # increase the process group timeout to avoid timeouts during long evaluations
    accelerator = Accelerator(cpu=args.cpu, kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=100000))])
    model, tokenizer = get_model_and_tokenizer(args, accelerator=accelerator)

    _, dataset_name, _ = split_file_dir_name_ext(args.eval_data)
    tokenized_dataset_path = os.path.join(args.output_dir, dataset_name, "tokenized_inputs")

    with accelerator.main_process_first():
        if not os.path.exists(tokenized_dataset_path) or args.retokenize:
            pre_process_fn = process_lm_pre(tokenizer=tokenizer, tokenize_max_char=args.tokenize_max_char)
            raw_dataset = datasets.load_dataset("json", data_files=args.eval_data, cache_dir=args.dataset_cache_dir, split="train")
            tokenized_dataset = raw_dataset.map(pre_process_fn, batched=True, num_proc=32, remove_columns=raw_dataset.column_names, batch_size=32)
            tokenized_dataset.save_to_disk(tokenized_dataset_path)

        tokenized_dataset = datasets.load_from_disk(tokenized_dataset_path)
        process_fn = process_lm(tokenizer, max_length=args.max_length, stride=args.stride, min_length=args.min_length)

        if len(tokenized_dataset) > args.max_sample_num:
            # keep only max_sample_num samples for evaluation
            tokenized_dataset = tokenized_dataset.train_test_split(args.max_sample_num, shuffle=False)["test"]

        dataset = tokenized_dataset.map(process_fn, batched=True, num_proc=32, remove_columns=tokenized_dataset.column_names, keep_in_memory=True, with_indices=True)

    data_collator = DefaultDataCollator(tokenizer=tokenizer)
    dataloader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        collate_fn=data_collator,
        # only pin memory when running on gpu
        pin_memory=not args.cpu,
    )

    t1 = time.time()
    perplexity = evaluate_perplexity(model, dataloader, accelerator)
    t2 = time.time()

    # peak GPU memory in MiB
    memory = torch.cuda.max_memory_allocated() / 1024 ** 2
    metrics = {"perplexity": perplexity, "time": round((t2 - t1) / len(dataset), 4), "memory": memory}

    if accelerator.process_index == 0:
        log_path = os.path.join(args.output_dir, f"{dataset_name}.log")
        file_logger = FileLogger(makedirs(log_path))
        file_logger.log(metrics, Args=asdict(args))


if __name__ == "__main__":
    main()
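

# ===========================================================================
# Second script added in this commit: LongBench evaluation. It generates
# answers for the QA / summarization / few-shot / code tasks listed in
# `dataset_names` and scores them with the metrics from `longbench_utils`.
# ===========================================================================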
import os
import datasets
import json
import torch
from tqdm import tqdm
from typing import Optional, Dict, List
from collections import defaultdict
from dataclasses import dataclass, field, asdict
from accelerate import Accelerator
from transformers import HfArgumentParser
from transformers.utils import logging
from torch.utils.data import DataLoader
from src import ModelArgs, DatasetProcessFn, DefaultDataCollator, FileLogger, get_model_and_tokenizer, makedirs
from .longbench_utils import DATASET2PROMPT, DATASET2MAXNEWTOKENS, DATASET2CATEGORY, scorer, scorer_e
logger = logging.get_logger(__name__)


@dataclass
class Args(ModelArgs):
    eval_data: str = field(
        default="activation-beacon:longbench/test.json",
        metadata={'help': 'The evaluation json data path.'}
    )
    output_dir: str = field(
        default="data/results/longbench/",
        metadata={'help': 'Output directory for results and logs.'}
    )
    batch_size: int = field(
        default=1,
        metadata={'help': 'Evaluation batch size.'}
    )
    dataset_names: List[str] = field(
        default_factory=lambda: ['narrativeqa', 'qasper', 'multifieldqa_en', 'hotpotqa', '2wikimqa', 'musique', 'gov_report', 'qmsum', 'multi_news', 'trec', 'triviaqa', 'samsum', 'lcc', 'repobench-p'],
        metadata={'help': 'Which datasets to evaluate?'}
    )
    model_name_or_path: str = field(
        default="meta-llama/Llama-2-7b-chat-hf",
        metadata={'help': 'Model name on huggingface.'}
    )
    max_length: int = field(
        default=3500,
        metadata={'help': 'Max input length.'}
    )
    truncate_from_middle: bool = field(
        default=True,
        metadata={'help': 'Truncate inputs from the middle.'}
    )
    load_result: bool = field(
        default=False,
        metadata={'help': 'Load results from saved files?'}
    )


def process_longbench(tokenizer, prompt_templates: Optional[Dict] = None, max_length=3500, add_chat_inst=False, truncate_from_middle=True):
    @DatasetProcessFn()
    def _process(input: str, context: str, dataset: str, all_classes: Optional[List], answers: List[str], length: int, _index: int, **kwds):
        output = {}
        if dataset.endswith("_e"):
            dataset = dataset[:-2]
        prompt_template = prompt_templates[dataset]
        prompt = prompt_template.format(input=input, context=context)

        if truncate_from_middle:
            tokenized_prompt = tokenizer.encode(prompt)
            if len(tokenized_prompt) > max_length:
                half = int(max_length / 2)
                prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True) + tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True)
        else:
            tokenized_prompt = tokenizer.encode(prompt)
            prompt = tokenizer.decode(tokenized_prompt[-max_length:], skip_special_tokens=True)

        # chat models tend to do better without the chat instruction wrapper on these tasks, so only add it when requested
        if add_chat_inst:
            prompt = f"[INST]{prompt}[/INST]"

        output = tokenizer(prompt, padding=False, truncation=False)
        output["dataset"] = dataset
        output["idx"] = _index
        return output
    return _process
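

# ---------------------------------------------------------------------------
# NOTE: an illustrative sketch (never called here) of the middle truncation
# used in `process_longbench` above: when the tokenized prompt is too long,
# keep the first and last max_length // 2 tokens and drop the middle, so that
# both the task instruction at the beginning and the question at the end of
# the prompt survive truncation.
def _sketch_truncate_from_middle(token_ids: List[int], max_length: int) -> List[int]:
    if len(token_ids) <= max_length:
        return token_ids
    half = max_length // 2
    return token_ids[:half] + token_ids[-half:]
# ---------------------------------------------------------------------------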


@torch.no_grad()
def main():
    parser = HfArgumentParser([Args])
    args = parser.parse_args_into_dataclasses()[0]

    if ".e." in args.eval_data:
        args.output_dir = args.output_dir.replace("longbench", "longbench_e")
    else:
        args.output_dir = args.output_dir.replace("longbench_e", "longbench")

    result_dir_components = [args.output_dir, args.model_name_or_path.strip(os.sep).replace(os.sep, "--"), str(args.max_length)]
    result_dir = os.path.join(*result_dir_components)

    accelerator = Accelerator(cpu=args.cpu)
    model, tokenizer = get_model_and_tokenizer(args, accelerator=accelerator)

    with accelerator.main_process_first():
        process_fn = process_longbench(
            tokenizer,
            max_length=args.max_length,
            prompt_templates=DATASET2PROMPT,
            add_chat_inst=args.add_chat_inst,
            truncate_from_middle=args.truncate_from_middle,
        )
        raw_dataset = datasets.load_dataset("json", data_files=args.eval_data, cache_dir=args.dataset_cache_dir, split="train")
        dataset = raw_dataset.map(process_fn, batched=True, num_proc=32, with_indices=True, remove_columns=raw_dataset.column_names)

    groupby_dataset = dataset.to_pandas().groupby("dataset")

    metrics = {}
    if args.dataset_names is None:
        dataset_names = [key for key, _ in groupby_dataset]
    else:
        dataset_names = args.dataset_names

    for i, dataset_name in enumerate(dataset_names):
        if accelerator.process_index == 0:
            logger.info(f"Evaluating {dataset_name} ({i + 1} / {len(dataset_names)})...")

        result_path = os.path.join(result_dir, f"{dataset_name}.json")

        if args.load_result and os.path.exists(result_path):
            if accelerator.process_index == 0:
                with open(result_path, encoding="utf-8") as f:
                    score = json.loads(f.readline())
                logger.info(f"{dataset_name}: {score}")
                metrics[dataset_name] = score
        else:
            dataset = datasets.Dataset.from_pandas(groupby_dataset.get_group(dataset_name), preserve_index=False)

            data_collator = DefaultDataCollator(tokenizer=tokenizer)
            dataloader = DataLoader(
                dataset,
                batch_size=args.batch_size,
                collate_fn=data_collator,
                # only pin memory when running on gpu
                pin_memory=not args.cpu,
            )
            dataloader = accelerator.prepare(dataloader)

            indices = []
            preds = []
            max_new_tokens = DATASET2MAXNEWTOKENS[dataset_name]

            for i, x in enumerate(tqdm(dataloader, desc="Generating")):
                x.pop("dataset")
                idx = x.pop("idx")[0]
                input_length = x["input_ids"].shape[1]

                # NOTE: important to reset memory for every batch
                if hasattr(model, "memory") and model.memory is not None:
                    model.memory.reset()

                # NOTE: very important to include \n as an eos token for QA and trec, otherwise the F1 score degrades drastically
                if dataset_name in ["2wikimqa", "hotpotqa", "musique", "multifieldqa_en", "qasper", "narrativeqa", "samsum"]:
                    output = model.generate(
                        **x,
                        max_new_tokens=max_new_tokens,
                        num_beams=1,
                        do_sample=False,
                        eos_token_id=[tokenizer.eos_token_id, tokenizer.encode("\n", add_special_tokens=False)[-1]],
                        # prevent sampling warnings
                        temperature=1.0,
                        top_p=1.0,
                    )
                else:
                    output = model.generate(
                        **x,
                        max_new_tokens=max_new_tokens,
                        num_beams=1,
                        do_sample=False,
                        temperature=1.0,
                        top_p=1.0,
                    )

                # (1, max_new_tokens)
                output = output[:, input_length:]
                # pad across devices to the same length
                output = accelerator.pad_across_processes(output.contiguous(), pad_index=tokenizer.pad_token_id, dim=1)
                # (num_devices, max_new_tokens)
                output = accelerator.gather_for_metrics(output)
                idx = accelerator.gather_for_metrics(idx).tolist()

                if accelerator.process_index == 0:
                    pred = tokenizer.batch_decode(output, skip_special_tokens=True)
                    preds.extend(pred)
                    if isinstance(idx, list):
                        indices.extend(idx)
                    else:
                        # single process
                        indices.append(idx)

            if accelerator.process_index == 0:
                raw_dataset_subset = raw_dataset[indices]
                answers = raw_dataset_subset["answers"]
                lengths = raw_dataset_subset["length"]
                all_classes = raw_dataset_subset["all_classes"][0]

                if '.e.' in args.eval_data:
                    score = scorer_e(dataset_name, preds, answers, lengths, all_classes)
                else:
                    score = scorer(dataset_name, preds, answers, all_classes)

                logger.info(f"{dataset_name}: {score}")
                metrics[dataset_name] = score

                with open(makedirs(result_path), "w", encoding="utf-8") as f:
                    f.write(json.dumps(score, ensure_ascii=False) + "\n")
                    for idx, pred in zip(indices, preds):
                        sample = raw_dataset[idx]
                        del sample["all_classes"]
                        del sample["context"]
                        del sample["language"]
                        del sample["_id"]
                        sample["pred"] = pred
                        f.write(json.dumps(sample, ensure_ascii=False) + "\n")

    if accelerator.process_index == 0:
        log_path = os.path.join(args.output_dir, "metrics.log")

        # compute category score
        category_metrics = defaultdict(list)
        for dataset, metric in metrics.items():
            category = DATASET2CATEGORY[dataset]
            category_metrics[category].append(metric)
        for k, v in category_metrics.items():
            # when evaluating on longbench_e, each metric is a dict of float
            if isinstance(v[0], dict):
                category_metric = {}
                for kk in v[0].keys():
                    vv = [v[j][kk] for j in range(len(v))]
                    category_metric[kk] = round(sum(vv) / len(vv), 2)
                category_metrics[k] = category_metric
            else:
                category_metrics[k] = round(sum(v) / len(v), 2)

        # compute average score
        if isinstance(next(iter(metrics.values())), dict):
            avg = defaultdict(list)
            for k, v in metrics.items():
                for kk, vv in v.items():
                    avg[kk].append(vv)
            for k, v in avg.items():
                avg[k] = round(sum(v) / len(v), 2)
        else:
            avg = round(sum(metrics.values()) / len(metrics), 2)
        metrics["avg"] = avg

        file_logger = FileLogger(makedirs(log_path))
        file_logger.log(metrics, Args=asdict(args), Category_Metrics=category_metrics)


if __name__ == "__main__":
    main()
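

# ===========================================================================
# Third script added in this commit: LongEval topic / line retrieval
# evaluation (adapted from the LongChat repository, see the link below).
# ===========================================================================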
# modified based on https://github.com/DachengLi1/LongChat/blob/longeval/longeval/eval.py
import os
import torch
import datasets
from tqdm import tqdm
from typing import List
from accelerate import Accelerator
from transformers import HfArgumentParser
from transformers.utils import logging
from torch.utils.data import DataLoader
from dataclasses import dataclass, field, asdict
from src import ModelArgs, DatasetProcessFn, DefaultDataCollator, FileLogger, get_model_and_tokenizer, makedirs, split_file_dir_name_ext
from .longbench_utils import qa_f1_score
logger = logging.get_logger(__name__)


@dataclass
class Args(ModelArgs):
    eval_data: str = field(
        default="activation-beacon:longeval/topic_retrieval.json",
        metadata={'help': 'Evaluation json data.'}
    )
    output_dir: str = field(
        default="data/results/longeval/",
        metadata={'help': 'Output directory for results and logs.'}
    )
    topic_num: List[int] = field(
        default_factory=lambda: [5, 10, 15, 20, 25],
        metadata={'help': 'How many topics in the conversation?'}
    )
    line_num: List[int] = field(
        default_factory=lambda: [200, 300, 400, 500, 600, 680],
        metadata={'help': 'How many lines in the conversation?'}
    )


def process_longeval(tokenizer, data_type, topic_num, line_num):
    @DatasetProcessFn()
    def _process(prompt, target, num_topics=None, num_lines=None, **kwds):
        # filter out samples that do not have the requested number of topics/lines
        if data_type == "topic" and num_topics not in topic_num:
            return
        elif data_type == "line" and num_lines not in line_num:
            return
        else:
            inputs = tokenizer(prompt)
            inputs["target"] = target
            inputs["input_length"] = len(inputs.input_ids)
            if num_topics is not None:
                inputs["num"] = num_topics
            elif num_lines is not None:
                inputs["num"] = num_lines
            else:
                raise ValueError("Either num_topics or num_lines must appear in the dataset fields!")
            return inputs
    return _process
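

# ---------------------------------------------------------------------------
# NOTE: `qa_f1_score` is imported from longbench_utils and not shown in this
# commit. The function below is only an illustrative sketch (never called
# here) of the standard token-level QA F1 it presumably computes; the real
# implementation also normalizes punctuation, articles and whitespace.
def _sketch_qa_f1(prediction: str, ground_truth: str) -> float:
    from collections import Counter
    pred_tokens = prediction.lower().split()
    gold_tokens = ground_truth.lower().split()
    common = Counter(pred_tokens) & Counter(gold_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)
# ---------------------------------------------------------------------------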


@torch.no_grad()
def main():
    parser = HfArgumentParser([Args])
    args: Args = parser.parse_args_into_dataclasses()[0]

    # topic or line
    data_type = split_file_dir_name_ext(args.eval_data)[1].split("_")[0]

    accelerator = Accelerator(cpu=args.cpu)
    model, tokenizer = get_model_and_tokenizer(args, accelerator=accelerator)

    with accelerator.main_process_first():
        process_fn = process_longeval(
            tokenizer,
            data_type=data_type,
            topic_num=args.topic_num,
            line_num=args.line_num
        )
        raw_dataset = datasets.load_dataset("json", data_files=args.eval_data, cache_dir=args.dataset_cache_dir, split="train")
        dataset = raw_dataset.map(process_fn, batched=True, num_proc=32, remove_columns=raw_dataset.column_names)

    groupby_dataset = dataset.to_pandas().groupby("num")
    data_collator = DefaultDataCollator(tokenizer=tokenizer)

    metrics = {}
    for num, dataset in groupby_dataset:
        dataset = datasets.Dataset.from_pandas(groupby_dataset.get_group(num), preserve_index=False)
        all_targets = dataset["target"]
        # remove columns that the model does not consume
        dataset = dataset.remove_columns(["target", "num"])

        dataloader = DataLoader(
            dataset,
            batch_size=args.batch_size,
            collate_fn=data_collator,
            # only pin memory when running on gpu
            pin_memory=not args.cpu,
        )
        # shard the dataloader across devices
        dataloader = accelerator.prepare(dataloader)

        all_lengths = []
        all_outputs = []
        for i, x in enumerate(tqdm(dataloader, desc=f"Evaluating {num} {data_type}")):
            # NOTE: important to reset memory for every batch
            if hasattr(model, "memory") and model.memory is not None:
                model.memory.reset()

            input_length = x.pop("input_length")
            outputs = model.generate(
                **x,
                max_new_tokens=50,
                do_sample=False,
                num_beams=1,
                temperature=1.0,
                top_p=1.0,
            )
            start_idx = x["input_ids"].shape[1]
            outputs = outputs[:, start_idx:]
            # must be contiguous before padding across processes
            outputs = outputs.contiguous()
            outputs = accelerator.pad_across_processes(outputs, pad_index=tokenizer.pad_token_id, dim=1)
            outputs = accelerator.gather_for_metrics(outputs).tolist()
            input_length = accelerator.gather_for_metrics(input_length).tolist()

            outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            all_outputs.extend(outputs)
            all_lengths.extend(input_length)
            # if accelerator.process_index == 0:
            #     print(f"Target:{all_targets[i]}\nPred:{repr(all_outputs[i])}")

        accuracy = 0
        f1 = 0
        for output, target in zip(all_outputs, all_targets):
            if target.lower() in output.lower():
                accuracy += 1
            else:
                accuracy += 0
            f1 += round(qa_f1_score(output, target), 4)
        accuracy /= len(all_outputs)
        f1 /= len(all_outputs)

        length = int(sum(all_lengths) / len(all_lengths))
        metrics[length] = {
            "accuracy": accuracy,
            "f1": f1,
        }
        # if accelerator.process_index == 0:
        #     print(f"Accuracy of {num} {data_type}s (average length {length}): {accuracy}")

    if accelerator.process_index == 0:
        log_path = os.path.join(args.output_dir, f"{data_type}_retrieval.log")
        file_logger = FileLogger(makedirs(log_path))
        file_logger.log(metrics, Args=asdict(args))


if __name__ == "__main__":
    main()
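

# ---------------------------------------------------------------------------
# Example (assumed) invocations of the three scripts above. The module paths
# below are placeholders, since this commit does not show where the files live
# in the package; the LongBench and LongEval scripts use relative imports
# (`from .longbench_utils import ...`), so they need to be run as modules, and
# `accelerate launch` can replace `python` for multi-GPU runs:
#
#   # sliding-window perplexity on PG19
#   python -m main.eval_lm --eval_data activation-beacon:lm/pg19.json --stride 2048
#
#   # LongBench, truncating prompts from the middle to 3500 tokens
#   python -m main.eval_longbench --max_length 3500
#
#   # LongEval topic retrieval
#   python -m main.eval_longeval --eval_data activation-beacon:longeval/topic_retrieval.json
# ---------------------------------------------------------------------------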