Commit f75058c7 authored by Rayyyyy

First add.
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP
downcast_bf16: 'no'
fsdp_config:
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_backward_prefetch: BACKWARD_PRE
fsdp_cpu_ram_efficient_loading: false
fsdp_forward_prefetch: false
fsdp_offload_params: false
fsdp_sharding_strategy: FULL_SHARD
fsdp_state_dict_type: FULL_STATE_DICT
fsdp_sync_module_states: true
fsdp_use_orig_params: true
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
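A minimal launch sketch for this Accelerate FSDP config, assuming the YAML above is saved as `data/accelerate/fsdp.yaml` (the path and output directory are illustrative, not from this commit):

```bash
accelerate launch --config_file data/accelerate/fsdp.yaml -m main.train \
    --output_dir data/outputs/fsdp-test \
    --bf16
```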
{
"mixture": {
"commoncrawl": 52.2,
"c4": 26.7,
"github": 5.2,
"book": 4.2,
"arxiv": 4.6,
"wiki": 3.8,
"stackexchange": 3.3
},
"num_tokens_avg": {
"commoncrawl": 1207,
"c4": 378,
"wiki": 393,
"stackexchange": 309,
"github": 436,
"book": 89373,
"arxiv": 7375
}
}
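The `mixture` weights are sampling percentages over the RedPajama sources, and `num_tokens_avg` records the average document length in tokens per source. As a sanity check, the weights should sum to 100; assuming the JSON above is saved as `mixture.json` (hypothetical name):

```bash
# prints 100.0 for the weights above
jq '[.mixture[]] | add' mixture.json
```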
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
gradient_accumulation_steps: 1
offload_optimizer_device: cpu
offload_param_device: cpu
zero3_init_flag: false
zero3_save_16bit_model: true
zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
compute_environment: LOCAL_MACHINE
debug: false
deepspeed_config:
gradient_accumulation_steps: 1
offload_optimizer_device: none
offload_param_device: none
zero3_init_flag: false
zero3_save_16bit_model: true
zero_stage: 3
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
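The two DeepSpeed-backed Accelerate configs above differ only in the offload devices: the first moves ZeRO-3 optimizer states and parameters to CPU (fits larger models at some speed cost), while the second keeps everything on GPU. A launch sketch, assuming one of them is saved as `data/accelerate/zero3.yaml` (hypothetical path); command-line flags override the YAML, e.g. to run on 4 of the 8 processes:

```bash
accelerate launch --config_file data/accelerate/zero3.yaml --num_processes 4 -m main.train \
    --output_dir data/outputs/zero3-test
```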
{
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"total_num_steps": "auto",
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"bf16": {
"enabled": "auto",
"loss_scale": 0,
"initial_scale_power": 16,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 1e8,
"reduce_scatter": true,
"reduce_bucket_size": 1e8,
"overlap_comm": true,
"contiguous_gradients": true,
"offload_optimizer": {
"device": "cpu"
},
"round_robin_gradients": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"total_num_steps": "auto",
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"bf16": {
"enabled": "auto",
"loss_scale": 0,
"initial_scale_power": 16,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 1e9,
"reduce_scatter": true,
"reduce_bucket_size": 1e9,
"overlap_comm": true,
"contiguous_gradients": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 2000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
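In both stage-2 files the `"auto"` entries are placeholders that the HuggingFace Trainer fills in from its own command-line arguments when the file is passed via `--deepspeed`; only the ZeRO settings are pinned, with the two files differing in bucket size (1e8 vs. 1e9) and in whether the optimizer is offloaded to CPU. Usage mirrors the training commands later in this commit:

```bash
torchrun --nproc_per_node 8 -m main.train \
    --bf16 \
    --learning_rate 1e-5 \
    --deepspeed data/deepspeed/stage2.json
```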
{
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
}
},
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"initial_scale_power": 10,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto",
"loss_scale": 0,
"initial_scale_power": 10,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 1000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
}
},
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"initial_scale_power": 10,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto",
"loss_scale": 0,
"initial_scale_power": 10,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 1000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"initial_scale_power": 10,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto",
"loss_scale": 0,
"initial_scale_power": 10,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"total_num_steps": "auto"
}
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 1000,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
# Evaluation
Make sure you have created the environment and downloaded the data according to [README](../README.md).
```bash
conda activate beacon
model=namespace-Pt/beacon-qwen-2-7b-instruct
# language modeling perplexity
torchrun --nproc_per_node 8 -m main.eval_lm --max_length 100000 --stride 32768 --model_name_or_path $model --enable_beacon --beacon_ratio_mix adapt-1024
# passkey retrieval accuracy
torchrun --nproc_per_node 8 -m main.eval_passkey --model_name_or_path $model --enable_beacon --beacon_ratio_mix adapt-1024
# needle-in-a-haystack accuracy
OPENAI_API_KEY="<your_api_key>" torchrun --nproc_per_node 8 -m main.eval_needle --model_name_or_path $model --enable_beacon --beacon_ratio_mix adapt-1024 --gpt_eval
# topic retrieval accuracy
torchrun --nproc_per_node 8 -m main.eval_topic --model_name_or_path $model --enable_beacon --beacon_ratio_mix adapt-1024
# longbench
torchrun --nproc_per_node 8 -m main.eval_longbench --model_name_or_path $model --enable_beacon --beacon_ratio_mix adapt-1024
# infinitebench
torchrun --nproc_per_node 8 -m main.eval_infbench --model_name_or_path $model --enable_beacon --beacon_ratio_mix adapt-1024
```
All evaluation results will be saved at `data/results`.
# Training
There are two stages in training:
- Pretrain
  - 1B tokens from [redpajama](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample) with auto-regressive language modeling
  - An EOS token is appended to each document; no packing is used
  - 20K maximum context length
- Finetune
  - 5K samples from [LongAlpaca](https://huggingface.co/datasets/Yukang/LongAlpaca-12k), 2K samples from [Booksum](https://huggingface.co/datasets/kmfoda/booksum), 16K synthetic long-context QA samples generated by GPT-3.5, and 5K samples from the pretraining data
  - 20K maximum context length
## Prerequisite
Make sure you have created the environment and downloaded the data according to [README](../README.md).
### Mistral
#### Pretrain
```bash
output_name=beacon-mistral-pretrain
torchrun --nproc_per_node 8 $DDP -m main.train \
--output_dir data/outputs/$output_name \
--model_name_or_path mistralai/Mistral-7B-Instruct-v0.2 \
--train_data long-llm:redpajama/train.json \
--min_length 2400 \
--max_length 20000 \
--group_by_stride strict \
--enable_beacon \
--beacon_window 2048 \
--beacon_stride 2048 \
--beacon_attn full-coverage \
--beacon_attend_prev True \
--beacon_sink_size 0 \
--beacon_ratio 2 4 8 16 32 \
--beacon_ratio_mix step-random \
--beacon_param q k v \
--beacon_pos interleave \
--attn_impl flash_attention_2 \
--gradient_checkpointing \
--use_reentrant False \
--save_only_model \
--save_strategy epoch \
--evaluation_strategy steps \
--num_train_epochs 1 \
--logging_steps 50 \
--bf16 \
--deepspeed data/deepspeed/stage2.json
```
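`$DDP` above is an optional variable for extra `torchrun` flags; it is assumed it can be left empty for single-node runs. A sketch (the multi-node values are purely illustrative):

```bash
# single node: no extra flags
DDP=""
# multi-node example (illustrative addresses):
# DDP="--nnodes 2 --node_rank 0 --master_addr 10.0.0.1 --master_port 29500"
```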
#### Finetune
```bash
output_name=beacon-mistral-finetune
torchrun --nproc_per_node 8 $DDP -m main.train \
--output_dir data/outputs/$output_name \
--model_name_or_path data/outputs/beacon-mistral-pretrain/* \
--train_data long-llm:gpt/one_detail_book.train.16K.json long-llm:gpt/one_detail_paper.train.16K.json long-llm:longalpaca/train.json long-llm:booksum/train.16K.json long-llm:needle/train.16K.json long-llm:redpajama/train.json[5000] \
--max_length 20000 \
--min_length 7200 \
--group_by_stride strict \
--enable_beacon \
--beacon_window 2048 \
--beacon_stride 2048 \
--beacon_attn full-coverage \
--beacon_attend_prev True \
--beacon_sink_size 0 \
--beacon_ratio 2 4 8 \
--beacon_ratio_mix step-random \
--beacon_param q k v \
--beacon_pos interleave \
--attn_impl flash_attention_2 \
--learning_rate 1e-5 \
--gradient_checkpointing \
--use_reentrant False \
--save_only_model \
--num_train_epochs 1 \
--save_strategy epoch \
--logging_steps 50 \
--bf16 \
--deepspeed data/deepspeed/stage2.json \
--chat_template mistral
```
### Llama-3
NOTE: according to our experiments, Llama-3 requires an attention sink (hence `--beacon_sink_size 1` in the commands below).
#### Pretrain
```bash
output_name=beacon-llama3-pretrain
torchrun --nproc_per_node 8 $DDP -m main.train \
--output_dir data/outputs/$output_name \
--model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
--train_data long-llm:redpajama/train.json \
--min_length 2400 \
--max_length 20000 \
--group_by_stride strict \
--enable_beacon \
--beacon_window 1024 \
--beacon_stride 1024 \
--beacon_attn full-coverage \
--beacon_attend_prev True \
--beacon_sink_size 1 \
--beacon_ratio 2 4 8 16 32 \
--beacon_ratio_mix step-random \
--beacon_param q k v \
--beacon_pos interleave \
--attn_impl flash_attention_2 \
--gradient_checkpointing \
--use_reentrant False \
--save_only_model \
--save_strategy epoch \
--evaluation_strategy steps \
--num_train_epochs 1 \
--logging_steps 50 \
--bf16 \
--deepspeed data/deepspeed/stage2.json
```
#### Finetune
```bash
output_name=beacon-llama3-finetune
torchrun --nproc_per_node 8 $DDP -m main.train \
--output_dir data/outputs/$output_name \
--model_name_or_path data/outputs/beacon-llama3-pretrain/* \
--train_data long-llm:gpt/one_detail_book.train.16K.json long-llm:gpt/one_detail_paper.train.16K.json long-llm:longalpaca/train.json long-llm:booksum/train.16K.json long-llm:needle/train.16K.json long-llm:redpajama/train.json[5000] \
--max_length 20000 \
--min_length 7200 \
--group_by_stride strict \
--enable_beacon \
--beacon_window 1024 \
--beacon_stride 1024 \
--beacon_attn full-coverage \
--beacon_attend_prev True \
--beacon_sink_size 1 \
--beacon_ratio 2 4 8 \
--beacon_ratio_mix step-random \
--beacon_param q k v \
--beacon_pos interleave \
--attn_impl flash_attention_2 \
--learning_rate 1e-5 \
--gradient_checkpointing \
--use_reentrant False \
--save_only_model \
--num_train_epochs 1 \
--save_strategy epoch \
--logging_steps 50 \
--bf16 \
--deepspeed data/deepspeed/stage2.json \
--chat_template llama-3
```
### Qwen-2
#### Pretrain
```bash
output_name=beacon-qwen2-pretrain
torchrun --nproc_per_node 8 $DDP -m main.train \
--output_dir data/outputs/$output_name \
--model_name_or_path Qwen/Qwen2-7B-Instruct \
--train_data long-llm:redpajama/train.json \
--min_length 2400 \
--max_length 20000 \
--group_by_stride strict \
--enable_beacon \
--beacon_window 2048 \
--beacon_stride 2048 \
--beacon_attn full-coverage \
--beacon_attend_prev True \
--beacon_sink_size 0 \
--beacon_ratio 2 4 8 16 32 \
--beacon_ratio_mix step-random \
--beacon_param q k v \
--beacon_pos interleave \
--attn_impl flash_attention_2 \
--gradient_checkpointing \
--use_reentrant False \
--save_only_model \
--save_strategy epoch \
--evaluation_strategy steps \
--num_train_epochs 1 \
--logging_steps 50 \
--bf16 \
--deepspeed data/deepspeed/stage2.json
```
#### Finetune
```bash
output_name=beacon-qwen2-finetune
torchrun --nproc_per_node 8 $DDP -m main.train \
--output_dir data/outputs/$output_name \
--model_name_or_path data/outputs/beacon-qwen2-pretrain/* \
--train_data long-llm:gpt/one_detail_book.train.16K.json long-llm:gpt/one_detail_paper.train.16K.json long-llm:longalpaca/train.json long-llm:booksum/train.16K.json long-llm:needle/train.16K.json long-llm:redpajama/train.json[5000] \
--max_length 20000 \
--min_length 7200 \
--group_by_stride strict \
--enable_beacon \
--beacon_window 2048 \
--beacon_stride 2048 \
--beacon_attn full-coverage \
--beacon_attend_prev True \
--beacon_sink_size 0 \
--beacon_ratio 2 4 8 \
--beacon_ratio_mix step-random \
--beacon_param q k v \
--beacon_pos interleave \
--attn_impl flash_attention_2 \
--learning_rate 1e-5 \
--gradient_checkpointing \
--use_reentrant False \
--save_only_model \
--num_train_epochs 1 \
--save_strategy epoch \
--logging_steps 50 \
--bf16 \
--deepspeed data/deepspeed/stage2.json \
--chat_template qwen
```
import os
import torch
from typing import List, Optional
from accelerate import Accelerator
from transformers import HfArgumentParser
from transformers.utils import logging
from torch.utils.data import DataLoader
from dataclasses import dataclass, field, asdict
from src.data import Data
from src.metrics import Metric
from src import ModelArgs, DefaultDataCollator, FileLogger, get_model_and_tokenizer, makedirs, evaluate_generation, split_file_dir_name_ext
logger = logging.get_logger(__name__)
@dataclass
class Args(ModelArgs):
eval_data: Optional[str] = field(
default=None,
metadata={'help': 'Evaluation json data.'}
)
output_dir: str = field(
default="data/results/generation/",
metadata={'help': 'The base directory for saving results and logs.'}
)
result_dir: Optional[str] = field(
default=None,
metadata={'help': 'The directory relative to output_dir for saving results.'}
)
min_length: int = field(
default=0,
metadata={'help': 'How many tokens at minimum for evaluation?'}
)
    max_length: Optional[int] = field(
default=None,
metadata={'help': 'How many tokens at maximum for evaluation?'}
)
seed: int = field(
default=42
)
    max_num: Optional[int] = field(
default=None,
metadata={'help': 'Max number of instances to evaluate.'}
)
metrics: List[str] = field(
default_factory=lambda: ["save_result"],
metadata={'help': 'List of metrics. {rouge, save_result}'}
)
@torch.no_grad()
def main():
parser = HfArgumentParser([Args])
args: Args = parser.parse_args_into_dataclasses()[0]
accelerator = Accelerator(cpu=args.cpu)
model, tokenizer = get_model_and_tokenizer(args, device=accelerator.device)
with accelerator.main_process_first():
dataset = Data.prepare_eval_data(
args.eval_data,
tokenizer=tokenizer,
max_length=args.max_length,
min_length=args.min_length,
chat_template=args.chat_template,
seed=args.seed,
max_eval_num=args.max_num,
cache_dir=args.dataset_cache_dir,
)
# get labels (the target generation result)
labels = dataset["labels"]
dataset = dataset.remove_columns(["labels"])
data_collator = DefaultDataCollator(tokenizer=tokenizer)
dataloader = DataLoader(
dataset,
batch_size=args.batch_size,
collate_fn=data_collator,
        # pin memory only when a GPU is used
pin_memory=not args.cpu,
)
# NOTE: prepare dataloader so the data moves to GPU automatically
dataloader = accelerator.prepare(dataloader)
save_path = Metric.get_save_path(
args.eval_data,
os.path.join(args.output_dir, args.result_dir) if args.result_dir is not None else args.output_dir
)
compute_metrics_fn = Metric.get_metric_fn(
metrics=args.metrics,
save_path=save_path
)
indices, outputs = evaluate_generation(
model,
dataloader,
accelerator=accelerator,
tokenizer=tokenizer,
)
if accelerator.process_index == 0:
metrics = compute_metrics_fn(outputs, labels, indices=indices)
config_save_path = os.path.join(split_file_dir_name_ext(save_path)[0], "config.json")
args.save(config_save_path)
file_logger = FileLogger(makedirs(os.path.join(args.output_dir, "metrics.log")))
file_logger.log(metrics, Args=asdict(args))
if __name__ == "__main__":
main()
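A hypothetical invocation of this generic generation evaluator; the module path `main.eval_generation` and the data file are illustrative assumptions, while the flags come from the `Args` dataclass above:

```bash
torchrun --nproc_per_node 8 -m main.eval_generation \
    --model_name_or_path namespace-Pt/beacon-qwen-2-7b-instruct \
    --eval_data data/my_eval.json \
    --metrics rouge save_result
```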
import os
import datasets
import json
import torch
import pandas as pd
from tqdm import tqdm
from functools import partial
from typing import Optional, Dict, List
from dataclasses import dataclass, field, asdict
from accelerate import Accelerator
from transformers import HfArgumentParser, AutoTokenizer
from transformers.utils import logging
from torch.utils.data import DataLoader
from src import ModelArgs, DefaultDataCollator, FileLogger, get_model_and_tokenizer, makedirs, apply_chat_template
from .infbench_utils import TASK_TO_PATH, TASK_TO_MAX_NEW_TOKENS, get_score_one, create_prompt, get_answer
logger = logging.get_logger(__name__)
@dataclass
class Args(ModelArgs):
eval_data: str = field(
default="long-llm:infbench",
metadata={'help': 'The directory of all infbench evaluation data.'}
)
output_dir: str = field(
default="data/results/infbench/",
metadata={'help': 'The base directory for saving results and logs.'}
)
result_dir: Optional[str] = field(
default=None,
metadata={'help': 'The directory relative to output_dir for saving results.'}
)
tasks: List[str] = field(
default_factory=lambda: ['longbook_qa_eng', 'longbook_sum_eng'],
metadata={'help': 'Which dataset to evaluate?'}
)
prompt_template: str = field(
default="mistral",
metadata={'help': 'Which prompt template to use? (See infbench_utils.py for reference.)'}
)
max_length: int = field(
default=128000,
metadata={'help': 'Max input length.'}
)
truncate_from_middle: bool = field(
default=True,
metadata={'help': 'Truncate inputs from the middle.'}
)
load_result: bool = field(
default=False,
metadata={'help': 'Load result from saved files?'}
)
do_sample: bool = False
def process_infbench(data, indices, tokenizer, chat_template, task:str, prompt_template:str="mistral", max_length=100000, truncate_from_middle=True):
outputs = {'input_ids': [], 'attention_mask': [], "index": [], "answer": []}
    # NOTE: recent versions of `datasets` wrap batched samples in LazyBatch, which cannot be converted back to a list of dicts directly, so convert via a DataFrame first
data = pd.DataFrame(dict(data)).to_dict(orient="records")
for sample, index in zip(data, indices):
prompt = create_prompt(sample, task, prompt_template)
answer = get_answer(sample, task)
if truncate_from_middle:
tokenized_prompt = tokenizer.encode(prompt, add_special_tokens=False)
if len(tokenized_prompt) > max_length:
half = int(max_length / 2)
prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True) + tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True)
else:
tokenized_prompt = tokenizer.encode(prompt, add_special_tokens=False)
prompt = tokenizer.decode(tokenized_prompt[-max_length:], skip_special_tokens=True)
encoded = apply_chat_template(
chat_template,
messages=[{'role': 'user', 'content': prompt}],
tokenizer=tokenizer,
add_generation_prompt=True,
).encoded
outputs["input_ids"].append(encoded["input_ids"])
outputs["attention_mask"].append(encoded["attention_mask"])
outputs["index"].append(index)
outputs["answer"].append(answer)
return outputs
@torch.no_grad()
def main():
parser = HfArgumentParser([Args])
args = parser.parse_args_into_dataclasses()[0]
accelerator = Accelerator(cpu=args.cpu)
model, tokenizer = get_model_and_tokenizer(args, device=accelerator.device)
if args.tasks == ["all"]:
tasks = list(TASK_TO_PATH.keys())
else:
tasks = args.tasks
with accelerator.main_process_first():
all_datasets = {}
for task in tasks:
process_fn = partial(
process_infbench,
tokenizer=tokenizer,
chat_template=args.chat_template,
max_length=args.max_length,
task=task,
prompt_template=args.prompt_template,
truncate_from_middle=args.truncate_from_middle,
)
path = os.path.join(args.eval_data, TASK_TO_PATH[task])
raw_dataset = datasets.load_dataset("json", data_files=path, cache_dir=args.dataset_cache_dir, split="train")
dataset = raw_dataset.map(process_fn, batched=True, num_proc=32, batch_size=10, with_indices=True, remove_columns=raw_dataset.column_names)
all_datasets[task] = dataset
    result_dir = os.path.join(args.output_dir, args.result_dir) if args.result_dir is not None else args.output_dir
metrics = {}
for i, (task, dataset) in enumerate(all_datasets.items()):
if accelerator.process_index == 0:
logger.info(f"Evaluating {task} ({i + 1} / {len(all_datasets)})...")
result_path = os.path.join(result_dir, f"{task}.json")
# get answers in advance
labels = dataset["answer"]
dataset = dataset.remove_columns(["answer"])
if not (args.load_result and os.path.exists(result_path)):
data_collator = DefaultDataCollator(tokenizer=tokenizer)
dataloader = DataLoader(
dataset,
batch_size=args.batch_size,
collate_fn=data_collator,
                # pin memory only when a GPU is used
pin_memory=not args.cpu,
)
# NOTE: prepare dataloader so the data moves to GPU automatically
dataloader = accelerator.prepare(dataloader)
indices = []
preds = []
max_new_tokens = TASK_TO_MAX_NEW_TOKENS[task]
for j, x in enumerate(tqdm(dataloader, desc="Generating")):
index = x.pop("index").tolist()
input_length = x["input_ids"].shape[1]
# NOTE: important to reset memory for every batch
if hasattr(model, "memory"):
model.memory.reset()
output = model.generate(
**x,
max_new_tokens=max_new_tokens,
)
if isinstance(output, torch.Tensor):
# 1, max_new_tokens
output = output[:, input_length:]
output = tokenizer.batch_decode(output, skip_special_tokens=True)
elif isinstance(output, list):
pass
if accelerator.num_processes > 1:
output = accelerator.gather_for_metrics(output)
index = accelerator.gather_for_metrics(index)
if accelerator.process_index == 0:
preds.extend(output)
indices.extend(index)
else:
if accelerator.process_index == 0:
preds = []
indices = []
with open(result_path, "r", encoding="utf-8") as f:
# the first line is metric
f.readline()
for line in f:
item = json.loads(line)
preds.append(item["pred"])
indices.append(len(indices))
if accelerator.process_index == 0:
scores = []
for label, pred in tqdm(zip(labels, preds)):
# NOTE: here we explicitly input model_name=None
score = get_score_one(pred, label, task, None)
scores.append(score)
score = round(sum(scores) / len(scores), 4)
logger.info(f"{task}: {score}")
metrics[task] = score
with open(makedirs(result_path), "w", encoding="utf-8") as f:
f.write(json.dumps(score, ensure_ascii=False) + "\n")
for index, pred, label in zip(indices, preds, labels):
item = {
"index": index,
"pred": pred,
"label": label,
}
f.write(json.dumps(item, ensure_ascii=False) + "\n")
if accelerator.process_index == 0:
# save config
args.save(os.path.join(result_dir, "config.json"))
avg = round(sum(metrics.values()) / len(metrics), 4)
metrics["avg"] = avg
file_logger = FileLogger(makedirs(os.path.join(args.output_dir, "metrics.log")))
file_logger.log(metrics, Args=asdict(args))
if __name__ == "__main__":
main()
import os
import datasets
import time
import torch
from typing import Optional
from collections import defaultdict
from dataclasses import dataclass, field, asdict
from accelerate import Accelerator
from transformers import HfArgumentParser
from torch.utils.data import DataLoader
from src import ModelArgs, DefaultDataCollator, FileLogger, get_model_and_tokenizer, makedirs, split_file_dir_name_ext, evaluate_perplexity
@dataclass
class Args(ModelArgs):
eval_data: str = field(
default="long-llm:lm/pg19.json",
metadata={'help': 'The evaluation json data path.'}
)
output_dir: str = field(
default="data/results/lm/",
metadata={'help': 'Output directory for results and logs.'}
)
retokenize: bool = field(
default=False,
metadata={'help': 'Retokenize the corpus?'}
)
padding_side: str = field(
default="right",
metadata={'help': 'Which side to pad?'}
)
stride: int = field(
default=2048,
metadata={'help': 'Streaming stride when evaluating perplexity.'}
)
max_sample_num: int = field(
default=100,
metadata={'help': 'How many samples to evaluate in eval_data?'}
)
min_length: Optional[int] = field(
default=None,
metadata={'help': 'Minimum length for input_ids.'}
)
def process_lm_pre(tokenizer, tokenize_max_char=None):
def _process(data):
outputs = {'input_ids': []}
for text in data['text']:
if tokenize_max_char is not None:
text = text[:tokenize_max_char]
outputs['input_ids'].append(tokenizer.encode(text, add_special_tokens=False))
return outputs
return _process
def process_lm(tokenizer, max_length=4096, stride=1024, min_length=None):
# stride=0 indicates we just use one forward pass with max_length for each text
if stride == 0:
stride = max_length
jump = True
else:
jump = False
test = tokenizer.encode("test")
has_bos = False
if test[0] == tokenizer.bos_token_id:
# NOTE: subtract 1 because it will be occupied by the bos token
max_length -= 1
has_bos = True
def _process(data, indices, **kwds):
outputs = defaultdict(list)
for text, index in zip(data["text"], indices):
input_ids = tokenizer.encode(text, add_special_tokens=False)
seq_len = len(input_ids)
prev_end_loc = 0
if min_length is not None and seq_len < min_length:
continue
for start_loc in range(0, seq_len, stride):
end_loc = min(start_loc + max_length, seq_len)
sub_seq_len = end_loc - start_loc
sub_trg_len = end_loc - prev_end_loc # may be different from stride on last loop
sub_input_ids = input_ids[start_loc: end_loc]
sub_attention_mask = [1 for _ in range(sub_seq_len)]
if has_bos:
sub_input_ids.insert(0, tokenizer.bos_token_id)
sub_attention_mask.insert(0, 1)
sub_seq_len += 1
sub_labels = sub_input_ids.copy()
sub_labels[:-sub_trg_len] = [-100 for _ in range(sub_seq_len - sub_trg_len)]
sub_inputs = {
"index": index,
"input_ids": sub_input_ids,
"attention_mask": sub_attention_mask,
"labels": sub_labels,
}
for k, v in sub_inputs.items():
outputs[k].append(v)
prev_end_loc = end_loc
# NOTE: when end_loc is just the same as seq_len, jump out
if end_loc == seq_len or jump:
break
return outputs
return _process
@torch.no_grad()
def main():
parser = HfArgumentParser([Args])
args: Args = parser.parse_args_into_dataclasses()[0]
    accelerator = Accelerator(cpu=args.cpu)
model, tokenizer = get_model_and_tokenizer(args, device=accelerator.device)
_, dataset_name, _ = split_file_dir_name_ext(args.eval_data)
process_fn = process_lm(tokenizer, max_length=args.max_length, stride=args.stride, min_length=args.min_length)
dataset = datasets.load_dataset("json", data_files=args.eval_data, cache_dir=args.dataset_cache_dir, split="train")
if len(dataset) > args.max_sample_num:
# slice out the first max_sample_num samples
dataset = dataset.train_test_split(args.max_sample_num, shuffle=False)["test"]
dataset = dataset.map(process_fn, batched=True, num_proc=32, remove_columns=dataset.column_names, keep_in_memory=True, with_indices=True)
data_collator = DefaultDataCollator(tokenizer=tokenizer)
dataloader = DataLoader(
dataset,
batch_size=args.batch_size,
collate_fn=data_collator,
        # pin memory only when a GPU is used
pin_memory=not args.cpu,
)
accelerator.wait_for_everyone()
# NOTE: prepare dataloader so the data moves to GPU automatically
dataloader = accelerator.prepare(dataloader)
t1 = time.time()
perplexity = evaluate_perplexity(model, dataloader, accelerator)
t2 = time.time()
memory = torch.cuda.max_memory_allocated() / 1024**2
metrics = {"perplexity": perplexity, "time": round((t2 - t1) / len(dataset), 4), "memory": memory}
if accelerator.process_index == 0:
log_path = os.path.join(args.output_dir, f"{dataset_name}.log")
file_logger = FileLogger(makedirs(log_path))
file_logger.log(metrics, Args=asdict(args))
if __name__ == "__main__":
main()
import os
import datasets
import json
import torch
from tqdm import tqdm
from typing import Optional, Dict, List
from functools import partial
from collections import defaultdict
from dataclasses import dataclass, field, asdict
from accelerate import Accelerator
from transformers import HfArgumentParser
from transformers.utils import logging
from torch.utils.data import DataLoader
from src import ModelArgs, DefaultDataCollator, FileLogger, get_model_and_tokenizer, makedirs, apply_chat_template
from .longbench_utils import DATASET2PROMPT, DATASET2MAXNEWTOKENS, DATASET2CATEGORY, scorer
logger = logging.get_logger(__name__)
@dataclass
class Args(ModelArgs):
eval_data: str = field(
default="long-llm:longbench/",
metadata={'help': 'The evaluation json data path.'}
)
output_dir: str = field(
default="data/results/longbench/",
metadata={'help': 'The base directory for saving results and logs.'}
)
result_dir: Optional[str] = field(
default=None,
metadata={'help': 'The directory relative to output_dir for saving results.'}
)
tasks: List[str] = field(
default_factory=lambda: ['narrativeqa', 'qasper', 'multifieldqa_en', 'hotpotqa', '2wikimqa', 'musique', 'gov_report', 'qmsum', 'multi_news', 'trec', 'triviaqa', 'samsum', 'lcc', 'repobench-p'],
metadata={'help': 'Which dataset to evaluate?'}
)
newline_as_eos: bool = field(
default=True,
metadata={'help': 'Whether to use new line as eos (for QA tasks only) or not.'}
)
max_length: int = field(
default=31500,
metadata={'help': 'Max input length.'}
)
truncate_from_middle: bool = field(
default=True,
metadata={'help': 'Truncate inputs from the middle.'}
)
load_result: bool = field(
default=False,
metadata={'help': 'Load result from saved files?'}
)
do_sample: bool = False
def process_longbench(data, indices, tokenizer, chat_template, task, max_length=3500, truncate_from_middle=True):
outputs = {'input_ids': [], 'attention_mask': [], "index": []}
for input, context, index in zip(data['input'], data['context'], indices):
prompt_template = DATASET2PROMPT[task]
prompt = prompt_template.format(input=input, context=context)
if truncate_from_middle:
tokenized_prompt = tokenizer.encode(prompt)
if len(tokenized_prompt) > max_length:
half = int(max_length / 2)
prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True) + tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True)
else:
tokenized_prompt = tokenizer.encode(prompt)
prompt = tokenizer.decode(tokenized_prompt[-max_length:], skip_special_tokens=True)
# in fewshot learning and code completion we do not need chat template
if not any(x in DATASET2CATEGORY[task] for x in ["Few-Shot Learning", "Code Completion"]):
encoded = apply_chat_template(
chat_template,
messages=[{'role': 'user', 'content': prompt}],
tokenizer=tokenizer,
add_generation_prompt=True,
).encoded
else:
encoded = tokenizer(prompt)
outputs["input_ids"].append(encoded["input_ids"])
outputs["attention_mask"].append(encoded["attention_mask"])
outputs["index"].append(index)
return outputs
@torch.no_grad()
def main():
parser = HfArgumentParser([Args])
args = parser.parse_args_into_dataclasses()[0]
accelerator = Accelerator(cpu=args.cpu)
model, tokenizer = get_model_and_tokenizer(args, device=accelerator.device)
if hasattr(model, "generation_config"):
eos_token_id = model.generation_config.eos_token_id
else:
eos_token_id = tokenizer.eos_token_id
    if isinstance(eos_token_id, int):
        eos_token_id = [eos_token_id]
    else:
        # copy so we do not mutate the model's generation_config in place
        eos_token_id = list(eos_token_id)
    # stop generation for QA tasks when \n appears
    if args.newline_as_eos:
        eos_token_id.append(tokenizer.encode("\n", add_special_tokens=False)[-1])
if args.tasks == ["all"]:
tasks = list(DATASET2PROMPT.keys())
else:
tasks = args.tasks
with accelerator.main_process_first():
all_datasets = {}
for task in tasks:
process_fn = partial(
process_longbench,
tokenizer=tokenizer,
chat_template=args.chat_template,
task=task,
max_length=args.max_length,
truncate_from_middle=args.truncate_from_middle,
)
path = os.path.join(args.eval_data, f"{task}.jsonl")
raw_dataset = datasets.load_dataset("json", data_files=path, cache_dir=args.dataset_cache_dir, split="train")
dataset = raw_dataset.map(process_fn, batched=True, num_proc=32, batch_size=10, with_indices=True, remove_columns=raw_dataset.column_names)
all_datasets[task] = (raw_dataset, dataset)
    result_dir = os.path.join(args.output_dir, args.result_dir) if args.result_dir is not None else args.output_dir
metrics = {}
for i, task in enumerate(all_datasets.keys()):
if accelerator.process_index == 0:
logger.info(f"Evaluating {task} ({i + 1} / {len(all_datasets)})...")
result_path = os.path.join(result_dir, f"{task}.json")
raw_dataset, dataset = all_datasets[task]
if not (args.load_result and os.path.exists(result_path)):
data_collator = DefaultDataCollator(tokenizer=tokenizer)
dataloader = DataLoader(
dataset,
batch_size=args.batch_size,
collate_fn=data_collator,
                # pin memory only when a GPU is used
pin_memory=not args.cpu,
)
dataloader = accelerator.prepare(dataloader)
indices = []
preds = []
max_new_tokens = DATASET2MAXNEWTOKENS[task]
            for j, x in enumerate(tqdm(dataloader, desc="Generating")):
index = x.pop("index").tolist()
input_length = x["input_ids"].shape[1]
# NOTE: important to reset memory for every batch
if hasattr(model, "memory"):
model.memory.reset()
kwargs = {"max_new_tokens": max_new_tokens}
if task in ["2wikimqa", "hotpotqa", "musique", "multifieldqa_en", "qasper", "narrativeqa", "samsum"]:
kwargs["eos_token_id"] = eos_token_id
# NOTE: very important to include \n as an eos token for QA tasks, otherwise the F1 score is devastating
output = model.generate(
**x,
**kwargs
)
if isinstance(output, torch.Tensor):
# 1, max_new_tokens
output = output[:, input_length:]
output = tokenizer.batch_decode(output, skip_special_tokens=True)
elif isinstance(output, list):
pass
if accelerator.num_processes > 1:
output = accelerator.gather_for_metrics(output)
index = accelerator.gather_for_metrics(index)
if accelerator.process_index == 0:
preds.extend(output)
indices.extend(index)
else:
if accelerator.process_index == 0:
preds = []
indices = []
with open(result_path, "r", encoding="utf-8") as f:
# the first line is the metric score
f.readline()
for line in f:
item = json.loads(line)
preds.append(item["pred"])
indices.append(len(indices))
if accelerator.process_index == 0:
answers = raw_dataset["answers"]
lengths = raw_dataset["length"]
all_classes = raw_dataset["all_classes"][0]
score = scorer(task, preds, answers, all_classes)
logger.info(f"{task}: {score}")
metrics[task] = score
with open(makedirs(result_path), "w", encoding="utf-8") as f:
f.write(json.dumps(score, ensure_ascii=False) + "\n")
for index, pred in zip(indices, preds):
sample = raw_dataset[index]
del sample["all_classes"]
del sample["context"]
del sample["language"]
del sample["_id"]
sample["pred"] = pred
f.write(json.dumps(sample, ensure_ascii=False) + "\n")
if accelerator.process_index == 0:
# save config
args.save(os.path.join(result_dir, "config.json"))
# compute category score
category_metrics = defaultdict(list)
for dataset, metric in metrics.items():
category = DATASET2CATEGORY[dataset]
category_metrics[category].append(metric)
for k, v in category_metrics.items():
# when evaluating on longbench_e, each metric is a dict of float
if isinstance(v[0], dict):
category_metric = {}
for kk in v[0].keys():
vv = [v[j][kk] for j in range(len(v))]
category_metric[kk] = round(sum(vv) / len(vv), 2)
category_metrics[k] = category_metric
else:
category_metrics[k] = round(sum(v) / len(v), 2)
# compute average score
if isinstance(next(iter(metrics.values())), dict):
avg = defaultdict(list)
for k, v in metrics.items():
for kk, vv in v.items():
avg[kk].append(vv)
for k, v in avg.items():
avg[k] = round(sum(v) / len(v), 2)
else:
avg = round(sum(metrics.values()) / len(metrics), 2)
metrics["avg"] = avg
file_logger = FileLogger(makedirs(os.path.join(args.output_dir, "metrics.log")))
file_logger.log(metrics, Args=asdict(args), Category_Metrics=category_metrics)
if __name__ == "__main__":
main()
import os
import copy
import json
import datasets
from typing import List, Optional, Union, Mapping
from accelerate import Accelerator
from torch.utils.data import DataLoader
from transformers import HfArgumentParser
from transformers.utils import logging
from dataclasses import dataclass, field
from collections import defaultdict
from src import ModelArgs, DefaultDataCollator, FileLogger, get_model_and_tokenizer, makedirs, apply_chat_template, evaluate_nll  # remove_eos is defined locally below
logger = logging.get_logger(__name__)
SUBJECT_2_CATEGORY={"abstract_algebra": "STEM", "anatomy": "others", "astronomy": "STEM", "business_ethics": "others", "clinical_knowledge": "others", "college_biology": "STEM", "college_chemistry": "STEM", "college_computer_science": "STEM", "college_mathematics": "STEM", "college_medicine": "others", "college_physics": "STEM", "computer_security": "STEM", "conceptual_physics": "STEM", "econometrics": "Social Sciences", "electrical_engineering": "STEM", "elementary_mathematics": "STEM", "formal_logic": "Humanities", "global_facts": "others", "high_school_biology": "STEM", "high_school_chemistry": "STEM", "high_school_computer_science": "STEM", "high_school_european_history": "Humanities", "high_school_geography": "Social Sciences", "high_school_government_and_politics": "Social Sciences", "high_school_macroeconomics": "Social Sciences", "high_school_mathematics": "STEM", "high_school_microeconomics": "Social Sciences", "high_school_physics": "STEM", "high_school_psychology": "Social Sciences", "high_school_statistics": "STEM", "high_school_us_history": "Humanities", "high_school_world_history": "Humanities", "human_aging": "others", "human_sexuality": "Social Sciences", "international_law": "Humanities", "jurisprudence": "Humanities", "logical_fallacies": "Humanities", "machine_learning": "STEM", "management": "others", "marketing": "others", "medical_genetics": "others", "miscellaneous": "others", "moral_disputes": "Humanities", "moral_scenarios": "Humanities", "nutrition": "others", "philosophy": "Humanities", "prehistory": "Humanities", "professional_accounting": "others", "professional_law": "Humanities", "professional_medicine": "others", "professional_psychology": "Social Sciences", "public_relations": "Social Sciences", "security_studies": "Social Sciences", "sociology": "Social Sciences", "us_foreign_policy": "Social Sciences", "virology": "others", "world_religions": "Humanities"}
@dataclass
class Args(ModelArgs):
eval_data: str = field(
default="long-llm:mmlu/test.json",
metadata={'help': 'The evaluation json data path.'}
)
output_dir: str = field(
default="data/results/mmlu/",
metadata={'help': 'The base directory for saving results and logs.'}
)
result_dir: Optional[str] = field(
default=None,
metadata={'help': 'The directory relative to output_dir for saving results.'}
)
batch_size: int = field(
default=8,
metadata={'help': 'Batch size.'}
)
few_shot: int = field(
default=0,
metadata={'help': 'How many few shot train samples?'},
)
train_data: str = field(
default="long-llm:mmlu/dev.json",
metadata={'help': 'Path to the file containing training examples.'}
)
def remove_eos(inputs: Mapping, eos_token_ids: Union[List,int]):
if isinstance(eos_token_ids, int):
eos_token_ids = [eos_token_ids]
input_ids = inputs["input_ids"]
eos_idx = [i for i, x in enumerate(input_ids) if x in eos_token_ids]
if len(eos_idx):
eos_idx = eos_idx[-1]
else:
return inputs
for k, v in inputs.items():
inputs[k].pop(eos_idx)
return inputs
def process_mmlu(tokenizer, chat_template, eos_token_id, few_shot=0, train_data=None, cache_dir=None):
if few_shot > 0:
assert train_data is not None
train_data = datasets.load_dataset("json", data_files=train_data, cache_dir=cache_dir, split="train")
train_df = train_data.to_pandas()
# transform the dataframe into dict of dataframes
train_df = {k: v[:few_shot] for k, v in train_df.groupby("subject")}
options = ['A', 'B', 'C', 'D']
def _prepare_sample(query, choices, answer:str=None):
"""
<Question>
A. <Choices 1>
B. <Choices 2>
C. <Choices 3>
D. <Choices 4>
Answer: <Answer>
"""
        # answer may be an int or numpy int64
if answer is not None and not isinstance(answer, str):
answer = options[answer]
option_components = []
for option, choice in zip(options, choices):
option_components.append(f'{option}. {choice}')
option_string = "\n".join(option_components)
if answer is None:
sample = f"{query}\n{option_string}\nAnswer:"
else:
sample = f"{query}\n{option_string}\nAnswer: {answer}"
return sample
def _process(data, indices):
"""Yield key and query with a prompt template"""
outputs = {"input_ids": [], "attention_mask": [], "labels": [], "index": []}
for index, query, subject, choices, answer in zip(indices, data["query"], data["subject"], data["choices"], data["answer"]):
query = query.strip()
head = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n"
if few_shot > 0:
train_samples = ""
for i in range(few_shot):
if i >= len(train_df[subject]):
break
train_sample = train_df[subject].iloc[i][['query', 'choices', 'answer']]
train_sample = _prepare_sample(**train_sample) + "\n\n"
train_samples += train_sample
else:
train_samples = ""
for option in options:
prompt = head + train_samples + _prepare_sample(query, choices)
answer = option
encoded = apply_chat_template(
chat_template,
[{"role": "user", "content": prompt}, {"role": "assistant", "content": answer}],
tokenizer=tokenizer,
return_labels=True
).encoded
encoded = remove_eos(encoded, eos_token_id)
encoded["index"] = index
for k, v in encoded.items():
outputs[k].append(v)
return outputs
return _process
def evaluate_mmlu(eval_data, save_path, eval_preds):
makedirs(save_path)
tasks = defaultdict(list)
samples = {}
with open(eval_data) as f:
for line in f:
sample = json.loads(line.strip())
samples[sample["query_id"]] = sample
with open(makedirs(save_path), "w") as f:
for k, v in eval_preds.items():
output = min(enumerate(v), key=lambda x: x[1])[0]
sample = samples[k]
sample["output"] = output
tasks[sample["subject"]].append((output, sample["answer"]))
f.write(json.dumps(sample, ensure_ascii=False) + "\n")
metrics = defaultdict(list)
for task_name, task_eval_preds in tasks.items():
accuracy = 0
for pred, label in task_eval_preds:
accuracy += int(pred == label)
accuracy /= len(task_eval_preds)
category = SUBJECT_2_CATEGORY[task_name]
metrics[f"{category}"].append(accuracy)
metrics["all"].append(accuracy)
for k, v in metrics.items():
metrics[k] = sum(v) / len(v)
# for printing
metrics = {
"STEM": metrics["STEM"],
"Social Sciences": metrics["Social Sciences"],
"Humanities": metrics["Humanities"],
"Others": metrics["others"],
"All": metrics["all"],
}
return dict(metrics)
def main():
parser = HfArgumentParser([Args])
args = parser.parse_args_into_dataclasses()[0]
accelerator = Accelerator(cpu=args.cpu)
model, tokenizer = get_model_and_tokenizer(args, device=accelerator.device)
    result_dir = os.path.join(args.output_dir, args.result_dir) if args.result_dir is not None else args.output_dir
eval_data = args.eval_data
with accelerator.main_process_first():
dataset = datasets.load_dataset("json", data_files=eval_data, split="train", cache_dir=args.dataset_cache_dir)
dataset = dataset.map(process_mmlu(
tokenizer,
chat_template=args.chat_template,
# strip eos
eos_token_id=model.generation_config.eos_token_id,
few_shot=args.few_shot,
train_data=args.train_data,
cache_dir=args.dataset_cache_dir,
), remove_columns=dataset.column_names, batched=True, num_proc=32, with_indices=True)
data_collator = DefaultDataCollator(tokenizer=tokenizer)
dataloader = DataLoader(
dataset,
batch_size=args.batch_size,
collate_fn=data_collator,
pin_memory=True,
)
dataloader = accelerator.prepare(dataloader)
# a dict, key is index, value is negative log likelihood of the answer
outputs = evaluate_nll(model, dataloader, accelerator)
if accelerator.process_index == 0:
file_logger = FileLogger(makedirs(os.path.join(args.output_dir, "metrics.log")))
metrics = evaluate_mmlu(eval_data, os.path.join(result_dir, "results.json"), outputs)
# save config
args.save(os.path.join(result_dir, "config.json"))
file_logger.log(metrics, Args=args.to_dict())
if __name__ == "__main__":
main()
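A hypothetical invocation; the module path `main.eval_mmlu` is assumed from the repository layout, and `--few_shot 5` selects the common 5-shot MMLU setup using the dev-set examples configured above:

```bash
torchrun --nproc_per_node 8 -m main.eval_mmlu \
    --model_name_or_path namespace-Pt/beacon-qwen-2-7b-instruct \
    --few_shot 5
```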
import os
import json
import torch
import datasets
from rouge import Rouge
from tqdm import tqdm
from typing import List, Optional
from accelerate import Accelerator
from transformers import HfArgumentParser
from transformers.utils import logging
from torch.utils.data import DataLoader
from dataclasses import dataclass, field, asdict
from collections import defaultdict
from functools import partial
from src import ModelArgs, DefaultDataCollator, FileLogger, get_model_and_tokenizer, makedirs, split_file_dir_name_ext, apply_chat_template, normalize_text
from .longbench_utils import qa_f1_score
logger = logging.get_logger(__name__)
@dataclass
class Args(ModelArgs):
eval_data: str = field(
default="long-llm:memgpt/msc.json",
metadata={'help': 'Evaluation json data.'}
)
output_dir: str = field(
default="data/results/msc/",
metadata={'help': 'The base directory for saving results and logs.'}
)
result_dir: Optional[str] = field(
default=None,
metadata={'help': 'The directory relative to output_dir for saving results.'}
)
chat_template: str = field(
default='no'
)
    max_length: Optional[int] = field(
default=None
)
do_sample: bool = False
max_new_tokens: int = 20
def process_msc(data, tokenizer, max_length, chat_template):
outputs = {'input_ids': [], 'attention_mask': [], 'target': []}
for context, input_, output in zip(data['context'], data['input'], data['output']):
prompt = context + "\n" + input_
if max_length is not None:
prompt = tokenizer.decode(tokenizer.encode(prompt, add_special_tokens=False)[-max_length:])
encoded = apply_chat_template(chat_template, [{'role': 'user', 'content': prompt}], tokenizer=tokenizer, add_generation_prompt=True).encoded
encoded["target"] = output
for k, v in encoded.items():
outputs[k].append(v)
return outputs
@torch.no_grad()
def main():
parser = HfArgumentParser([Args])
args: Args = parser.parse_args_into_dataclasses()[0]
accelerator = Accelerator(cpu=args.cpu)
model, tokenizer = get_model_and_tokenizer(args, device=accelerator.device)
with accelerator.main_process_first():
process_fn = partial(process_msc, tokenizer=tokenizer, chat_template=args.chat_template, max_length=args.max_length)
raw_dataset = datasets.load_dataset("json", data_files=args.eval_data, cache_dir=args.dataset_cache_dir, split="train")
dataset = raw_dataset.map(process_fn, batched=True, num_proc=32, remove_columns=raw_dataset.column_names)
data_collator = DefaultDataCollator(tokenizer=tokenizer)
results = []
all_targets = dataset["target"]
dataset = dataset.remove_columns(["target"])
dataloader = DataLoader(
dataset,
batch_size=args.batch_size,
collate_fn=data_collator,
        # pin memory only when a GPU is used
pin_memory=not args.cpu,
)
if not args.enable_tp:
# NOTE: prepare model only once
if len(accelerator._models) == 0:
model, dataloader = accelerator.prepare(model, dataloader)
model = accelerator.unwrap_model(model)
else:
dataloader = accelerator.prepare(dataloader)
else:
# NOTE: prepare dataloader so the data moves to GPU automatically
dataloader = accelerator.prepare(dataloader)
all_outputs = []
for i, x in enumerate(tqdm(dataloader)):
# NOTE: important to reset memory for every batch
if hasattr(model, "memory"):
model.memory.reset()
output = model.generate(**x)
if isinstance(output, torch.Tensor):
# 1, max_new_tokens
output = output[:, x['input_ids'].shape[1]:]
output = tokenizer.batch_decode(output, skip_special_tokens=True)
elif isinstance(output, list):
pass
if accelerator.num_processes > 1:
output = accelerator.gather_for_metrics(output)
all_outputs.extend(output)
if accelerator.process_index == 0:
rouge = Rouge()
score = rouge.get_scores(normalize_text(all_outputs), normalize_text(all_targets), avg=True)["rouge-l"]["r"]
for output, target in zip(all_outputs, all_targets):
results.append({"target": target, "prediction": output})
result_dir = os.path.join(args.output_dir, args.result_dir) if args.result_dir is not None else args.output_dir
with open(makedirs(os.path.join(result_dir, "results.json")), "w", encoding='utf-8') as f:
json.dump(results, f)
# also save config
args.save(os.path.join(result_dir, "config.json"))
file_logger = FileLogger(makedirs(os.path.join(args.output_dir, "metrics.log")))
file_logger.log({'rouge': score}, Args=asdict(args))
if __name__ == "__main__":
main()
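A hypothetical invocation of this MSC (multi-session chat) evaluator; the module path `main.eval_msc` is assumed, while the beacon flags mirror the Evaluation section above:

```bash
torchrun --nproc_per_node 8 -m main.eval_msc \
    --model_name_or_path namespace-Pt/beacon-qwen-2-7b-instruct \
    --enable_beacon --beacon_ratio_mix adapt-1024
```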
import os
import torch
import time
import datasets
from typing import List, Optional
from accelerate import Accelerator
from transformers import HfArgumentParser
from transformers.utils import logging
from torch.utils.data import DataLoader
from dataclasses import dataclass, field, asdict
from functools import partial
from src.data import Data
from src.metrics import Metric
from src import ModelArgs, DefaultDataCollator, FileLogger, get_model_and_tokenizer, makedirs, evaluate_perplexity, split_file_dir_name_ext, apply_chat_template
logger = logging.get_logger(__name__)
@dataclass
class Args(ModelArgs):
eval_data: Optional[str] = field(
default="long-llm:sharegpt/3-turn.json",
metadata={'help': 'Evaluation json data.'}
)
output_dir: str = field(
default="data/results/multiturn/",
metadata={'help': 'The base directory for saving results and logs.'}
)
min_length: int = field(
default=0,
metadata={'help': 'How many tokens at minimum for evaluation?'}
)
    max_length: int = field(
        default=100000,
        metadata={'help': 'How many tokens at maximum for evaluation?'}
    )
num_turn: int = field(
default=3,
metadata={'help': 'How many turns?'}
)
breakdown: bool = field(
default=False,
)
def process_multiturn(data, indices, tokenizer, chat_template, min_length, max_length, num_turn=None, breakdown=False):
outputs = {'input_ids': [], 'attention_mask': [], "labels": [], "length": [], "index": []}
# accumulative
if breakdown:
for i, source in enumerate(data['accum_conversations']):
# break the multi-turn conversation
if num_turn is None:
num_turn = len(source) // 2
# skip conversations that do not have enough turns
if num_turn * 2 > len(source):
continue
for j in range(0, 2 * num_turn, 2):
turn_source = source[j: j + 2]
encoded = apply_chat_template(
chat_template,
turn_source,
tokenizer=tokenizer,
return_labels=True,
).encoded
                # skip samples whose length does not fall between min_length and max_length
if min_length is not None and len(encoded["input_ids"]) < min_length:
continue
if max_length is not None and len(encoded["input_ids"]) > max_length:
continue
for k, v in encoded.items():
outputs[k].append(v)
outputs['length'].append(len(encoded['input_ids']))
# NOTE: the breakdown conversations belong to the same root
outputs['index'].append(indices[i])
return outputs
else:
for i, source in enumerate(data['conversations']):
if num_turn is not None:
source = source[:2 * num_turn]
encoded = apply_chat_template(
chat_template,
source,
tokenizer=tokenizer,
return_labels=True,
).encoded
            # skip samples whose length does not fall between min_length and max_length
if min_length is not None and len(encoded["input_ids"]) < min_length:
continue
if max_length is not None and len(encoded["input_ids"]) > max_length:
continue
for k, v in encoded.items():
outputs[k].append(v)
outputs['length'].append(len(encoded['input_ids']))
outputs['index'].append(indices[i])
return outputs
@torch.no_grad()
def main():
parser = HfArgumentParser([Args])
args: Args = parser.parse_args_into_dataclasses()[0]
accelerator = Accelerator(cpu=args.cpu)
model, tokenizer = get_model_and_tokenizer(args, device=accelerator.device)
with accelerator.main_process_first():
raw_dataset = datasets.load_dataset("json", data_files=args.eval_data, split="train", cache_dir=args.dataset_cache_dir)
process_fn = partial(
process_multiturn,
tokenizer=tokenizer,
chat_template=args.chat_template,
max_length=args.max_length,
min_length=args.min_length,
num_turn=args.num_turn,
breakdown=args.breakdown,
)
dataset = raw_dataset.map(process_fn, batched=True, num_proc=32, batch_size=10, with_indices=True, remove_columns=raw_dataset.column_names)
# get labels (the target generation result)
data_collator = DefaultDataCollator(tokenizer=tokenizer)
dataloader = DataLoader(
dataset,
batch_size=args.batch_size,
collate_fn=data_collator,
        # pin memory only when a GPU is used
pin_memory=not args.cpu,
)
if not args.enable_tp:
model, dataloader = accelerator.prepare(model, dataloader)
# NOTE: unwrap because we just use the model for evaluation
model = accelerator.unwrap_model(model)
else:
# NOTE: prepare dataloader so the data moves to GPU automatically
dataloader = accelerator.prepare(dataloader)
accelerator.wait_for_everyone()
accelerator.print(dataset['index'])
t1 = time.time()
perplexity = evaluate_perplexity(model, dataloader, accelerator)
t2 = time.time()
t = [t2 - t1]
if accelerator.num_processes > 1:
t = accelerator.gather_for_metrics(t)
t = sum(t)
metrics = {"perplexity": perplexity, "time": round(t, 4)}
if accelerator.process_index == 0:
        log_path = os.path.join(args.output_dir, "metrics.log")
file_logger = FileLogger(makedirs(log_path))
file_logger.log(metrics, Args=asdict(args))
if __name__ == "__main__":
main()
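A hypothetical invocation of this multi-turn perplexity evaluator; the module path `main.eval_multiturn` is assumed, and `--num_turn`/`--breakdown` correspond to the `Args` fields above:

```bash
torchrun --nproc_per_node 8 -m main.eval_multiturn \
    --model_name_or_path namespace-Pt/beacon-qwen-2-7b-instruct \
    --num_turn 3 --breakdown
```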
import os
import math
import torch
import json
import datasets
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from rouge import Rouge
from glob import glob
from typing import List, Optional
from tqdm import tqdm
from accelerate import Accelerator
from transformers import HfArgumentParser
from transformers.utils import logging
from dataclasses import dataclass, field, asdict
from src import ModelArgs, DefaultDataCollator, FileLogger, get_model_and_tokenizer, makedirs, apply_chat_template
logger = logging.get_logger(__name__)
@dataclass
class Args(ModelArgs):
haystack_path: str = field(
default="long-llm:needle/PaulGrahamEssays",
metadata={'help': 'The context for evaluation.'}
)
output_dir: str = field(
default="data/results/needle/",
metadata={'help': 'The base directory for saving results and logs.'}
)
result_dir: Optional[str] = field(
default=None,
metadata={'help': 'The directory relative to output_dir for saving results.'}
)
min_length: int = field(
default=8192,
metadata={'help': 'Minimum context length in evaluation.'}
)
max_length: int = field(
default=131072,
metadata={'help': 'Maximum context length in evaluation.'}
)
num_length_interval: int = field(
default=10,
        metadata={'help': 'Number of intervals between min_length and max_length.'}
)
test_length: List[int] = field(
default=None,
metadata={'help': 'Specified evaluation lengths.'}
)
min_depth: float = field(
default=0,
metadata={'help': 'Minimum pass key depth in the context.'}
)
max_depth: float = field(
default=100,
metadata={'help': 'Maximum pass key depth in the context.'}
)
num_depth_interval: int = field(
default=10,
        metadata={'help': 'Number of intervals between min_depth and max_depth.'}
)
test_depth: List[int] = field(
default=None,
metadata={'help': 'Specified evaluation depths.'}
)
needle: str = field(
default="\n\nThe best thing to do in San Francisco is sitting in Dolores Park and eating a hamburg on a sunny day.\n\n",
metadata={'help': 'The needle content'}
)
prompt: str = field(
default='\n\nWhat is the best thing to do in San Francisco?\nAnswer:',
        metadata={'help': 'The prompt that asks about the needle.'}
)
gpt_eval: bool = field(
default=False,
metadata={'help': 'Use GPT4 to evaluate accuracy.'}
)
load_result: bool = field(
default=False,
metadata={'help': 'Load previous results?'}
)
do_sample: bool = False
max_new_tokens: int = 50
def __post_init__(self):
super().__post_init__()
self.haystack_path = self.resolve_path(self.haystack_path)
class OpenAIEvaluator:
DEFAULT_MODEL_KWARGS: dict = dict(temperature=0)
CRITERIA = {"accuracy": """
Score 1: The answer is completely unrelated to the reference.
Score 3: The answer has minor relevance but does not align with the reference.
Score 5: The answer has moderate relevance but contains inaccuracies.
Score 7: The answer aligns with the reference but has minor omissions.
Score 10: The answer is completely accurate and aligns perfectly with the reference.
Only respond with a numerical score."""}
def __init__(self,
model_name: str = "gpt-3.5-turbo-0125",
model_kwargs: dict = DEFAULT_MODEL_KWARGS,
true_answer: str = None,
question_asked: str = None):
"""
:param model_name: The name of the model.
:param model_kwargs: Model configuration. Default is {temperature: 0}
:param true_answer: The true answer to the question asked.
:param question_asked: The question asked to the model.
"""
from langchain_openai import ChatOpenAI
# from langchain_community.chat_models import ChatOpenAI
if (not true_answer) or (not question_asked):
raise ValueError("true_answer and question_asked must be supplied with init.")
self.model_name = model_name
self.model_kwargs = model_kwargs
self.true_answer = true_answer
self.question_asked = question_asked
api_key = os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("OPENAI_API_KEY must be set in the environment to use the OpenAI evaluator.")
proxy = os.getenv('http_proxy')
if proxy:
logger.info(f"Using proxy {proxy}...")
self.evaluator = ChatOpenAI(model=self.model_name,
openai_api_key=api_key,
openai_proxy=proxy,
**self.model_kwargs)
def evaluate_response(self, response: str) -> int:
from langchain.evaluation import load_evaluator
evaluator = load_evaluator(
"labeled_score_string",
criteria=self.CRITERIA,
llm=self.evaluator,
)
eval_result = evaluator.evaluate_strings(
            # The model's response
prediction=response,
# The actual answer
reference=self.true_answer,
# The question asked
input=self.question_asked,
)
return int(eval_result['score'])
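# A minimal usage sketch for OpenAIEvaluator (all values below are hypothetical;
# requires OPENAI_API_KEY in the environment):
#   evaluator = OpenAIEvaluator(true_answer="eating a hamburger in Dolores Park",
#                               question_asked="What is the best thing to do in San Francisco?")
#   score = evaluator.evaluate_response("Sit in Dolores Park and eat a hamburger.")
#   assert 1 <= score <= 10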
def generate_sample(
tokenizer,
chat_template,
context,
context_length,
needle_depth,
needle="\n\nThe best thing to do in San Francisco is sitting in Dolores Park and eating a hamburg on a sunny day.\n\n",
prompt='\n\nWhat is the best thing to do in San Francisco?\nAnswer:'
):
num_words = len(context.split())
if context_length > num_words:
context = context * math.ceil(context_length / num_words)
description = "There is an important infomation hidden in the following context. Find the information and memorize it. I will quiz you about the important information there.\n"
description_input_ids = tokenizer.encode(description, add_special_tokens=False)
needle_input_ids = tokenizer.encode(needle, add_special_tokens=False)
prompt_input_ids = tokenizer.encode(prompt, add_special_tokens=False)
description_length = len(description_input_ids)
needle_length = len(needle_input_ids)
prompt_length = len(prompt_input_ids)
# must leave room for information and prompt
minimum_pos = description_length
maximum_pos = context_length - prompt_length - needle_length - 1
if minimum_pos > context_length or maximum_pos < 0:
raise ValueError(f"The length {context_length} is too small. Please increase interval!")
needle_pos = minimum_pos + round((maximum_pos - minimum_pos) * needle_depth / 100)
context_input_ids = tokenizer.encode(context, max_length=context_length - description_length - needle_length - prompt_length, truncation=True, add_special_tokens=False)
input_ids = sum([description_input_ids, context_input_ids[:needle_pos], needle_input_ids, context_input_ids[needle_pos:], prompt_input_ids], [])
inputs = tokenizer.decode(input_ids)
inputs = apply_chat_template(chat_template, messages=[{'role': 'user', 'content': inputs}], tokenizer=tokenizer, add_generation_prompt=True).raw
return inputs, prompt, needle
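# The assembled token sequence is:
#   [description][context[:needle_pos]][needle][context[needle_pos:]][prompt]
# e.g. a hypothetical call with context_length=8192 and needle_depth=50 buries the
# needle roughly in the middle of the truncated context.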
@torch.no_grad()
def main():
parser = HfArgumentParser([Args])
args: Args = parser.parse_args_into_dataclasses()[0]
accelerator = Accelerator(cpu=args.cpu)
    result_dir = os.path.join(args.output_dir, args.result_dir) if args.result_dir is not None else args.output_dir
if args.load_result:
with open(makedirs(os.path.join(result_dir, "results.json")), "r", encoding='utf-8') as f:
results = json.load(f)
else:
model, tokenizer = get_model_and_tokenizer(args, device=accelerator.device)
if args.test_length is None:
test_lengths = np.linspace(args.min_length, args.max_length, args.num_length_interval, endpoint=True).astype(int).tolist()
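            # with the defaults this yields 10 evenly spaced lengths from 8192 to 131072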
else:
test_lengths = args.test_length
if args.test_depth is None:
test_depths = np.linspace(args.min_depth, args.max_depth, args.num_depth_interval, endpoint=True).astype(int).tolist()
else:
test_depths = args.test_depth
if os.path.isfile(args.haystack_path):
with open(args.haystack_path) as f:
context = f.read().strip()
elif os.path.isdir(args.haystack_path):
context = ""
num_tokens = 0
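            # keep appending essay files until the haystack has enough tokens to
            # cover the longest context length under test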
for file in glob(f"{args.haystack_path}/*.txt"):
with open(file, 'r') as f:
this_file_context = f.read()
num_tokens += len(tokenizer.encode(this_file_context, add_special_tokens=False))
context += this_file_context
if num_tokens > max(test_lengths):
break
else:
raise ValueError(f"Cannot find haystack: {args.haystack_path}")
all_inputs = []
for length in tqdm(test_lengths, desc="Constructing Data"):
for depth in test_depths:
inputs, prompt, needle = generate_sample(
tokenizer=tokenizer,
chat_template=args.chat_template,
context=context,
context_length=length,
needle_depth=depth,
needle=args.needle,
prompt=args.prompt
)
all_inputs.append({'inputs': inputs, 'prompt': prompt, 'needle': needle, 'length': length, 'depth': depth})
dataset = datasets.Dataset.from_list(all_inputs)
dataloader = torch.utils.data.DataLoader(
# length and depth are useless in forward computation
dataset.remove_columns(['length', 'depth', 'needle']),
batch_size=args.batch_size,
collate_fn=DefaultDataCollator(tokenizer),
pin_memory=not args.cpu,
)
# NOTE: prepare dataloader so the data moves to GPU automatically
dataloader = accelerator.prepare(dataloader)
accelerator.wait_for_everyone()
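        # after prepare(), each process consumes a distinct shard of the batches;
        # the per-process outputs are gathered back together below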
all_outputs = []
for x in tqdm(dataloader, desc="Evaluating"):
prompt = x.pop("prompt")
inputs = x.pop("inputs")
# TODO: retrieval
# NOTE: important to reset memory for every batch
if hasattr(model, "memory"):
model.memory.reset()
inputs = tokenizer(inputs, return_tensors="pt").to(model.device)
output = model.generate(**inputs)
if isinstance(output, torch.Tensor):
# 1, max_new_tokens
output = output[:, inputs['input_ids'].shape[1]:]
output = tokenizer.batch_decode(output, skip_special_tokens=True)
elif isinstance(output, list):
pass
if accelerator.num_processes > 1:
output = accelerator.gather_for_metrics(output)
all_outputs.extend(output)
if accelerator.process_index == 0:
results = {l: {d: [] for d in test_depths} for l in test_lengths}
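        # results[length][depth] collects {'target': needle, 'prediction': output} pairs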
all_lengths = dataset['length']
all_depths = dataset['depth']
all_needles = dataset['needle']
for l, d, n, o in zip(all_lengths, all_depths, all_needles, all_outputs):
results[l][d].append({'target': n, 'prediction': o})
with open(makedirs(os.path.join(result_dir, "results.json")), "w", encoding='utf-8') as f:
json.dump(results, f)
# also save config
args.save(os.path.join(result_dir, "config.json"))
if accelerator.process_index == 0:
rouge = Rouge()
rouge_score = {l: {d: [] for d in v.keys()} for l, v in results.items()}
if args.gpt_eval:
evaluator = OpenAIEvaluator(question_asked=args.prompt.strip(), true_answer=args.needle.strip())
gpt_score = {l: {d: [] for d in v.keys()} for l, v in results.items()}
for l, lv in results.items():
for d, dv in lv.items():
for v in dv:
prediction = v["prediction"].strip("\n").split("\n")[0]
target = v["target"].strip("\n")
                    try:
                        score = rouge.get_scores([prediction], [target], avg=True)["rouge-l"]["r"]
                    except Exception:
                        # rouge raises e.g. on empty predictions; count those as a miss
                        score = 0
rouge_score[l][d].append(score)
if args.gpt_eval:
                        # retry until the judge returns a parsable integer score
                        while True:
                            try:
                                gpt_score[l][d].append(evaluator.evaluate_response(prediction))
                                break
                            except ValueError:
                                pass
rouge_score[l][d] = round(sum(rouge_score[l][d]) / len(dv), 2)
                if args.gpt_eval:
                    # plain average; unlike the API call above, this cannot raise
                    gpt_score[l][d] = round(sum(gpt_score[l][d]) / len(dv), 2)
metrics = {'rouge': rouge_score}
if args.gpt_eval:
metrics["gpt"] = gpt_score
file_logger = FileLogger(makedirs(os.path.join(args.output_dir, "metrics.log")))
file_logger.log(metrics, Args=asdict(args))
for metric_key, metric_value in metrics.items():
# Copied from https://github.com/gkamradt/LLMTest_NeedleInAHaystack/blob/main/viz/CreateVizFromLLMTesting.ipynb
cmap = LinearSegmentedColormap.from_list("custom_cmap", ["#F0496E", "#EBB839", "#0CD79F"])
# Create the heatmap with better aesthetics
sns.set(rc={"figure.figsize": (17.5, 8), "axes.titlesize":14, "axes.labelsize":12}, style="whitegrid", palette="colorblind")
data = pd.DataFrame(metric_value)
if metric_key == "rouge":
vmin = 0
vmax = 1.0
label = "Rouge"
elif metric_key == "gpt":
vmin = 1
vmax = 10.0
label = "Accuracy"
annot = data.copy().astype(str)
annot[annot == str(vmax)] = ""
ax = sns.heatmap(
data,
cmap=cmap,
vmin=vmin,
vmax=vmax,
annot=annot,
fmt="",
linewidth=.5,
annot_kws={"fontsize":10},
)
cbar = ax.collections[0].colorbar
cbar.set_label(label, size=14)
# More aesthetics
plt.title('Needle In A HayStack') # Adds a title
plt.xlabel('Context Length', fontsize=14) # X-axis label
plt.ylabel('Depth Percent', fontsize=14) # Y-axis label
plt.xticks(rotation=45, fontsize=10) # Rotates the x-axis labels to prevent overlap
plt.yticks(rotation=0, fontsize=10) # Ensures the y-axis labels are horizontal
plt.tight_layout() # Fits everything neatly into the figure area
# save to result_dir
plt.savefig(os.path.join(result_dir, f"{metric_key}.png"), format='png', bbox_inches='tight')
plt.close()
if __name__ == "__main__":
main()
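# A hypothetical launch command (the script name and flag values are illustrative;
# HfArgumentParser exposes each Args field as a CLI flag):
#   accelerate launch needle_test.py \
#       --output_dir data/results/needle/ --result_dir run1 \
#       --test_length 8192 32768 131072 --test_depth 0 50 100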