Commit 6e3ef5ff authored by Benjamin Fattori

Merge remote-tracking branch 'upstream/big-refactor' into refactor-more-tasks

parents 026d2c21 070b6b9c
......@@ -44,10 +44,10 @@ To install additional multilingual tokenization and text segmentation packages,
pip install -e ".[multilingual]"
```
To support loading GPTQ quantized models, install the package with the `auto-gptq` extra:
To support loading GPTQ quantized models, install the package with the `gptq` extra:
```bash
pip install -e ".[auto-gptq]"
pip install -e ".[gptq]"
```
## Basic Usage
......@@ -94,6 +94,25 @@ accelerate launch main.py \
This will perform *data-parallel evaluation*: that is, placing a **single full copy** of your model onto each available GPU and *splitting batches across GPUs* to evaluate on K GPUs K times faster than on one.
However, if your model *is too large to be run on a single one of your GPUs*, then we provide an alternative method to run these large models: use of the `parallelize` argument.
```bash
python main.py \
--model hf \
    --model_args pretrained=EleutherAI/pythia-12b,parallelize=True \
--tasks lambada_openai,arc_easy \
--batch_size 16
```
To pass even more advanced keyword arguments to `accelerate`, the following arguments are also supported (a combined example is sketched after the note below):
- `device_map_option`: How to split model weights across available GPUs. Defaults to `"auto"`.
- `max_memory_per_gpu`: The maximum GPU memory to use per GPU when loading the model.
- `max_cpu_memory`: The maximum amount of CPU memory to use when offloading model weights to RAM.
- `offload_folder`: A folder where model weights will be offloaded to disk if needed.
Using this setting helps with massive models such as BLOOM that do not fit on a single GPU, and with avoiding spikes in total system RAM usage (by default, `accelerate launch` initializes one copy of the model per GPU in RAM before moving it to the GPU, producing RAM usage spikes around the start of the script that may cause errors such as `Killed`). However, `parallelize=True` naively splits the model across GPUs, so only a single GPU performs work at any point in time; it is therefore much slower than data-parallel evaluation with `accelerate launch`, potentially by a factor of the total number of GPUs.
**Note that this option requires launching evaluation via `python main.py` rather than `accelerate launch main.py`.**
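For example, these options can be combined in a single `--model_args` string. This is a sketch only: the `max_memory_per_gpu` value of `40GiB` is an arbitrary placeholder and should be adjusted to your hardware.
```bash
python main.py \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-12b,parallelize=True,device_map_option=auto,max_memory_per_gpu=40GiB,offload_folder=./offload \
    --tasks lambada_openai,arc_easy \
    --batch_size 16
```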
### Commercial APIs
......@@ -141,17 +160,17 @@ For models loaded with the HuggingFace `transformers` library, any arguments pr
```bash
python main.py \
--model hf \
--model_args pretrained=EleutherAI/gpt-j-6b,peft=nomic-ai/gpt4all-j-lora \
--model_args pretrained=EleutherAI/gpt-j-6b,parallelize=True,load_in_4bit=True,peft=nomic-ai/gpt4all-j-lora \
--tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \
--device cuda:0
```
GPTQ quantized models can be loaded by specifying their file names in `,quantized=NAME` (or `,quantized=True` for default names) in the `model_args` argument:
[GPTQ](https://github.com/PanQiWei/AutoGPTQ) quantized models can be loaded by specifying their file names in `,gptq=NAME` (or `,gptq=True` for default names) in the `model_args` argument:
```bash
python main.py \
--model hf \
--model_args pretrained=model-name-or-path,quantized=model.safetensors,gptq_use_triton=True \
--model_args pretrained=model-name-or-path,gptq=model.safetensors,gptq_use_triton=True \
--tasks hellaswag
```
......
......@@ -19,6 +19,9 @@ class LM(abc.ABC):
(inputs/outputs should be tokenization-agnostic.)
"""
# set rank and world size to a single process, by default.
self._rank = 0
self._world_size = 1
self.cache_hook = CacheHook(None)
@abc.abstractmethod
......@@ -118,14 +121,14 @@ class LM(abc.ABC):
# used in the case of parallelism. Hardcoded to
# ensure no errors arise using API models which do
# not support multi-device parallelism nor expect it.
return 0
return self._rank
@property
def world_size(self):
# used in the case of parallelism. Hardcoded to
# ensure no errors arise using API models which do
# not support multi-device parallelism nor expect it.
return 1
return self._world_size
def set_cache_hook(self, cache_hook):
self.cache_hook = cache_hook
......
......@@ -283,7 +283,7 @@ class Task(abc.ABC):
else:
eval_logger.warning(
"has_training_docs and has_validation_docs are False"
", using test_docs but this is not recommended."
", using test_docs as fewshot_docs but this is not recommended."
)
return self.test_docs()
......@@ -354,7 +354,8 @@ class Task(abc.ABC):
fewshot_ctx = self.fewshot_context(
doc, self._config.num_fewshot, rnd=random.Random()
)
# TODO: we should override this if doing greedy gen so users don't waste time+compute
# TODO: we should override self._config.repeats if doing greedy gen so users don't waste time+compute
inst = self.construct_requests(
doc=doc,
ctx=fewshot_ctx,
......
......@@ -195,11 +195,6 @@ def evaluate(
versions[task_name] = task.VERSION
configs[task_name] = dict(task.dump_config())
# deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
# task_docs = list(task_doc_func())
# rnd = random.Random()
# rnd.seed(42)
# rnd.shuffle(task_docs)
if limit is not None:
if task.has_test_docs():
task_docs = task.test_docs()
......@@ -257,13 +252,12 @@ def evaluate(
task.apply_filters()
### Collect values of metrics on all datapoints ###
# TODO: make metric configurable, add metric registry
vals = collections.defaultdict(list)
# unpack results and sort back in order and return control to Task
for task_name, task in task_dict.items():
# calculate values for each filter setup (TODO: make getting list of keys cleaner)
# TODO: make it possible to use a different metric per key
# TODO: make it possible to use a different metric per filter
# iterate over different filters used
for key in task.instances[0].filtered_resps.keys():
doc_iterator = (
itertools.islice(
......@@ -286,6 +280,7 @@ def evaluate(
"doc_id": doc_id,
"doc": doc,
"target": target,
"arguments": requests[0].args,
"resps": [req.resps for req in requests],
"filtered_resps": [req.filtered_resps[key] for req in requests],
}
......@@ -296,6 +291,15 @@ def evaluate(
if lm.world_size > 1:
# if multigpu, then gather data across all ranks
# first gather logged samples across all ranks
for task_name, task_samples in list(samples.items()):
full_samples = [None] * lm.world_size
torch.distributed.all_gather_object(full_samples, task_samples)
samples[task_name] = list(itertools.chain.from_iterable(full_samples))
# then collect metrics across all ranks
vals_torch = collections.defaultdict(list)
for (task_name, key, metric), items in vals.items():
......
import torch
import transformers
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from peft import __version__ as PEFT_VERSION, PeftModel
import copy
from collections import defaultdict
from tqdm import tqdm
from pathlib import Path
import torch.nn.functional as F
......@@ -16,7 +18,32 @@ from lm_eval.api.registry import register_model
from lm_eval.utils import MultiTokenEOSCriteria, stop_sequences_criteria
from accelerate import Accelerator
from typing import List, Union
from typing import List, Optional, Union
def _get_accelerate_args(
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload",
) -> dict:
"""Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
max_memory = {}
if max_memory_per_gpu is not None:
max_memory_per_gpu_map = {
device_idx: max_memory_per_gpu
for device_idx in range(torch.cuda.device_count())
}
max_memory.update(max_memory_per_gpu_map)
if max_cpu_memory is not None:
max_memory["cpu"] = max_cpu_memory
args = {}
if max_memory:
args["max_memory"] = max_memory
args["device_map"] = device_map_option
args["offload_folder"] = offload_folder
return args
@register_model("hf-auto", "hf", "huggingface")
......@@ -33,14 +60,31 @@ class HFLM(LM):
def __init__(
self,
device="cuda",
pretrained="gpt2",
revision="main",
low_cpu_mem_usage=None,
max_length=None,
subfolder=None,
tokenizer=None,
batch_size=1,
pretrained: Optional[str] = "gpt2",
revision: Optional[str] = "main",
subfolder: Optional[str] = None,
tokenizer: Optional[str] = None,
max_length: Optional[int] = None,
device: Optional[str] = "cuda",
dtype: Optional[Union[str, torch.dtype]] = "auto",
batch_size: Optional[int] = 1,
low_cpu_mem_usage: Optional[bool] = True,
trust_remote_code: Optional[bool] = False,
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
parallelize: Optional[bool] = False,
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload",
# PEFT and quantization options
peft: Optional[str] = None,
load_in_8bit: Optional[bool] = False,
load_in_4bit: Optional[bool] = False,
bnb_4bit_quant_type: Optional[str] = None,
bnb_4bit_compute_dtype: Optional[Union[str, torch.dtype]] = None,
gptq: Optional[Union[bool, str]] = False,
gptq_use_triton: Optional[bool] = False,
):
super().__init__()
......@@ -49,10 +93,16 @@ class HFLM(LM):
assert isinstance(batch_size, int)
gpus = torch.cuda.device_count()
accelerator = Accelerator()
if gpus <= 1:
if not (parallelize or accelerator.num_processes > 1):
# use user-passed device
device_list = set(
["cuda", "cpu"]
+ [f"cuda:{i}" for i in range(torch.cuda.device_count())]
)
if device:
if device not in ["cuda", "cpu"]:
if device not in device_list:
device = int(device)
self._device = torch.device(device)
eval_logger.info(f"Using device '{device}'")
......@@ -64,19 +114,29 @@ class HFLM(LM):
if torch.cuda.is_available()
else torch.device("cpu")
)
self._rank = 0
self._world_size = 1
else:
self._device = "cpu"
eval_logger.info(
f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model."
)
# TODO: include in warning that `load_in_8bit` etc. affect this too
self._device = device
model_kwargs = {}
if parallelize:
model_kwargs = _get_accelerate_args(
device_map_option,
max_memory_per_gpu,
max_cpu_memory,
offload_folder,
)
# TODO: update this to be less of a hack once subfolder is fixed in HF
revision = revision + ("/" + subfolder if subfolder is not None else "")
# get config
self._config = transformers.AutoConfig.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
)
if getattr(self._config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
......@@ -89,15 +149,67 @@ class HFLM(LM):
transformers.AutoModelForSeq2SeqLM,
]
self._model = self.AUTO_MODEL_CLASS.from_pretrained(
pretrained, revision=revision, low_cpu_mem_usage=low_cpu_mem_usage
).to(self.device)
if not gptq:
if load_in_4bit:
assert (
transformers.__version__ >= "4.30.0"
), "load_in_4bit requires transformers >= 4.30.0"
if transformers.__version__ >= "4.30.0":
model_kwargs["load_in_4bit"] = load_in_4bit
if load_in_4bit:
if bnb_4bit_quant_type:
model_kwargs["bnb_4bit_quant_type"] = bnb_4bit_quant_type
if bnb_4bit_compute_dtype:
model_kwargs["bnb_4bit_compute_dtype"] = utils.get_dtype(
bnb_4bit_compute_dtype
)
self._model = self.AUTO_MODEL_CLASS.from_pretrained(
pretrained,
revision=revision,
torch_dtype=utils.get_dtype(dtype),
low_cpu_mem_usage=low_cpu_mem_usage,
trust_remote_code=trust_remote_code,
load_in_8bit=load_in_8bit,
**model_kwargs,
)
else:
try:
from auto_gptq import AutoGPTQForCausalLM
except ModuleNotFoundError:
raise Exception(
"Tried to load auto_gptq, but auto-gptq is not installed ",
"please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]",
)
self._model = AutoGPTQForCausalLM.from_quantized(
pretrained,
model_basename=None if gptq is True else Path(gptq).stem,
low_cpu_mem_usage=low_cpu_mem_usage,
trust_remote_code=trust_remote_code,
use_safetensors=True if gptq is True else gptq.endswith(".safetensors"),
use_triton=gptq_use_triton,
warmup_triton=gptq_use_triton,
**model_kwargs,
)
if peft:
if load_in_4bit:
assert PEFT_VERSION >= "0.4.0", "load_in_4bit requires peft >= 0.4.0"
self._model = PeftModel.from_pretrained(
self._model, peft, revision=revision
)
# forever after, access self._model through self.model property
self.model.eval()
self.model.tie_weights()
if gpus <= 1 and not parallelize:
# place model onto device, if not using HF Accelerate in any form
self.model.to(self.device)
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer,
revision=revision,
trust_remote_code=trust_remote_code,
)
self.vocab_size = self.tokenizer.vocab_size
......@@ -106,12 +218,18 @@ class HFLM(LM):
self._max_length = max_length
# multithreading and batching
self.batch_size_per_gpu = batch_size # todo: adaptive batch size
self.batch_size_per_gpu = batch_size
# multigpu support with accelerate
# multigpu data-parallel support when launched with accelerate
if gpus > 1:
accelerator = Accelerator()
if gpus > accelerator.num_processes:
if parallelize:
if accelerator.num_processes > 1:
raise RuntimeError(
"Attempted to use both a HF Accelerate `device_map` and to launch via `accelerate launch`. If this is the case, please either remove `parallelize=True` from --model_args or launch outside of the Accelerate launcher."
)
else:
pass
elif gpus > accelerator.num_processes:
# TODO: make sure there's still never an edge case where we unintentionally default to CPU
eval_logger.warning(
"WARNING: The number of total system GPUs does not match the number of spawned processes. "
......@@ -302,16 +420,27 @@ class HFLM(LM):
return logits
def _encode_pair(self, context, continuation):
# move any trailing whitespace at the end of the context onto the continuation,
# so that the whitespace token(s) are scored as part of the continuation
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
whole_enc = self.tok_encode(context + continuation)
context_enc = self.tok_encode(context)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in [req.args for req in requests]:
if context == "":
# end of text as context
context_enc = [self.eot_token_id]
context_enc, continuation_enc = [self.eot_token_id], self.tok_encode(
continuation
)
else:
context_enc = self.tok_encode(context)
continuation_enc = self.tok_encode(continuation)
context_enc, continuation_enc = self._encode_pair(context, continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
......@@ -383,7 +512,6 @@ class HFLM(LM):
tqdm(re_ord.get_reordered(), disable=(disable_tqdm or (self.rank != 0))),
self.batch_size,
):
inps = []
cont_toks_list = []
inplens = []
......@@ -480,12 +608,11 @@ class HFLM(LM):
multi_logits = F.log_softmax(
self._model_call(batched_inps, **call_kwargs), dim=-1
).cpu() # [batch, padding_length (inp or cont), vocab]
) # [batch, padding_length (inp or cont), vocab]
for (cache_key, _, _), logits, inplen, cont_toks in zip(
chunk, multi_logits, inplens, cont_toks_list
):
# Slice to original seq length
contlen = len(cont_toks)
# take only logits in the continuation
......@@ -500,7 +627,9 @@ class HFLM(LM):
# Check if per-token argmax is exactly equal to continuation
greedy_tokens = logits.argmax(dim=-1)
cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(
cont_toks = torch.tensor(
cont_toks, dtype=torch.long, device=self.device
).unsqueeze(
0
) # [1, seq]
max_equal = (greedy_tokens == cont_toks).all()
......
# v1.0 Tasks
This list keeps track of which tasks' implementations have been ported to YAML / v2.0 of the Eval Harness.
Boxes should be checked iff tasks are implemented in the refactor and tested for regression. Tasks should be struck through if checked *against original introducing paper* implementation or popularizing implementation.
Boxes should be checked iff tasks are implemented in the refactor and tested for regression. Tasks should be struck through if checked *against original introducing paper* implementation or popularizing implementation. (WIP) denotes that a PR or person is already working on this task.
- [ ] Glue (WIP)
- [x] SuperGlue
......@@ -12,7 +12,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [ ] Lambada (Multilingual)
- [x] Wikitext
- [x] PiQA
- [ ] PROST
- [ ] PROST (WIP)
- [ ] MCTACO
- [x] Pubmed QA
- [x] SciQ
......@@ -21,11 +21,16 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [ ] TriviaQA
- [x] AI2 ARC
- [ ] LogiQA
- [ ] HellaSwag
- [x] HellaSwag
- [x] SWAG
- [x] OpenBookQA
- [ ] SQuADv2
- [x] RACE
- [ ] LogiQA (WIP)
- [x] HellaSwag
- [ ] SWAG (WIP)
- [x] OpenBookQA
- [ ] SQuADv2 (WIP)
- [ ] HeadQA
- [ ] MathQA
- [ ] WebQs
......@@ -35,7 +40,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [ ] Hendrycks Ethics
- [ ] TruthfulQA
- [ ] MuTual
- [ ] Hendrycks Math
- [ ] Hendrycks Math (WIP)
- [ ] Asdiv
- [ ] GSM8k
- [x] Arithmetic
......@@ -45,6 +50,8 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [x] ~~Pile (perplexity)~~
- [ ] BLiMP
- [ ] ToxiGen
- [ ] StoryCloze
- [ ] NaturalQs
- [ ] CrowS-Pairs
- [ ] XCopa
- [ ] BIG-Bench
......@@ -53,6 +60,9 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [ ] PAWS-X
- [ ] XNLI
- [ ] MGSM
- [ ] SCROLLS
- [ ] JSON Task (reference: https://github.com/EleutherAI/lm-evaluation-harness/pull/481)
- [ ] Babi
# Novel Tasks
Tasks added in the revamped harness that were not previously available. Again, a strikethrough denotes checking performed *against the original task's implementation or published results introducing the task*.
......
......@@ -124,28 +124,6 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
get_task_name_from_object(task_element): task_element,
}
# task_name_from_registry_dict = {
# task_name: get_task(
# task_name=task_name,
# task_config=config
# )
# for group_name in task_name_list for task_name in GROUP_REGISTRY[group_name]
# if (isinstance(group_name, str)) and (group_name in GROUP_REGISTRY)
# }
# task_name_from_config_dict = {
# get_task_name_from_config(task_config): ConfigurableTask(
# config=task_config
# )
# for task_config in task_name_list
# if isinstance(task_config, dict)
# }
# # TODO: Do we still need this?
# task_name_from_object_dict = {
# get_task_name_from_object(task_object): task_object
# for task_object in task_name_list
# if isinstance(task_object, Task)
# }
assert set(task_name_from_registry_dict.keys()).isdisjoint(
set(task_name_from_object_dict.keys())
)
......
......@@ -25,7 +25,7 @@ metric_list:
regexes_to_ignore:
- ","
- "\\$"
delimiter: "\n\n"
fewshot_delimiter: "\n\n"
generation_kwargs:
until:
- "Q:"
......
......@@ -7,7 +7,7 @@ output_type: multiple_choice
training_split: train
validation_split: validation
test_split: null
template_aliases: "{% set gold = label %}{% set answer_choices = endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', ' ', ' ')|list %}"
template_aliases: "{% set gold = label | int %}{% set answer_choices = endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', ' ', ' ')|list %}"
doc_to_text: "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace(' ', ' ')}}"
doc_to_target: "{{answer_choices[gold]}}"
gold_alias: "{{gold}}"
......
group:
- super-glue-promptsource
task: "GPT-3 Style"
dataset_path: super_glue
dataset_name: boolq
training_split: train
validation_split: validation
use_prompt: "promptsource:GPT-3 Style"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "based on the previous passage"
use_prompt: "promptsource:based on the previous passage"
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "based on the following passage"
use_prompt: "promptsource:based on the following passage"
group:
- super-glue-lm-eval-v1
- super-glue-lm-eval-v1-seq2seq
task: "boolq-seq2seq"
dataset_path: super_glue
dataset_name: boolq
......
group:
- super-glue-lm-eval-v1
task: "default"
task: "cb"
dataset_path: super_glue
dataset_name: cb
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}}. True, False, or Neither?\nAnswer:"
doc_to_target: "{{answer_choices[labe]}}"
doc_to_target: "{{answer_choices[label]}}"
gold_alias: "{{label}}" # this will be cast to an int.
template_aliases: "{% set answer_choices = ['True', 'False', 'Neither'] %}"
metric_list:
......
group:
- super-glue-promptsource
task: "GPT-3 style"
dataset_path: super_glue
dataset_name: cb
training_split: train
validation_split: validation
use_prompt: "promptsource:GPT-3 style"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "MNLI crowdsource"
use_prompt: "promptsource:MNLI crowdsource"
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "based on the previous passage"
use_prompt: "promptsource:based on the previous passage"
group:
- super-glue-t5-prompt
task: t5-prompt
reference: "From Raffel et al. 2019"
task: super_glue-cb-t5-prompt
dataset_path: super_glue
dataset_name: cb
training_split: train
......
group:
- super-glue-lm-eval-v1
task: "copa"
dataset_path: super_glue
dataset_name: copa
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
gold_alias: "{{label}}" # this will be cast to an int.
template_aliases: "{% set answer_choices = [{{doc.choice1}}, 'b'] %} {{answer_choices}}"
metric_list:
- metric: acc
group:
- super-glue-promptsource
task: "C1 or C2? premise, so/because…"
dataset_path: super_glue
dataset_name: copa
training_split: train
validation_split: validation
use_prompt: "promptsource:C1 or C2? premise, so/because…"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true