Commit 21cf6508 authored by lintangsutawika

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into mmlu_subgroups
parents 7d496612 06e69149
......@@ -90,7 +90,7 @@ python -m lm_eval \
--batch_size 8
```
Models that are loaded via both `transformers.AutoModelForCausalLM` (autoregressive, decoder-only GPT-style models) and `transformers.AutoModelForSeq2SeqLM` (such as encoder-decoder models like T5) in Huggingface are supported.
Batch size selection can be automated by setting the ```--batch_size``` flag to ```auto```. This will perform automatic detection of the largest batch size that will fit on your device. On tasks where there is a large difference between the longest and shortest example, it can be helpful to periodically recompute the largest batch size, to gain a further speedup. To do this, append ```:N``` to the above flag to automatically recompute the largest batch size ```N``` times. For example, to recompute the batch size 4 times, the command would be:
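(The flags other than `--batch_size` below simply mirror the earlier example and are illustrative; only `--batch_size auto:4` is the relevant part.)
```bash
python -m lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/gpt-j-6B \
    --tasks hellaswag \
    --batch_size auto:4
```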
......@@ -155,14 +155,14 @@ A full accounting of the supported and planned libraries + APIs can be seen belo
| API or Inference Server | Implemented? | `--model <xxx>` name | Models supported: | Request Types: |
|-----------------------------|---------------------------------|----------------------------------------------------------------------------------|--------------------------------------|----------------------------------------------------------|
| OpenAI Completions | :heavy_check_mark: | `openai`, `openai-completions`, `gooseai` | up to `code-davinci-002` | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| OpenAI ChatCompletions | :x: Not yet - needs help! | N/A | (link here?) | `generate_until` (no logprobs) |
| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no logprobs) |
| GooseAI | :heavy_check_mark: (not separately maintained) | `openai`, `openai-completions`, `gooseai` (same interface as OpenAI Completions) | | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Textsynth | Needs testing | `textsynth` | ??? | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Cohere | :hourglass: - blocked on Cohere API bug | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| GGML | :hourglass: [PR](https://github.com/EleutherAI/lm-evaluation-harness/pull/617) | N/A | ??? | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| vLLM | :x: Not yet - needs help! | N/A | All HF models | `generate_until` (no logprobs) |
| Your inference server here! | ... | ... | ... | ... |
It is on our roadmap to create task variants designed to enable models that do not serve logprobs/loglikelihoods to be compared with the generation performance of open-source models.
......
......@@ -57,7 +57,7 @@ import lm_eval
my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code)
...
lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.generate_until()`
results = lm_eval.simple_evaluate( # call simple_evaluate
model=lm_obj,
......@@ -83,7 +83,7 @@ from my_tasks import MyTask1 # suppose you've defined a custom lm_eval.api.Task
my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code)
...
lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.generate_until()`
......
......@@ -44,26 +44,24 @@ class MyCustomLM(LM):
#...
def generate_until(self, requests: list[Instance]) -> list[str]:
#...
#...
```
Where `Instance` is a dataclass defined in [`lm_eval.api.instance`](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/api/instance.py) with a property `args` that returns the arguments of the request, e.g. a tuple of (context, continuation).

We support three types of requests, consisting of different interactions / measurements with an autoregressive LM. All three request types take as input `requests` of type `list[Instance]` whose `Instance.request_type` matches the method name (a minimal sketch of a model class serving all three follows this list).

- `generate_until`
  - Each request contains `Instance.args : Tuple[str, dict]` containing 1. an input string to the LM and 2. a dictionary of keyword arguments used to control generation parameters. The model continues generation from the input string and returns only the generated text.
- `loglikelihood`
  - Each request contains `Instance.args : Tuple[str, str]` containing 1. an input string to the LM and 2. a target string. The model returns the log probability of the target conditioned on the input, along with a flag indicating whether the target would be produced by greedy decoding.
- `loglikelihood_rolling`
  - Each request contains `Instance.args : Tuple[str]`, a single input string whose full loglikelihood the model scores. This is used for perplexity-style evaluation, reported via tokenizer-agnostic metrics such as word/byte perplexity and bits per byte.
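As a rough, minimal sketch (not the harness's reference implementation), a dummy `LM` subclass serving all three request types could look like the following; the fixed return values are placeholders, and the import paths follow the `big-refactor` layout referenced above:

```python
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM


class DummyLM(LM):
    """Toy model class that answers every request type with fixed values."""

    def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
        # args is (context, target); return one (loglikelihood, is_greedy) pair per request
        return [(-1.0, False) for _ in requests]

    def loglikelihood_rolling(self, requests: list[Instance]) -> list[float]:
        # args is (context,); return the loglikelihood of the whole string
        return [-10.0 for _ in requests]

    def generate_until(self, requests: list[Instance]) -> list[str]:
        # args is (context, gen_kwargs); return one generated string per request
        return ["lol" for _ in requests]
```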
## Registration
......
......@@ -32,7 +32,7 @@ Prompting / in-context formatting options:
- **use_prompt** (`str`, *optional*) — Name of prompt in promptsource to use. If defined, will overwrite doc_to_text, doc_to_target, and doc_to_choice.
- **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate input for the model.
- **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate target output for the model. For multiple choice tasks, this should return an index into the list of choices produced by `doc_to_choice`.
- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `generate_until` tasks.
- **fewshot_delimiter** (`str`, *optional*, defaults to "\n\n") — String to insert between few-shot examples.
- **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested.
......@@ -42,7 +42,7 @@ Runtime configuration options:
Scoring details:
- **metric_list** (`str`, *optional*, defaults to None) — A list of metrics to use for evaluation. See docs for expected format.
- **output_type** (`str`, *optional*, defaults to "generate_until") — Selects the type of model output for the given task. Options are `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`. (An illustrative config exercising several of these keys appears after this list.)
- **generation_kwargs** (`dict`, *optional*) — Auxiliary arguments for the `generate` function from HF transformers library. Advanced keyword arguments may not be supported for non-HF LM classes.
- **repeats** (`int`, *optional*, defaults to 1) — Number of repeated runs through model for each sample. Can be used for cases such as self-consistency.
- **filter_list** (`Union[str, list]`, *optional*) — List of filters to postprocess model outputs. See below for further detail on the filter API.
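For orientation, here is a hypothetical `generate_until`-style task config exercising several of the keys above. The task name, splits, and Jinja2 field names are made up, dataset-selection keys documented elsewhere are omitted, and the `metric_list` entry layout is my reading of the docs referenced above:

```yaml
# Illustrative sketch only; task name, splits, and template fields are invented.
task: my_qa_task
training_split: train
validation_split: validation
output_type: generate_until
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{answer}}"
target_delimiter: " "
fewshot_delimiter: "\n\n"
generation_kwargs:
  until:
    - "\n\n"
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
```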
......
......@@ -4,7 +4,7 @@ from typing import Literal, Tuple
@dataclass
class Instance:
request_type: Literal["loglikelihood", "loglikelihood_rolling", "generate_until"]
doc: dict
arguments: tuple
idx: int
......
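For illustration, a hypothetical `generate_until` instance built only from the fields visible above (any further fields truncated from this diff are assumed to have defaults):

```python
from lm_eval.api.instance import Instance

# Hypothetical request: the doc contents and generation kwargs are made up.
req = Instance(
    request_type="generate_until",
    doc={"question": "What is 2 + 2?"},
    arguments=("Question: What is 2 + 2?\nAnswer:", {"until": ["\n\n"]}),
    idx=0,
)
```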
......@@ -5,6 +5,7 @@ import numpy as np
import sacrebleu
import sklearn.metrics
import random
import evaluate
from lm_eval.api.registry import register_metric, register_aggregation
......@@ -135,6 +136,19 @@ def acc_mutual_info_fn(items): # This is a passthrough function
return items
exact_match = evaluate.load("exact_match")
@register_metric(
metric="exact_match",
higher_is_better=True,
output_type="generate_until",
aggregation="mean",
)
def exact_match_fn(**kwargs):
return exact_match.compute(**kwargs)
@register_metric(
metric="perplexity",
higher_is_better=False,
......@@ -212,7 +226,7 @@ def f1_fn(items): # This is a passthrough function
@register_metric(
metric="bleu",
higher_is_better=True,
output_type="generate_until",
aggregation="bleu",
)
def bleu_fn(items): # This is a passthrough function
......@@ -222,7 +236,7 @@ def bleu_fn(items): # This is a passthrough function
@register_metric(
metric="chrf",
higher_is_better=True,
output_type="generate_until",
aggregation="chrf",
)
def chrf_fn(items): # This is a passthrough function
......@@ -232,7 +246,7 @@ def chrf_fn(items): # This is a passthrough function
@register_metric(
metric="ter",
higher_is_better=True,
output_type="generate_until",
aggregation="ter",
)
def ter_fn(items): # This is a passthrough function
......
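The snippet above shows how metrics are wired up via `register_metric`: `exact_match` wraps the `evaluate` library, while `bleu`, `chrf`, and `ter` are passthrough functions whose aggregation does the scoring. A hypothetical custom metric could be registered with the same decorator pattern. The name `prefix_match` and the assumption that the harness supplies `references`/`predictions` keyword arguments (as the `exact_match` wrapper suggests) are illustrative, not part of the library:

```python
@register_metric(
    metric="prefix_match",
    higher_is_better=True,
    output_type="generate_until",
    aggregation="mean",
)
def prefix_match_fn(references=None, predictions=None, **kwargs):
    # fraction of predictions that start with their reference string
    score = sum(p.startswith(r) for r, p in zip(references, predictions)) / max(
        len(predictions), 1
    )
    return {"prefix_match": score}
```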
......@@ -211,12 +211,12 @@ class CachingLM:
)
for req in tqdm(requests):
hsh = hash_args(attr, req.args)
if attr == "generate_until" and req.args[1].get("do_sample", False):
# when we are doing non-greedy generation, don't use the cache
# (else every "randomly sampled" generation would be identical for repeats > 1).
if not warned:
eval_logger.warning(
f"Arguments to lm.greedy_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests."
f"Arguments to lm.generate_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests."
)
warned = True
res.append(None)
......
......@@ -68,10 +68,10 @@ def register_group(name):
return decorate
OUTPUT_TYPE_REGISTRY = {}
METRIC_REGISTRY = {}
METRIC_AGGREGATION_REGISTRY = {}
AGGREGATION_REGISTRY = {}
HIGHER_IS_BETTER_REGISTRY = {}
DEFAULT_METRIC_REGISTRY = {
......@@ -81,7 +81,7 @@ DEFAULT_METRIC_REGISTRY = {
],
"loglikelihood_rolling": ["word_perplexity", "byte_perplexity", "bits_per_byte"],
"multiple_choice": ["acc", "acc_norm"],
"greedy_until": ["exact_match"],
"generate_until": ["exact_match"],
}
......@@ -95,8 +95,7 @@ def register_metric(**args):
for key, registry in [
("metric", METRIC_REGISTRY),
("higher_is_better", HIGHER_IS_BETTER_REGISTRY),
# ("output_type", OUTPUT_TYPE_REGISTRY),
("aggregation", DEFAULT_AGGREGATION_REGISTRY),
("aggregation", METRIC_AGGREGATION_REGISTRY),
]:
if key in args:
......@@ -158,12 +157,13 @@ def get_aggregation(name):
)
def get_metric_aggregation(name):
try:
return METRIC_AGGREGATION_REGISTRY[name]
except KeyError:
eval_logger.warning(
"{} metric is not assigned a default aggregation!".format(name),
)
......@@ -171,7 +171,6 @@ def is_higher_better(metric_name):
try:
return HIGHER_IS_BETTER_REGISTRY[metric_name]
except KeyError:
raise Warning(f"higher_is_better not specified for metric '{metric_name}'!")
eval_logger.warning(
f"higher_is_better not specified for metric '{metric_name}'!"
)
......@@ -33,7 +33,7 @@ from lm_eval.api.metrics import (
from lm_eval.api.registry import (
get_metric,
get_aggregation,
get_metric_aggregation,
is_higher_better,
DEFAULT_METRIC_REGISTRY,
OUTPUT_TYPE_REGISTRY,
......@@ -538,12 +538,14 @@ class ConfigurableTask(Task):
self._aggregation_list = {}
self._higher_is_better = {}
if self.config.metric_list is None:
# TODO: handle this in TaskConfig.__post_init__ ?
_metric_list = DEFAULT_METRIC_REGISTRY[self.config.output_type]
for metric_name in _metric_list:
self._metric_fn_list[metric_name] = get_metric(metric_name)
self._metric_fn_kwargs[metric_name] = {}
self._aggregation_list[metric_name] = get_metric_aggregation(
metric_name
)
self._higher_is_better[metric_name] = is_higher_better(metric_name)
......@@ -586,7 +588,7 @@ class ConfigurableTask(Task):
]
else:
INV_AGG_REGISTRY = {v: k for k, v in AGGREGATION_REGISTRY.items()}
metric_agg = get_metric_aggregation(metric_name)
eval_logger.warning(
f"[Task: {self._config.task}] metric {metric_name} is defined, but aggregation is not. "
f"using default "
......@@ -687,7 +689,10 @@ class ConfigurableTask(Task):
for choice in check_choices:
choice_has_whitespace = True if choice[0].isspace() else False
delimiter_has_whitespace = (
True
if self.config.target_delimiter.rstrip()
== self.config.target_delimiter
else False
)
if delimiter_has_whitespace and choice_has_whitespace:
......
......@@ -661,8 +661,8 @@ class HFLM(LM):
# automatic (variable) batch size detection for vectorization
# pull longest context sample from request
chunks = utils.chunks(
re_ord.get_reordered(),
n=self.batch_size
if self.batch_size != "auto"
else override_bs
......@@ -673,7 +673,9 @@ class HFLM(LM):
and n_reordered_requests > 0
and not override_bs
else None,
)
for chunk in tqdm(chunks, disable=(disable_tqdm or (self.rank != 0))):
inps = []
cont_toks_list = []
inplens = []
......@@ -844,8 +846,8 @@ class HFLM(LM):
adaptive_batch_size = batch_size
# for each different set of kwargs, we execute all requests, by batch.
for key, re_ord in re_ords.items():
chunks = utils.chunks(
re_ord.get_reordered(),
n=self.batch_size
if self.batch_size != "auto"
else adaptive_batch_size
......@@ -854,7 +856,8 @@ class HFLM(LM):
fn=self._batch_scheduler
if self.batch_size == "auto" and not adaptive_batch_size
else None,
)
for chunk in tqdm(chunks, disable=self.rank != 0):
contexts, all_gen_kwargs = zip(*chunk)
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
......
......@@ -15,7 +15,8 @@ from lm_eval.api.registry import (
import logging
eval_logger = logging.getLogger("lm-eval")
def register_configurable_task(config: Dict[str, str]) -> int:
SubClass = type(
......
......@@ -9,4 +9,4 @@ task:
- wsc
- ai2_arc
- blimp
- mmlu
......@@ -8,7 +8,8 @@ training_split: train
validation_split: validation
doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
doc_to_target: label
doc_to_choice: [' no', ' yes']
target_delimiter: ""
generation_kwargs:
until:
- "\n\n"
......
......@@ -38,13 +38,12 @@ dependencies = [
"zstandard",
]
[tool.setuptools.packages.find]
include = ["lm_eval*"]
# required to include yaml files in pip installation
[tool.setuptools.package-data]
lm_eval = ["**/*.yaml", "tasks/**/*"]
examples = ["**/*.yaml"]
[project.scripts]
lm-eval = "lm_eval.__main__:cli_evaluate"
......
......@@ -23,7 +23,7 @@ class DryrunLM(LM):
return res
def generate_until(self, requests):
res = []
for ctx, _ in requests:
......
......@@ -5,6 +5,8 @@ import os
import random
from lm_eval import tasks
from lm_eval.utils import join_iters
from lm_eval.tasks import include_path
from lm_eval.logger import eval_logger
EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n"
......@@ -17,6 +19,12 @@ def parse_args():
parser.add_argument("--num_fewshot", type=int, default=1)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--num_examples", type=int, default=1)
parser.add_argument(
"--include_path",
type=str,
default=None,
help="Additional path to include if there are external tasks to include.",
)
return parser.parse_args()
......@@ -24,6 +32,10 @@ def main():
args = parse_args()
np.random.seed(args.seed)
if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}")
include_path(args.include_path)
if args.tasks == "all_tasks":
task_names = tasks.ALL_TASKS
else:
......
......@@ -15,10 +15,10 @@ class Test_HFLM:
multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")() # type: ignore
multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
generate_until_task = tasks.TASK_REGISTRY.get("gsm8k_yaml")() # type: ignore
generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
generate_until: list[Instance] = generate_until_task.instances
rolling_task = tasks.TASK_REGISTRY.get("wikitext")() # type: ignore
rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
ROLLING: list[Instance] = rolling_task.instances
......@@ -65,7 +65,7 @@ class Test_HFLM:
-52.70050811767578,
-56.25089645385742,
]
generate_until_RES = [
" The average of $2.50 each is $",
" A robe takes 2 bolts of blue fiber and half",
" $50,000 in repairs.",
......@@ -109,9 +109,9 @@ class Test_HFLM:
), np.argmax(np.array(_res).reshape(-1, 4), axis=1)
assert (argmax_RES == argmax_res).all()
def test_generate_until(self) -> None:
res = self.LM.generate_until(self.generate_until)
assert res == self.generate_until_RES
def test_logliklihood_rolling(self) -> None:
res = self.LM.loglikelihood_rolling(self.ROLLING)
......
......@@ -78,7 +78,7 @@ def test_gpt2():
# test empty context
gpt2.loglikelihood([("", "test")])
(gen,) = gpt2.generate_until(
[("The quick brown fox jumps over the lazy", [".", "\n"])]
)
......@@ -204,7 +204,7 @@ def test_gpt3():
# test empty context
gpt3.loglikelihood([("", "test")])
(gen,) = gpt3.generate_until(
[("The quick brown fox jumps over the lazy", [".", "\n"])]
)
......@@ -300,7 +300,7 @@ def test_textsynth():
# test empty context
textsynth.loglikelihood([("", "test")])
(gen,) = textsynth.generate_until(
[("The quick brown fox jumps over the lazy", [".", "\n"])]
)
......
......@@ -98,9 +98,9 @@ def test_versions_stable(taskname, task_class):
return res
def generate_until(reqs):
res = []
assert_target_hashed(f"{taskname}-v{task_class.VERSION}-greedy_until", reqs)
assert_target_hashed(f"{taskname}-v{task_class.VERSION}-generate_until", reqs)
for ctx, _ in [req.args for req in reqs]:
res.append("lol")
......@@ -110,7 +110,7 @@ def test_versions_stable(taskname, task_class):
lm.loglikelihood = ll_fn
lm.loglikelihood_rolling = ll_perp_fn
lm.generate_until = generate_until
limit = None
result = evaluator.evaluate(
......