Commit f77a3a27 authored by lintangsutawika

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into mmlu_subgroups
parents 109ed1c7 f8342178
@@ -43,7 +43,7 @@ jobs:
   # # mypy turned off for now
   # - name: Lint with mypy
   # run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
-  Job 2:
+  # Job 2
   testcpu:
     name: CPU Tests
     runs-on: ubuntu-latest
...
@@ -23,8 +23,12 @@ Features:
 - Many tasks implemented, 200+ tasks [implemented in the old framework](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md) which require porting to the new setup as described in [the new task guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md).
 - Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
 - Support for commercial APIs including [OpenAI](https://openai.com), [goose.ai](https://goose.ai), and [TextSynth](https://textsynth.com/).
-- Support for evaluation on adapters (e.g. LoRa) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft).
-- Evaluating with publicly available prompts ensures reproducibility and comparability between papers.
+- Support for evaluation on adapters (e.g. LoRA) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft).
+- Support for local models and benchmarks.
+- Evaluation with publicly available prompts ensures reproducibility and comparability between papers.
+
+The Language Model Evaluation Harness is the backend for 🤗 Hugging Face's popular [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and is used internally by dozens of companies including NVIDIA, Cohere, Booz Allen Hamilton, and Mosaic ML.

 ## Install
...
@@ -97,6 +97,12 @@ def parse_eval_args() -> argparse.Namespace:
         default=None,
         help="Additional path to include if there are external tasks to include.",
     )
+    parser.add_argument(
+        "--verbosity",
+        type=str,
+        default="INFO",
+        help="Log error when tasks are not registered.",
+    )
     return parser.parse_args()
@@ -105,6 +111,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
         # we allow for args to be passed externally, else we parse them ourselves
         args = parse_eval_args()

+    eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
     os.environ["TOKENIZERS_PARALLELISM"] = "false"

     if args.limit:
...
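The new `--verbosity` flag is resolved onto a standard `logging` level via `getattr(logging, ...)` before evaluation starts. A minimal sketch of that behaviour, reusing the `"lm-eval"` logger name that this commit introduces further down in the diff:

```python
import logging

# The harness-wide logger; the name matches `logging.getLogger('lm-eval')`
# introduced later in this diff.
eval_logger = logging.getLogger("lm-eval")


def set_verbosity(verbosity: str = "INFO") -> None:
    # Mirrors `eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))`
    # from cli_evaluate(): the string must name a logging level such as DEBUG or INFO.
    eval_logger.setLevel(getattr(logging, verbosity))


set_verbosity("DEBUG")  # e.g. invoked as: python -m lm_eval ... --verbosity DEBUG
```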
@@ -44,7 +44,7 @@ ALL_OUTPUT_TYPES = [
     "loglikelihood",
     "multiple_choice",
     "loglikelihood_rolling",
-    "greedy_until",
+    "generate_until",
 ]
@@ -80,7 +80,7 @@ class TaskConfig(dict):
     num_fewshot: int = 0
     # scoring options
     metric_list: list = None
-    output_type: str = "greedy_until"
+    output_type: str = "generate_until"
     generation_kwargs: dict = None
     repeats: int = 1
     filter_list: Union[str, list] = None
@@ -97,11 +97,11 @@ class TaskConfig(dict):
             self.dataset_path = inspect.getfile(import_module(self.dataset_path))

         if self.generation_kwargs is not None:
-            if self.output_type != "greedy_until":
+            if self.output_type != "generate_until":
                 eval_logger.warning(
-                    f"[{self.task}] passed `generation_kwargs`, but not using `output_type: greedy_until`!"
+                    f"[{self.task}] passed `generation_kwargs`, but not using `output_type: generate_until`!"
                 )
-                assert self.output_type != "greedy_until"
+                assert self.output_type != "generate_until"

             if "temperature" in self.generation_kwargs:
                 self.generation_kwargs["temperature"] = float(
@@ -111,7 +111,7 @@ class TaskConfig(dict):
             if "until" not in self.generation_kwargs:
                 self.generation_kwargs["until"] = [self.fewshot_delimiter]
         else:
-            if self.output_type == "greedy_until":
+            if self.output_type == "generate_until":
                 # ensure that we greedily generate in absence of explicit arguments otherwise
                 self.generation_kwargs = {
                     "until": None
@@ -958,7 +958,7 @@ class ConfigurableTask(Task):
                 )
             return request_list

-        elif self.OUTPUT_TYPE == "greedy_until":
+        elif self.OUTPUT_TYPE == "generate_until":
             arguments = (ctx, self.config.generation_kwargs)

         return Instance(
@@ -1070,7 +1070,7 @@ class ConfigurableTask(Task):
                 acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0
                 result_dict["acc_mutual_info"] = acc_mutual_info

-        elif self.OUTPUT_TYPE == "greedy_until":
+        elif self.OUTPUT_TYPE == "generate_until":
             gold = self.doc_to_target(doc)
             result = results[0]
             if self.config.doc_to_choice is not None:
@@ -1134,7 +1134,7 @@ class ConfigurableTask(Task):
         else:
             raise ValueError(
                 f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
-                "'loglikelihood', 'loglikelihood_rolling', 'greedy_until' or 'multiple_choice'",
+                "'loglikelihood', 'loglikelihood_rolling', 'generate_until' or 'multiple_choice'",
             )

         return result_dict
...
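In short, task configs now declare `output_type: generate_until` (formerly `greedy_until`), and `TaskConfig.__post_init__` fills in greedy defaults whenever `generation_kwargs` is omitted. A standalone sketch of that default-filling rule, using the field names shown above; the class itself is illustrative, not the harness's own:

```python
from dataclasses import dataclass


@dataclass
class MiniTaskConfig:
    # Illustrative subset of the TaskConfig fields from the diff above.
    output_type: str = "generate_until"
    generation_kwargs: dict = None
    fewshot_delimiter: str = "\n\n"

    def __post_init__(self):
        if self.generation_kwargs is not None:
            if self.output_type != "generate_until":
                raise ValueError(
                    "passed `generation_kwargs`, but not using `output_type: generate_until`!"
                )
            # Default stop sequence: the few-shot delimiter, as in the diff.
            self.generation_kwargs.setdefault("until", [self.fewshot_delimiter])
        elif self.output_type == "generate_until":
            # Greedy generation in the absence of explicit arguments
            # ("do_sample": False is an assumption about the greedy default).
            self.generation_kwargs = {"until": None, "do_sample": False}


print(MiniTaskConfig().generation_kwargs)
# -> {'until': None, 'do_sample': False}
print(MiniTaskConfig(generation_kwargs={"temperature": 0.0}).generation_kwargs)
# -> {'temperature': 0.0, 'until': ['\n\n']}
```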
import os
import yaml

from lm_eval import utils
from lm_eval.tasks import register_configurable_task, check_prompt_config
from lm_eval.logger import eval_logger
from lm_eval.api.registry import (
    TASK_REGISTRY,
    GROUP_REGISTRY,
    ALL_TASKS,
)


def include_benchmarks(task_dir: str) -> None:
    # Walk task_dir and register every benchmark group YAML found in its leaf directories.
    for root, subdirs, file_list in os.walk(task_dir):
        if (subdirs == [] or "__pycache__" in subdirs) and (len(file_list) > 0):
            for f in file_list:
                if f.endswith(".yaml"):
                    try:
                        benchmark_path = os.path.join(root, f)
                        with open(benchmark_path, "rb") as file:
                            yaml_config = yaml.full_load(file)

                        if "prompts" in yaml_config:
                            continue  # Skip it

                        assert "group" in yaml_config
                        group = yaml_config["group"]
                        all_task_list = yaml_config["task"]
                        # Split the group's task list: inline dicts are full task configs,
                        # plain strings name already-registered tasks.
                        config_list = [
                            task for task in all_task_list if type(task) != str
                        ]
                        task_list = [
                            task for task in all_task_list if type(task) == str
                        ]

                        for task_config in config_list:
                            yaml_dir = os.path.dirname(benchmark_path)
                            task_config = utils.load_yaml_config(
                                yaml_config=task_config, yaml_dir=yaml_dir
                            )
                            if "use_prompt" in task_config:
                                if "yaml" in task_config["use_prompt"]:
                                    task_config["use_prompt"] = os.path.join(
                                        root, task_config["use_prompt"]
                                    )
                            var_configs = check_prompt_config(
                                {
                                    **task_config,
                                    **{"group": group},
                                }
                            )
                            for config in var_configs:
                                register_configurable_task(config)

                        # Attach the referenced, already-registered tasks to this group.
                        task_names = utils.pattern_match(task_list, ALL_TASKS)
                        for task in task_names:
                            if task in TASK_REGISTRY:
                                if group in GROUP_REGISTRY:
                                    GROUP_REGISTRY[group].append(task)
                                else:
                                    GROUP_REGISTRY[group] = [task]
                                    ALL_TASKS.add(group)
                    except Exception as error:
                        eval_logger.warning(
                            "Failed to load benchmark in\n"
                            f" {benchmark_path}\n"
                            " Benchmark will not be added to registry\n"
                            f" Error: {error}"
                        )


task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_benchmarks(task_dir)
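For orientation, here is a sketch of the kind of group file this walker consumes: a `group` name plus a `task` list whose plain strings refer to already-registered tasks and whose inline dicts are full task configs routed through `check_prompt_config`. The group and task names below are hypothetical:

```python
import yaml

# Hypothetical benchmark group file; only the "group" / "task" keys matter
# to include_benchmarks() above, everything else is illustrative.
EXAMPLE_GROUP_YAML = """
group: my_benchmark            # registered into GROUP_REGISTRY / ALL_TASKS
task:
  - babi                       # string entry: an already-registered task name
  - task: my_inline_variant    # dict entry: an inline config for check_prompt_config()
    dataset_path: Muennighoff/babi
    output_type: generate_until
"""

config = yaml.full_load(EXAMPLE_GROUP_YAML)
task_names = [t for t in config["task"] if isinstance(t, str)]
inline_configs = [t for t in config["task"] if not isinstance(t, str)]
print(config["group"], task_names, [c["task"] for c in inline_configs])
# -> my_benchmark ['babi'] ['my_inline_variant']
```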
@@ -138,7 +138,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
     def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
         raise NotImplementedError("No support for logits.")

-    def greedy_until(self, requests) -> List[str]:
+    def generate_until(self, requests) -> List[str]:
         if not requests:
             return []
@@ -164,7 +164,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
                 )
                 res.append(response)

-                self.cache_hook.add_partial("greedy_until", request, response)
+                self.cache_hook.add_partial("generate_until", request, response)
             except anthropic.APIConnectionError as e:  # type: ignore # noqa: F821
                 eval_logger.critical(f"Server unreachable: {e.__cause__}")
                 break
@@ -179,7 +179,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
         raise NotImplementedError()

     def _model_generate(self, context, max_length, eos_token_id):
-        # Isn't used because we override greedy_until
+        # Isn't used because we override generate_until
         raise NotImplementedError()

     def loglikelihood(self, requests):
...
@@ -20,7 +20,7 @@ class DummyLM(LM):
         return res

-    def greedy_until(self, requests):
+    def generate_until(self, requests):
         res = []

         for ctx, _ in requests:
...
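To show what the rename means for a custom backend, here is a minimal sketch of an `LM` subclass after this change. The import paths (`lm_eval.api.model.LM`, `lm_eval.api.registry.register_model`) and the `(context, gen_kwargs)` request shape are taken from this branch's layout and from `DummyLM` above, and should be treated as assumptions to verify:

```python
from lm_eval.api.model import LM                  # assumed path on this branch
from lm_eval.api.registry import register_model   # assumed path on this branch


@register_model("always-foo")  # hypothetical backend name
class AlwaysFooLM(LM):
    """Toy backend: every generation request is answered with the string "foo"."""

    def loglikelihood(self, requests):
        # (logprob, is_greedy) per request; the values here are placeholders.
        return [(0.0, False) for _ in requests]

    def loglikelihood_rolling(self, requests):
        return [0.0 for _ in requests]

    def generate_until(self, requests):  # formerly `greedy_until`
        res = []
        for _ctx, _gen_kwargs in requests:  # mirrors DummyLM's (context, kwargs) pairs
            res.append("foo")
        return res
```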
@@ -813,7 +813,7 @@ class HFLM(LM):
         return re_ord.get_original(res)

-    def greedy_until(self, requests):
+    def generate_until(self, requests):
         res = defaultdict(list)
         re_ords = {}
@@ -930,7 +930,7 @@ class HFLM(LM):
                 res[key].append(s)

                 self.cache_hook.add_partial(
-                    "greedy_until", (context, gen_kwargs), s
+                    "generate_until", (context, gen_kwargs), s
                 )
                 pbar.update(1)

             # reorder this group of results back to original unsorted form
...
@@ -203,7 +203,7 @@ class OpenaiCompletionsLM(LM):
             self.cache_hook.add_partial("loglikelihood", cache_key, answer)
         return re_ord.get_original(res)

-    def greedy_until(self, requests) -> List[str]:
+    def generate_until(self, requests) -> List[str]:
         if not requests:
             return []
         res = []
@@ -260,7 +260,7 @@ class OpenaiCompletionsLM(LM):
                 # partial caching
                 self.cache_hook.add_partial(
-                    "greedy_until", (context, {"until": until_}), s
+                    "generate_until", (context, {"until": until_}), s
                 )
                 res.append(s)
@@ -271,7 +271,7 @@ class OpenaiCompletionsLM(LM):
         raise NotImplementedError()

     def _model_generate(self, context, max_length, eos_token_id):
-        # Isn't used because we override greedy_until
+        # Isn't used because we override generate_until
         raise NotImplementedError()

     def loglikelihood_rolling(self, requests) -> List[float]:
...
@@ -58,7 +58,7 @@ class TextSynthLM(LM):
     @property
     def eot_token_id(self):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()

     @property
@@ -72,20 +72,20 @@ class TextSynthLM(LM):
     @property
     def batch_size(self):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()

     @property
     def device(self):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()

     def tok_encode(self, string: str):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()

     def tok_decode(self, tokens):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()

     def loglikelihood(self, requests):
@@ -122,7 +122,7 @@ class TextSynthLM(LM):
                 "input tokenization support from TextSynth."
             )

-    def greedy_until(self, requests):
+    def generate_until(self, requests):
         if not requests:
             return []
@@ -146,7 +146,7 @@ class TextSynthLM(LM):
                 s = resp["text"]
                 res.append(s)
-                self.cache_hook.add_partial("greedy_until", (inp, request_args), s)
+                self.cache_hook.add_partial("generate_until", (inp, request_args), s)
             else:
                 logger.error(
                     f"The following response does not contain generated `text`. "
@@ -160,5 +160,5 @@ class TextSynthLM(LM):
         raise NotImplementedError()

     def _model_generate(self, context, max_length, eos_token_id):
-        # Isn't used because we override greedy_until
+        # Isn't used because we override generate_until
         raise NotImplementedError()
@@ -4,7 +4,6 @@ from typing import List, Union, Dict

 from lm_eval import utils
 from lm_eval import prompts
-from lm_eval.logger import eval_logger
 from lm_eval.api.task import TaskConfig, Task, ConfigurableTask
 from lm_eval.api.registry import (
     register_task,
@@ -14,6 +13,9 @@ from lm_eval.api.registry import (
     ALL_TASKS,
 )

+import logging
+
+eval_logger = logging.getLogger('lm-eval')

 def register_configurable_task(config: Dict[str, str]) -> int:
     SubClass = type(
@@ -98,7 +100,7 @@ def check_prompt_config(
                         ]
                     )
                 },
-                **{"output_type": "greedy_until"},
+                **{"output_type": "generate_until"},
             }
         )
     else:
@@ -145,7 +147,7 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None:
                     except Exception as error:
                         import traceback

-                        eval_logger.warning(
+                        eval_logger.debug(
                             "Failed to load config in\n"
                             f" {yaml_path}\n"
                             " Config will not be added to registry\n"
...
 task: babi
 dataset_path: Muennighoff/babi
 dataset_name: null
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: valid
 test_split: test
...
 group: bbh_flan_cot_fewshot
 dataset_path: lukaemon/bbh
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_target: "{{target}}"
 metric_list:
...
 group: bbh_flan_cot_zeroshot
 dataset_path: lukaemon/bbh
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_target: "{{target}}"
 metric_list:
...
 group: bbh_flan_fewshot
 dataset_path: lukaemon/bbh
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_target: "{{target}}"
 metric_list:
...
 group: bbh_flan_zeroshot
 dataset_path: lukaemon/bbh
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_target: "{{target}}"
 metric_list:
...
@@ -13,7 +13,7 @@ Homepage: https://github.com/facebookresearch/belebele
 ```bibtex
 @misc{bandarkar2023belebele,
   title={The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants},
   author={Lucas Bandarkar and Davis Liang and Benjamin Muller and Mikel Artetxe and Satya Narayan Shukla and Donald Husa and Naman Goyal and Abhinandan Krishnan and Luke Zettlemoyer and Madian Khabsa},
   year={2023},
   eprint={2308.16884},
...
 group: flan-cot
-output_type: greedy_until
+output_type: generate_until
 validation_split: validation
 doc_to_target: "{{answer}}"
 metric_list:
...
-output_type: greedy_until
+output_type: generate_until
 validation_split: validation
 metric_list:
   - metric: exact_match
...
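Putting it together: once a task YAML declares `generate_until`, the caller-facing API is unchanged. Below is a sketch of a programmatic smoke test over the `babi` task shown above; the `simple_evaluate` argument names and the `"hf"` model alias reflect this branch as best understood and should be verified against your checkout:

```python
from lm_eval import evaluator

# Hypothetical smoke test: small model, tiny slice of the dataset.
results = evaluator.simple_evaluate(
    model="hf",                                      # HFLM backend alias (assumed)
    model_args="pretrained=EleutherAI/pythia-160m",  # any small causal LM
    tasks=["babi"],
    num_fewshot=0,
    limit=8,                                         # only evaluate 8 documents
)
print(results["results"]["babi"])
```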