Commit f77a3a27 authored by lintangsutawika

Merge branch 'big-refactor' of...

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into mmlu_subgroups
parents 109ed1c7 f8342178
......@@ -43,7 +43,7 @@ jobs:
# # mypy turned off for now
# - name: Lint with mypy
# run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
Job 2:
# Job 2
testcpu:
name: CPU Tests
runs-on: ubuntu-latest
......
......@@ -23,8 +23,12 @@ Features:
- Many tasks implemented, 200+ tasks [implemented in the old framework](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md) which require porting to the new setup as described in [the new task guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md).
- Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
- Support for commercial APIs including [OpenAI](https://openai.com), [goose.ai](https://goose.ai), and [TextSynth](https://textsynth.com/).
- Support for evaluation on adapters (e.g. LoRa) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft).
- Evaluating with publicly available prompts ensures reproducibility and comparability between papers.
- Support for evaluation on adapters (e.g. LoRA) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft).
- Support for local models and benchmarks.
- Evaluation with publicly available prompts ensures reproducibility and comparability between papers.
The Language Model Evaluation Harness is the backend for 🤗 Hugging Face's popular [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and is used internally by dozens of companies including NVIDIA, Cohere, Booz Allen Hamilton, and Mosaic ML.
## Install
......
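Stepping back from the diff for a moment: given the feature list in the README hunk above, a minimal programmatic run of the harness might look like the sketch below. The `simple_evaluate` entry point and its parameter names are assumptions about the refactored `lm_eval.evaluator` API, and the model and task names are placeholders; none of this is confirmed by the diff itself.

```python
# Minimal sketch, assuming the big-refactor `lm_eval.evaluator.simple_evaluate`
# entry point; parameter, model, and task names are illustrative assumptions.
from lm_eval import evaluator

results = evaluator.simple_evaluate(
    model="hf",                                      # transformers-backed model type
    model_args="pretrained=EleutherAI/pythia-160m",  # hypothetical checkpoint
    tasks=["lambada_openai"],                        # hypothetical task name
    num_fewshot=0,
    limit=10,                                        # small cap while smoke-testing
)
print(results["results"])                            # per-task metrics (assumed key)
```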
......@@ -97,6 +97,12 @@ def parse_eval_args() -> argparse.Namespace:
default=None,
help="Additional path to include if there are external tasks to include.",
)
parser.add_argument(
"--verbosity",
type=str,
default="INFO",
help="Log error when tasks are not registered.",
)
return parser.parse_args()
......@@ -105,6 +111,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
# we allow for args to be passed externally, else we parse them ourselves
args = parse_eval_args()
eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
os.environ["TOKENIZERS_PARALLELISM"] = "false"
if args.limit:
......
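For context on the `--verbosity` flag added above: `getattr(logging, args.verbosity)` simply resolves a level name such as `"DEBUG"` to the standard-library integer constant, so only names defined by the `logging` module are valid values. A standalone sketch, independent of the harness code:

```python
# Standalone sketch (not part of the diff) of mapping a string verbosity flag
# onto standard-library logging levels via getattr().
import logging

logging.basicConfig()                        # attach a default stream handler
eval_logger = logging.getLogger("lm-eval")   # same logger name used in this diff

verbosity = "DEBUG"                          # e.g. the value passed to --verbosity
eval_logger.setLevel(getattr(logging, verbosity))  # logging.DEBUG == 10

eval_logger.debug("emitted because the level was lowered to DEBUG")
eval_logger.info("INFO, the default, would also be emitted")
```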
......@@ -44,7 +44,7 @@ ALL_OUTPUT_TYPES = [
"loglikelihood",
"multiple_choice",
"loglikelihood_rolling",
"greedy_until",
"generate_until",
]
......@@ -80,7 +80,7 @@ class TaskConfig(dict):
num_fewshot: int = 0
# scoring options
metric_list: list = None
output_type: str = "greedy_until"
output_type: str = "generate_until"
generation_kwargs: dict = None
repeats: int = 1
filter_list: Union[str, list] = None
......@@ -97,11 +97,11 @@ class TaskConfig(dict):
self.dataset_path = inspect.getfile(import_module(self.dataset_path))
if self.generation_kwargs is not None:
if self.output_type != "greedy_until":
if self.output_type != "generate_until":
eval_logger.warning(
f"[{self.task}] passed `generation_kwargs`, but not using `output_type: greedy_until`!"
f"[{self.task}] passed `generation_kwargs`, but not using `output_type: generate_until`!"
)
assert self.output_type != "greedy_until"
assert self.output_type != "generate_until"
if "temperature" in self.generation_kwargs:
self.generation_kwargs["temperature"] = float(
......@@ -111,7 +111,7 @@ class TaskConfig(dict):
if "until" not in self.generation_kwargs:
self.generation_kwargs["until"] = [self.fewshot_delimiter]
else:
if self.output_type == "greedy_until":
if self.output_type == "generate_until":
# ensure that we greedily generate in absence of explicit arguments otherwise
self.generation_kwargs = {
"until": None
......@@ -958,7 +958,7 @@ class ConfigurableTask(Task):
)
return request_list
elif self.OUTPUT_TYPE == "greedy_until":
elif self.OUTPUT_TYPE == "generate_until":
arguments = (ctx, self.config.generation_kwargs)
return Instance(
......@@ -1070,7 +1070,7 @@ class ConfigurableTask(Task):
acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0
result_dict["acc_mutual_info"] = acc_mutual_info
elif self.OUTPUT_TYPE == "greedy_until":
elif self.OUTPUT_TYPE == "generate_until":
gold = self.doc_to_target(doc)
result = results[0]
if self.config.doc_to_choice is not None:
......@@ -1134,7 +1134,7 @@ class ConfigurableTask(Task):
else:
raise ValueError(
f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
"'loglikelihood', 'loglikelihood_rolling', 'greedy_until' or 'multiple_choice'",
"'loglikelihood', 'loglikelihood_rolling', 'generate_until' or 'multiple_choice'",
)
return result_dict
......
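In the `process_results` hunk above, the `generate_until` branch scores the single generated string in `results[0]` against the gold target from `doc_to_target`. For an exact-match style metric this reduces to a string comparison; a toy sketch (the filters and aggregation the harness also applies are omitted):

```python
# Toy sketch of exact-match scoring for a generate_until result; the real
# harness routes this through its metric and filter registries.
def exact_match(gold: str, generation: str) -> float:
    return 1.0 if generation.strip() == gold.strip() else 0.0

print(exact_match("Paris", "Paris "))   # 1.0
print(exact_match("Paris", "London"))   # 0.0
```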
import os
import yaml
from lm_eval import utils
from lm_eval.tasks import register_configurable_task, check_prompt_config
from lm_eval.logger import eval_logger
from lm_eval.api.registry import (
TASK_REGISTRY,
GROUP_REGISTRY,
ALL_TASKS,
)
def include_benchmarks(task_dir: str) -> None:
for root, subdirs, file_list in os.walk(task_dir):
if (subdirs == [] or "__pycache__" in subdirs) and (len(file_list) > 0):
for f in file_list:
if f.endswith(".yaml"):
try:
benchmark_path = os.path.join(root, f)
with open(benchmark_path, "rb") as file:
yaml_config = yaml.full_load(file)
if "prompts" in yaml_config:
continue # Skip it
assert "group" in yaml_config
group = yaml_config["group"]
all_task_list = yaml_config["task"]
config_list = [
task for task in all_task_list if type(task) != str
]
task_list = [
task for task in all_task_list if type(task) == str
]
for task_config in config_list:
yaml_dir = os.path.dirname(benchmark_path)
task_config = utils.load_yaml_config(
yaml_config=task_config, yaml_dir=yaml_dir
)
if "use_prompt" in task_config:
if "yaml" in task_config["use_prompt"]:
task_config["use_prompt"] = os.path.join(
root, task_config["use_prompt"]
)
var_configs = check_prompt_config(
{
**task_config,
**{"group": group},
}
)
for config in var_configs:
register_configurable_task(config)
task_names = utils.pattern_match(task_list, ALL_TASKS)
for task in task_names:
if task in TASK_REGISTRY:
if group in GROUP_REGISTRY:
GROUP_REGISTRY[group].append(task)
else:
GROUP_REGISTRY[group] = [task]
ALL_TASKS.add(group)
except Exception as error:
eval_logger.warning(
"Failed to load benchmark in\n"
f" {benchmark_path}\n"
" Benchmark will not be added to registry\n"
f" Error: {error}"
)
task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_benchmarks(task_dir)
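To summarize what `include_benchmarks` expects: each benchmark YAML must contain a `group` name plus a `task` list that mixes already-registered task names (strings) with inline task configs (mappings), and files carrying a `prompts` key are skipped. A sketch of the parsed structure as `yaml.full_load` would return it; the group, task, and option names are hypothetical:

```python
# Hypothetical parsed benchmark YAML; names are made up for illustration only.
parsed_benchmark = {
    "group": "example_benchmark",   # added to GROUP_REGISTRY and ALL_TASKS
    "task": [
        "example_task_a",           # plain string: pattern-matched against ALL_TASKS
        {                           # mapping: expanded via utils.load_yaml_config
            "task": "example_task_b_variant",
            "num_fewshot": 5,       # hypothetical per-task override
        },
    ],
}
```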
......@@ -138,7 +138,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.")
def greedy_until(self, requests) -> List[str]:
def generate_until(self, requests) -> List[str]:
if not requests:
return []
......@@ -164,7 +164,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
)
res.append(response)
self.cache_hook.add_partial("greedy_until", request, response)
self.cache_hook.add_partial("generate_until", request, response)
except anthropic.APIConnectionError as e: # type: ignore # noqa: F821
eval_logger.critical(f"Server unreachable: {e.__cause__}")
break
......@@ -179,7 +179,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
raise NotImplementedError()
def _model_generate(self, context, max_length, eos_token_id):
# Isn't used because we override greedy_until
# Isn't used because we override generate_until
raise NotImplementedError()
def loglikelihood(self, requests):
......
......@@ -20,7 +20,7 @@ class DummyLM(LM):
return res
def greedy_until(self, requests):
def generate_until(self, requests):
res = []
for ctx, _ in requests:
......
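As the dummy backend above illustrates, after the rename every model class exposes `generate_until`, consuming (context, generation-kwargs) pairs and returning one generated string per request. A minimal custom-backend sketch under that assumption; `EchoLM` and its trivial decoding are made up for illustration and do not subclass the harness's `LM` base class:

```python
# Illustrative sketch of the renamed generation entry point; EchoLM is not
# part of this commit, and a real backend would subclass lm_eval's LM class.
from typing import List


class EchoLM:
    def generate_until(self, requests) -> List[str]:
        res = []
        for context, gen_kwargs in requests:
            # gen_kwargs may carry stop strings under "until", max tokens, etc.
            until = (gen_kwargs or {}).get("until") or []
            generation = context[-20:]  # stand-in for actual model decoding
            for stop in until:
                generation = generation.split(stop)[0]
            res.append(generation)
        return res
```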
......@@ -813,7 +813,7 @@ class HFLM(LM):
return re_ord.get_original(res)
def greedy_until(self, requests):
def generate_until(self, requests):
res = defaultdict(list)
re_ords = {}
......@@ -930,7 +930,7 @@ class HFLM(LM):
res[key].append(s)
self.cache_hook.add_partial(
"greedy_until", (context, gen_kwargs), s
"generate_until", (context, gen_kwargs), s
)
pbar.update(1)
# reorder this group of results back to original unsorted form
......
......@@ -203,7 +203,7 @@ class OpenaiCompletionsLM(LM):
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return re_ord.get_original(res)
def greedy_until(self, requests) -> List[str]:
def generate_until(self, requests) -> List[str]:
if not requests:
return []
res = []
......@@ -260,7 +260,7 @@ class OpenaiCompletionsLM(LM):
# partial caching
self.cache_hook.add_partial(
"greedy_until", (context, {"until": until_}), s
"generate_until", (context, {"until": until_}), s
)
res.append(s)
......@@ -271,7 +271,7 @@ class OpenaiCompletionsLM(LM):
raise NotImplementedError()
def _model_generate(self, context, max_length, eos_token_id):
# Isn't used because we override greedy_until
# Isn't used because we override generate_until
raise NotImplementedError()
def loglikelihood_rolling(self, requests) -> List[float]:
......
......@@ -58,7 +58,7 @@ class TextSynthLM(LM):
@property
def eot_token_id(self):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
# Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
raise NotImplementedError()
@property
......@@ -72,20 +72,20 @@ class TextSynthLM(LM):
@property
def batch_size(self):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
# Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
raise NotImplementedError()
@property
def device(self):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
# Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
raise NotImplementedError()
def tok_encode(self, string: str):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
# Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
raise NotImplementedError()
def tok_decode(self, tokens):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
# Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
raise NotImplementedError()
def loglikelihood(self, requests):
......@@ -122,7 +122,7 @@ class TextSynthLM(LM):
"input tokenization support from TextSynth."
)
def greedy_until(self, requests):
def generate_until(self, requests):
if not requests:
return []
......@@ -146,7 +146,7 @@ class TextSynthLM(LM):
s = resp["text"]
res.append(s)
self.cache_hook.add_partial("greedy_until", (inp, request_args), s)
self.cache_hook.add_partial("generate_until", (inp, request_args), s)
else:
logger.error(
f"The following response does not contain generated `text`. "
......@@ -160,5 +160,5 @@ class TextSynthLM(LM):
raise NotImplementedError()
def _model_generate(self, context, max_length, eos_token_id):
# Isn't used because we override greedy_until
# Isn't used because we override generate_until
raise NotImplementedError()
......@@ -4,7 +4,6 @@ from typing import List, Union, Dict
from lm_eval import utils
from lm_eval import prompts
from lm_eval.logger import eval_logger
from lm_eval.api.task import TaskConfig, Task, ConfigurableTask
from lm_eval.api.registry import (
register_task,
......@@ -14,6 +13,9 @@ from lm_eval.api.registry import (
ALL_TASKS,
)
import logging
eval_logger = logging.getLogger('lm-eval')
def register_configurable_task(config: Dict[str, str]) -> int:
SubClass = type(
......@@ -98,7 +100,7 @@ def check_prompt_config(
]
)
},
**{"output_type": "greedy_until"},
**{"output_type": "generate_until"},
}
)
else:
......@@ -145,7 +147,7 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None:
except Exception as error:
import traceback
eval_logger.warning(
eval_logger.debug(
"Failed to load config in\n"
f" {yaml_path}\n"
" Config will not be added to registry\n"
......
task: babi
dataset_path: Muennighoff/babi
dataset_name: null
output_type: greedy_until
output_type: generate_until
training_split: train
validation_split: valid
test_split: test
......
group: bbh_flan_cot_fewshot
dataset_path: lukaemon/bbh
output_type: greedy_until
output_type: generate_until
test_split: test
doc_to_target: "{{target}}"
metric_list:
......
group: bbh_flan_cot_zeroshot
dataset_path: lukaemon/bbh
output_type: greedy_until
output_type: generate_until
test_split: test
doc_to_target: "{{target}}"
metric_list:
......
group: bbh_flan_fewshot
dataset_path: lukaemon/bbh
output_type: greedy_until
output_type: generate_until
test_split: test
doc_to_target: "{{target}}"
metric_list:
......
group: bbh_flan_zeroshot
dataset_path: lukaemon/bbh
output_type: greedy_until
output_type: generate_until
test_split: test
doc_to_target: "{{target}}"
metric_list:
......
......@@ -13,7 +13,7 @@ Homepage: https://github.com/facebookresearch/belebele
```bibtex
@misc{bandarkar2023belebele,
title={The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants},
title={The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants},
author={Lucas Bandarkar and Davis Liang and Benjamin Muller and Mikel Artetxe and Satya Narayan Shukla and Donald Husa and Naman Goyal and Abhinandan Krishnan and Luke Zettlemoyer and Madian Khabsa},
year={2023},
eprint={2308.16884},
......
group: flan-cot
output_type: greedy_until
output_type: generate_until
validation_split: validation
doc_to_target: "{{answer}}"
metric_list:
......
output_type: greedy_until
output_type: generate_until
validation_split: validation
metric_list:
- metric: exact_match
......