Commit 02e841ce authored by lintangsutawika

Merge branch 'main' of https://github.com/EleutherAI/lm-evaluation-harness into t5v2-alt-plus

parents 90ad5db7 e74ec966
......@@ -105,7 +105,7 @@ class OpenaiCompletionsLM(TemplateLM):
except ModuleNotFoundError:
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
please install these via `pip install lm-eval[openai]` or `pip install -e .\"[openai]\"`",
)
self.model = model
self.base_url = base_url
......@@ -231,7 +231,7 @@ class OpenaiCompletionsLM(TemplateLM):
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return re_ord.get_original(res)
def generate_until(self, requests) -> List[str]:
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
if not requests:
return []
res = []
......@@ -258,7 +258,8 @@ class OpenaiCompletionsLM(TemplateLM):
# todo: more intelligent batching for heterogeneous `until`
for chunk, request_args in tqdm(
list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size))
list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size)),
disable=disable_tqdm,
):
inps = []
self._max_gen_toks = request_args.get("max_gen_toks", self.max_gen_toks)
......@@ -308,10 +309,12 @@ class OpenaiCompletionsLM(TemplateLM):
# Isn't used because we override generate_until
raise NotImplementedError()
def loglikelihood_rolling(self, requests) -> List[float]:
def loglikelihood_rolling(
self, requests, disable_tqdm: bool = False
) -> List[float]:
loglikelihoods = []
for (string,) in tqdm([req.args for req in requests]):
for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
rolling_token_windows = list(
map(
utils.make_disjoint_window,
......@@ -398,7 +401,7 @@ class OpenaiChatCompletionsLM(LM):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def generate_until(self, requests) -> List[str]:
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
res = defaultdict(list)
re_ords = {}
......@@ -412,7 +415,7 @@ class OpenaiChatCompletionsLM(LM):
[req.args for req in reqs], lambda x: (-len(x[0]), x[0])
)
pbar = tqdm(total=len(requests), disable=(self.rank != 0))
pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)))
for key, re_ord in re_ords.items():
# n needs to be 1 because messages in
# chat completion are not batch but
......@@ -471,8 +474,8 @@ class OpenaiChatCompletionsLM(LM):
return grouper.get_original(res)
def loglikelihood(self, requests):
def loglikelihood(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.")
def loglikelihood_rolling(self, requests):
def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.")
......@@ -95,9 +95,9 @@ class TextSynthLM(LM):
# Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
raise NotImplementedError()
def loglikelihood(self, requests):
def loglikelihood(self, requests, disable_tqdm: bool = False):
res = []
for context, continuation in tqdm(requests):
for context, continuation in tqdm(requests, disable=disable_tqdm):
response = textsynth_completion(
url=self.api_url + "/v1/engines/" + self.engine + "/logprob",
headers={"Authorization": "Bearer " + self.api_key},
......@@ -119,7 +119,7 @@ class TextSynthLM(LM):
assert False
return res
def loglikelihood_rolling(self, requests):
def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
# TODO: The TextSynth API does not support tokenized inputs so we cannot
# manually partition long contexts into smaller rolling windows as
# done for other models derived from `BaseLM`. Override this method
......@@ -129,12 +129,12 @@ class TextSynthLM(LM):
"input tokenization support from TextSynth."
)
def generate_until(self, requests):
def generate_until(self, requests, disable_tqdm: bool = False):
if not requests:
return []
res = []
for request in tqdm(requests):
for request in tqdm(requests, disable=disable_tqdm):
inp = request[0]
request_args = request[1]
until = request_args["until"]
......
import collections
import fnmatch
import gc
import itertools
import time
from functools import wraps
from typing import (
......@@ -262,55 +263,44 @@ def stop_sequences_criteria(
)
def divide(iterable, n) -> List[Iterator]:
"""Divide the elements from *iterable* into *n* parts, maintaining
order.
def undistribute(iterable):
"""
Undoes https://more-itertools.readthedocs.io/en/stable/api.html#more_itertools.distribute .
>>> group_1, group_2 = divide([1, 2, 3, 4, 5, 6], 2)
Re-interleaves results that have been split using more_itertools.distribute:
>>> group_1, group_2 = distribute(2, [1, 2, 3, 4, 5, 6])
>>> list(group_1)
[1, 2, 3]
[1, 3, 5]
>>> list(group_2)
[4, 5, 6]
[2, 4, 6]
>>> undistribute([group_1, group_2])
[1, 2, 3, 4, 5, 6]
If the length of *iterable* is not evenly divisible by *n*, then the
length of the returned iterables will not be identical:
Handles non-uniform component lengths:
>>> children = divide([1, 2, 3, 4, 5, 6, 7], 3)
>>> children = distribute(3, [1, 2, 3, 4, 5, 6, 7])
>>> [list(c) for c in children]
[[1, 2, 3], [4, 5], [6, 7]]
[[1, 4, 7], [2, 5], [3, 6]]
>>> undistribute(children)
[1, 2, 3, 4, 5, 6, 7]
If the length of the iterable is smaller than n, then the last returned
iterables will be empty:
Also handles when some iterables are empty:
>>> children = divide([1, 2, 3], 5)
>>> children = distribute(5, [1, 2, 3])
>>> [list(c) for c in children]
[[1], [2], [3], [], []]
This function will exhaust the iterable before returning and may require
significant storage. If order is not important, see :func:`distribute`,
which does not first pull the iterable into memory.
>>> undistribute(children)
[1, 2, 3]
"""
if n < 1:
raise ValueError("n must be at least 1")
try:
iterable[:0]
except TypeError:
seq = tuple(iterable)
else:
seq = iterable
q, r = divmod(len(seq), n)
ret = []
stop = 0
for i in range(1, n + 1):
start = stop
stop += q + 1 if i <= r else q
ret.append(iter(seq[start:stop]))
return ret
return [
x
for x in itertools.chain.from_iterable(
itertools.zip_longest(*[list(x) for x in iterable])
)
if x is not None
]
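For a quick illustration of the round-trip described in the docstring above, here is a minimal, self-contained sketch; it re-declares `undistribute` so the snippet runs on its own, and `more_itertools` is already a dependency of this code path:

```python
import itertools
from more_itertools import distribute

def undistribute(iterable):
    # Re-interleave the sub-iterables produced by distribute(). zip_longest
    # pads exhausted (shorter) groups with None, which is then dropped;
    # this assumes the payload itself never contains None.
    return [
        x
        for x in itertools.chain.from_iterable(
            itertools.zip_longest(*[list(x) for x in iterable])
        )
        if x is not None
    ]

groups = distribute(3, [1, 2, 3, 4, 5, 6, 7])  # -> [1, 4, 7], [2, 5], [3, 6]
assert undistribute(groups) == [1, 2, 3, 4, 5, 6, 7]
```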
def retry_on_specific_exceptions(
......
import copy
from importlib.metadata import version
from importlib.util import find_spec
from typing import List, Literal, Optional, Tuple, Union
from more_itertools import distribute
from packaging.version import parse as parse_version
from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator, divide
from lm_eval.models.utils import Collator, undistribute
from lm_eval.utils import (
eval_logger,
get_rolling_token_windows,
......@@ -17,7 +20,6 @@ from lm_eval.utils import (
try:
import ray
from ray.util.multiprocessing import Pool
from vllm import LLM, SamplingParams
from vllm.transformers_utils.tokenizer import get_tokenizer
except ModuleNotFoundError:
......@@ -26,14 +28,6 @@ except ModuleNotFoundError:
eval_logger = eval_logger
# adapted from https://github.com/vllm-project/vllm/issues/367#issuecomment-1788341727
def run_inference_one_model(
model_args: dict, sampling_params, requests: List[List[int]]
):
llm = LLM(**model_args)
return llm.generate(prompt_token_ids=requests, sampling_params=sampling_params)
@register_model("vllm")
class VLLM(TemplateLM):
_DEFAULT_MAX_LENGTH = 2048
......@@ -60,6 +54,7 @@ class VLLM(TemplateLM):
gpu_memory_utilization: float = 0.9,
device: str = "cuda",
data_parallel_size: int = 1,
**kwargs,
):
super().__init__()
......@@ -92,6 +87,7 @@ class VLLM(TemplateLM):
"quantization": quantization,
"seed": int(seed),
}
self.model_args.update(kwargs)
self.batch_size = (
"auto"
if isinstance(batch_size, str) and "auto" in batch_size
......@@ -100,6 +96,12 @@ class VLLM(TemplateLM):
if self.data_parallel_size <= 1:
self.model = LLM(**self.model_args)
else:
assert parse_version(version("vllm")) < parse_version(
"0.3.3"
), "data_parallel is only compatible with vllm < v0.3.3."
eval_logger.warning(
"You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached."
)
self.model_args["worker_use_ray"] = True
self.batch_size = "auto"
eval_logger.info("Manual batching is not compatible with data parallelism.")
......@@ -181,15 +183,30 @@ class VLLM(TemplateLM):
temperature=0, prompt_logprobs=1, max_tokens=1
)
if self.data_parallel_size > 1:
requests = [list(x) for x in divide(requests, self.data_parallel_size)]
inputs = [(self.model_args, sampling_params, req) for req in requests]
# vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
# also seems to only work with decorator and not with ray.remote() fn
# see https://github.com/vllm-project/vllm/issues/973
# note: this has changed on 0.3.3, and it only works now if num_gpus are set.
# but then tensor_parallel breaks
@ray.remote
def run_inference_one_model(
model_args: dict, sampling_params, requests: List[List[int]]
):
llm = LLM(**model_args)
return llm.generate(
prompt_token_ids=requests, sampling_params=sampling_params
)
with Pool(self.data_parallel_size) as pool:
results = pool.starmap(run_inference_one_model, inputs)
# dispatch requests to all self.data_parallel_size workers, in interleaved fashion
# interleaved important to balance context lengths across workers
requests = [list(x) for x in distribute(self.data_parallel_size, requests)]
inputs = ((self.model_args, sampling_params, req) for req in requests)
object_refs = [run_inference_one_model.remote(*x) for x in inputs]
results = ray.get(object_refs)
# Invoke ray.shutdown() to prevent hang-ups if subsequent calls required.
ray.shutdown()
# flatten results
return [item for sublist in results for item in sublist]
return undistribute(results)
outputs = self.model.generate(
prompt_token_ids=requests,
......@@ -198,10 +215,12 @@ class VLLM(TemplateLM):
)
return outputs
def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
def loglikelihood_rolling(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[float]:
loglikelihoods = []
for (string,) in tqdm([req.args for req in requests]):
for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
rolling_token_windows = list(
map(
make_disjoint_window,
......@@ -227,7 +246,9 @@ class VLLM(TemplateLM):
loglikelihoods.append(string_nll)
return loglikelihoods
def generate_until(self, requests: List[Instance]) -> List[str]:
def generate_until(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[str]:
res = []
# batch tokenize contexts
......@@ -256,7 +277,7 @@ class VLLM(TemplateLM):
pbar = tqdm(
total=len(requests),
disable=(self.rank != 0),
disable=(disable_tqdm or (self.rank != 0)),
desc="Running generate_until requests",
)
# for each different set of kwargs, we execute all requests, by batch.
......@@ -282,8 +303,12 @@ class VLLM(TemplateLM):
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {gen_kwargs}"
)
# add EOS token to stop sequences
eos = self.tokenizer.decode(self.eot_token_id)
if not until:
until = [self.tokenizer.decode(self.eot_token_id)]
until = [eos]
else:
until.append(eos)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
......@@ -390,6 +415,26 @@ class VLLM(TemplateLM):
# The first entry of prompt_logprobs is None because the model has no previous tokens to condition on.
continuation_logprobs_dicts = outputs.prompt_logprobs
def coerce_logprob_to_num(logprob):
# vLLM changed the return type of logprobs from float
# to a Logprob object storing the float value + extra data
# (https://github.com/vllm-project/vllm/pull/3065).
# If we are dealing with vllm's Logprob object, return
# the logprob value stored as an attribute. Otherwise,
# return the object itself (which should be a float
# for older versions of vLLM).
return getattr(logprob, "logprob", logprob)
continuation_logprobs_dicts = [
{
token: coerce_logprob_to_num(logprob)
for token, logprob in logprob_dict.items()
}
if logprob_dict is not None
else None
for logprob_dict in continuation_logprobs_dicts
]
# Calculate continuation_logprobs
# assume ctxlen always >= 1
continuation_logprobs = sum(
......
import abc
import collections
import logging
import os
from functools import partial
from typing import Dict, List, Union
from typing import Dict, List, Mapping, Optional, Union
from lm_eval import utils
from lm_eval.api.task import ConfigurableTask, Task
......@@ -15,7 +14,7 @@ class TaskManager:
"""
def __init__(self, verbosity="INFO", include_path=None) -> None:
def __init__(self, verbosity="INFO", include_path: Optional[str] = None) -> None:
self.verbosity = verbosity
self.include_path = include_path
self.logger = utils.eval_logger
......@@ -26,8 +25,8 @@ class TaskManager:
self.task_group_map = collections.defaultdict(list)
def initialize_tasks(self, include_path: str = None):
"""Creates an dictionary of tasks index.
def initialize_tasks(self, include_path: Optional[str] = None):
"""Creates a dictionary of tasks index.
:param include_path: str = None
An additional path to be searched for tasks
......@@ -59,7 +58,7 @@ class TaskManager:
def match_tasks(self, task_list):
return utils.pattern_match(task_list, self.all_tasks)
def _name_is_registered(self, name):
def _name_is_registered(self, name) -> bool:
if name in self.all_tasks:
return True
return False
......@@ -69,7 +68,7 @@ class TaskManager:
return True
return False
def _name_is_group(self, name):
def _name_is_group(self, name) -> bool:
if self._name_is_registered(name) and (
self.task_index[name]["type"] == "group"
):
......@@ -83,27 +82,29 @@ class TaskManager:
return True
return False
def _config_is_task(self, config):
def _config_is_task(self, config) -> bool:
if ("task" in config) and isinstance(config["task"], str):
return True
return False
def _config_is_group(self, config):
def _config_is_group(self, config) -> bool:
if ("task" in config) and isinstance(config["task"], list):
return True
return False
def _config_is_python_task(self, config):
def _config_is_python_task(self, config) -> bool:
if "class" in config:
return True
return False
def _get_yaml_path(self, name):
assert name in self.task_index
if name not in self.task_index:
raise ValueError
return self.task_index[name]["yaml_path"]
def _get_config(self, name):
assert name in self.task_index
if name not in self.task_index:
raise ValueError
yaml_path = self._get_yaml_path(name)
if yaml_path == -1:
return {}
......@@ -111,7 +112,8 @@ class TaskManager:
return utils.load_yaml_config(yaml_path, mode="full")
def _get_tasklist(self, name):
assert self._name_is_task(name) is False
if self._name_is_task(name):
raise ValueError
return self.task_index[name]["task"]
def _process_alias(self, config, group=None):
......@@ -125,14 +127,15 @@ class TaskManager:
def _load_individual_task_or_group(
self,
name_or_config: Union[str, dict] = None,
parent_name: str = None,
update_config: dict = None,
yaml_path: str = None,
) -> ConfigurableTask:
name_or_config: Optional[Union[str, dict]] = None,
parent_name: Optional[str] = None,
update_config: Optional[dict] = None,
yaml_path: Optional[str] = None,
) -> Mapping:
def load_task(config, task, group=None, yaml_path=None):
if "include" in config:
assert yaml_path is not None
if yaml_path is None:
raise ValueError
config.update(
utils.load_yaml_config(
yaml_path,
......@@ -166,7 +169,7 @@ class TaskManager:
# This checks if we're at the root.
if parent_name is None:
group_config = self._get_config(name_or_config)
if set(group_config.keys()) > set(["task", "group"]):
if set(group_config.keys()) > {"task", "group"}:
update_config = {
k: v
for k, v in group_config.items()
......@@ -228,7 +231,7 @@ class TaskManager:
else:
group_name = name_or_config["group"]
subtask_list = name_or_config["task"]
if set(name_or_config.keys()) > set(["task", "group"]):
if set(name_or_config.keys()) > {"task", "group"}:
update_config = {
k: v
for k, v in name_or_config.items()
......@@ -251,7 +254,7 @@ class TaskManager:
}
return all_subtasks
def load_task_or_group(self, task_list: Union[str, list] = None) -> dict:
def load_task_or_group(self, task_list: Optional[Union[str, list]] = None) -> dict:
"""Loads a dictionary of task objects from a list
:param task_list: Union[str, list] = None
......@@ -272,7 +275,7 @@ class TaskManager:
return self._load_individual_task_or_group(config)
def _get_task_and_group(self, task_dir: str):
"""Creates an dictionary of tasks index with the following metadata,
"""Creates a dictionary of tasks index with the following metadata,
- `type`, that can be either `task`, `python_task`, or `group`.
`task` refers to regular task configs, `python_task` are special
yaml files that consist only of `task` and `class` parameters.
......@@ -358,7 +361,8 @@ def include_path(task_dir):
logger.setLevel(getattr(logging, "INFO"))
logger.info(
"To still use tasks loaded from args.include_path,"
"see an example of the new TaskManager API in https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
"see an example of the new TaskManager API in "
"https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
)
return 0
......@@ -397,7 +401,8 @@ def get_task_name_from_object(task_object):
def get_task_dict(
task_name_list: List[Union[str, Dict, Task]], task_manager: TaskManager = None
task_name_list: List[Union[str, Dict, Task]],
task_manager: Optional[TaskManager] = None,
):
"""Creates a dictionary of task objects from either a name of task, config, or prepared Task object.
......@@ -442,9 +447,10 @@ def get_task_dict(
get_task_name_from_object(task_element): task_element,
}
assert set(task_name_from_string_dict.keys()).isdisjoint(
if not set(task_name_from_string_dict.keys()).isdisjoint(
set(task_name_from_object_dict.keys())
)
):
raise ValueError
return {
**task_name_from_string_dict,
......
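As a point of reference for the `TaskManager`/`get_task_dict` changes above, here is a minimal usage sketch based on the signatures in this diff (the task names are examples from the configs added in this merge):

```python
from lm_eval.tasks import TaskManager, get_task_dict

# Index all available task and group YAMLs; include_path may point to a
# directory of additional task configs (None keeps only the built-in tasks).
task_manager = TaskManager(verbosity="INFO", include_path=None)

# Resolve task/group names into task objects. Per the diff above, a ValueError
# is raised if a name is supplied both as a string and as a prepared Task.
task_dict = get_task_dict(["aexams", "agieval_lsat_ar"], task_manager)
```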
# Arabic EXAMS
### Paper
EXAMS is a resource specializing in multilingual high school exam questions.
Original paper: [EXAMS](https://aclanthology.org/2020.emnlp-main.438/)
The Arabic EXAMS dataset includes five subjects
- Islamic studies
- Biology
- Physics
- Science
- Social
Original dataset: [EXAMS-QA](https://github.com/mhardalov/exams-qa)
EXAMS is a benchmark dataset for cross-lingual and multilingual question answering for high school examinations. It contains 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others. EXAMS offers a unique fine-grained evaluation framework across multiple languages and subjects.
Homepage for Arabic EXAMS: [EXAMS Arabic Homepage](https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/EXAMS_Arabic)
### Citation
### Groups and Tasks
#### Groups
- `aexams`: Arabic EXAMS group, covering Islamic Studies, Biology, Science, Physics, and Social.
#### Tasks
The following tasks evaluate subjects in Arabic EXAMS dataset using loglikelihood-based multiple-choice scoring:
- `aexams_IslamicStudies`
- `aexams_Biology`
- `aexams_Science`
- `aexams_Physics`
- `aexams_Social`
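A minimal example of running these tasks through the Python API (the model and settings below are placeholders, not a recommended setup):

```python
import lm_eval

# Evaluate the whole Arabic EXAMS group; individual task names such as
# "aexams_Biology" can be passed instead.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2",  # placeholder; substitute any HF causal LM
    tasks=["aexams"],
    num_fewshot=5,
)
print(results["results"])
```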
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation?
* [x] Yes, original implementation contributed by author of the benchmark
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
group: aexams
dataset_path: Hennara/aexams
test_split: test
fewshot_split: dev
fewshot_config:
sampler: first_n
output_type: multiple_choice
doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nالجواب:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 0.0
"dataset_name": "Biology"
"description": "قم بالإجابة على مايلي في مجال العلوم الحيوية\n\n"
"include": "_default_template_yaml"
"task": "aexams_Biology"
"dataset_name": "IslamicStudies"
"description": "قم بالإجابة على مايلي في مجال العلوم الإسلامية \n\n"
"include": "_default_template_yaml"
"task": "aexams_IslamicStudies"
"dataset_name": "Physics"
"description": "قم بالإجابة على مايلي في مجال الفيزياء \n\n"
"include": "_default_template_yaml"
"task": "aexams_Physics"
"dataset_name": "Science"
"description": "قم بالإجابة على مايلي في مجال العلوم \n\n"
"include": "_default_template_yaml"
"task": "aexams_Science"
"dataset_name": "Social"
"description": "قم بالإجابة على مايلي في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "aexams_Social"
# AGIEval
### Paper
Title: AGIEval: A Human-Centric Benchmark for Evaluating Foundation Models
Abstract: https://arxiv.org/abs/2304.06364
AGIEval is a human-centric benchmark specifically designed to evaluate the general abilities of foundation models in tasks pertinent to human cognition and problem-solving.
This benchmark is derived from 20 official, public, and high-standard admission and qualification exams intended for general human test-takers, such as general college admission tests (e.g., Chinese College Entrance Exam (Gaokao) and American SAT), law school admission tests, math competitions, lawyer qualification tests, and national civil service exams.
Homepage: https://github.com/ruixiangcui/AGIEval
### Citation
```
@misc{zhong2023agieval,
title={AGIEval: A Human-Centric Benchmark for Evaluating Foundation Models},
author={Wanjun Zhong and Ruixiang Cui and Yiduo Guo and Yaobo Liang and Shuai Lu and Yanlin Wang and Amin Saied and Weizhu Chen and Nan Duan},
year={2023},
eprint={2304.06364},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
Please make sure to cite all the individual datasets in your paper when you use them. We provide the relevant citation information below:
```
@inproceedings{ling-etal-2017-program,
title = "Program Induction by Rationale Generation: Learning to Solve and Explain Algebraic Word Problems",
author = "Ling, Wang and
Yogatama, Dani and
Dyer, Chris and
Blunsom, Phil",
booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2017",
address = "Vancouver, Canada",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/P17-1015",
doi = "10.18653/v1/P17-1015",
pages = "158--167",
abstract = "Solving algebraic word problems requires executing a series of arithmetic operations{---}a program{---}to obtain a final answer. However, since programs can be arbitrarily complicated, inducing them directly from question-answer pairs is a formidable challenge. To make this task more feasible, we solve these problems by generating answer rationales, sequences of natural language and human-readable mathematical expressions that derive the final answer through a series of small steps. Although rationales do not explicitly specify programs, they provide a scaffolding for their structure via intermediate milestones. To evaluate our approach, we have created a new 100,000-sample dataset of questions, answers and rationales. Experimental results show that indirect supervision of program learning via answer rationales is a promising strategy for inducing arithmetic programs.",
}
@inproceedings{hendrycksmath2021,
title={Measuring Mathematical Problem Solving With the MATH Dataset},
author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt},
journal={NeurIPS},
year={2021}
}
@inproceedings{Liu2020LogiQAAC,
title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning},
author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},
booktitle={International Joint Conference on Artificial Intelligence},
year={2020}
}
@inproceedings{zhong2019jec,
title={JEC-QA: A Legal-Domain Question Answering Dataset},
author={Zhong, Haoxi and Xiao, Chaojun and Tu, Cunchao and Zhang, Tianyang and Liu, Zhiyuan and Sun, Maosong},
booktitle={Proceedings of AAAI},
year={2020},
}
@article{Wang2021FromLT,
title={From LSAT: The Progress and Challenges of Complex Reasoning},
author={Siyuan Wang and Zhongkun Liu and Wanjun Zhong and Ming Zhou and Zhongyu Wei and Zhumin Chen and Nan Duan},
journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
year={2021},
volume={30},
pages={2201-2216}
}
```
### Groups and Tasks
#### Groups
- `agieval`: Evaluates all tasks listed below.
- `agieval_en`: Evaluates all English subtasks: `agieval_aqua_rat`, `agieval_gaokao_english`, `agieval_logiqa_en`, `agieval_lsat_*`, `agieval_sat_*`, `agieval_math`
- `agieval_cn`: Evaluates all Chinese subtasks:
`agieval_gaokao_biology`, `agieval_gaokao_chemistry`, `agieval_gaokao_chinese`, `agieval_gaokao_geography`,
`agieval_gaokao_history`, `agieval_gaokao_mathqa`, `agieval_gaokao_mathcloze`, `agieval_gaokao_physics`, `agieval_jec_qa_ca`, `agieval_jec_qa_kd`, `agieval_logiqa_zh`
- `agieval_nous`: Evaluates a specific subset of AGIEval tasks (multiple-choice and English-only), namely those reported in https://github.com/teknium1/LLM-Benchmark-Logs/blob/main/benchmark-logs/Mistral-7B-Base.md
#### Tasks
- `agieval_aqua_rat`
- `agieval_gaokao_biology`
- `agieval_gaokao_chemistry`
- `agieval_gaokao_chinese`
- `agieval_gaokao_english`
- `agieval_gaokao_geography`
- `agieval_gaokao_history`
- `agieval_gaokao_mathqa`
- `agieval_gaokao_mathcloze`
- `agieval_gaokao_physics`
- `agieval_jec_qa_ca`
- `agieval_jec_qa_kd`
- `agieval_logiqa_en`
- `agieval_logiqa_zh`
- `agieval_lsat_ar`
- `agieval_lsat_lr`
- `agieval_lsat_rc`
- `agieval_sat_en`
- `agieval_sat_en_without_passage`
- `agieval_sat_math`
- `agieval_math`
group:
- agieval
- agieval_en
- agieval_nous
task: agieval_aqua_rat
dataset_path: hails/agieval-aqua-rat
dataset_name: null
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
doc_to_text: "{{query}}"
doc_to_target: "{{gold}}"
doc_to_choice: "{{choices}}"
process_results: !function utils.process_results_mcqa
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
include: aqua-rat.yaml
group:
- agieval
- agieval_cn
task: agieval_gaokao_biology
dataset_path: hails/agieval-gaokao-biology
include: aqua-rat.yaml
group:
- agieval
- agieval_cn
task: agieval_gaokao_chemistry
dataset_path: hails/agieval-gaokao-chemistry
include: aqua-rat.yaml
group:
- agieval
- agieval_cn
task: agieval_gaokao_chinese
dataset_path: hails/agieval-gaokao-chinese
include: aqua-rat.yaml
group:
- agieval
- agieval_en # categorizing as EN because the AGIEval codebase lists this as in `english_qa_tasks`
task: agieval_gaokao_english
dataset_path: hails/agieval-gaokao-english
include: aqua-rat.yaml
group:
- agieval
- agieval_cn
task: agieval_gaokao_geography
dataset_path: hails/agieval-gaokao-geography
include: aqua-rat.yaml
group:
- agieval
- agieval_cn
task: agieval_gaokao_history
dataset_path: hails/agieval-gaokao-history