Commit 31019847 authored by h-albert-lee's avatar h-albert-lee

Merge remote-tracking branch 'origin/main' into 1442-inverse-scaling-tasks

parents 10c9d5de dc90fecc
......@@ -140,7 +140,7 @@ lm_eval --model vllm \
--tasks lambada_openai \
--batch_size auto
```
For a full list of supported vLLM configurations, please reference our vLLM integration and the vLLM documentation.
To use vLLM, first run `pip install lm_eval[vllm]`. For a full list of supported vLLM configurations, please reference our [vLLM integration](https://github.com/EleutherAI/lm-evaluation-harness/blob/e74ec966556253fbe3d8ecba9de675c77c075bce/lm_eval/models/vllm_causallms.py) and the vLLM documentation.
vLLM occasionally differs in output from Huggingface. We treat Huggingface as the reference implementation, and provide a [script](./scripts/model_comparator.py) for checking the validity of vllm results against HF.
......
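As a rough illustration of such a check (a minimal sketch, not the `model_comparator.py` script itself; the model name below is only a placeholder), the same task can be run on both backends via `lm_eval.simple_evaluate` and the resulting metrics compared by hand:

```python
# Minimal sketch: run the same task with the HF reference backend and with
# vLLM, then compare the reported metrics. The model name is a placeholder.
import lm_eval

common = dict(tasks=["lambada_openai"], batch_size="auto")

hf_results = lm_eval.simple_evaluate(
    model="hf", model_args="pretrained=EleutherAI/pythia-160m", **common
)
vllm_results = lm_eval.simple_evaluate(
    model="vllm", model_args="pretrained=EleutherAI/pythia-160m", **common
)

print(hf_results["results"]["lambada_openai"])
print(vllm_results["results"]["lambada_openai"])
```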
......@@ -30,9 +30,10 @@ Dataset configuration options:
Prompting / in-context formatting options:
- **use_prompt** (`str`, *optional*) — Name of prompt in promptsource to use. If defined, this will overwrite `doc_to_text`, `doc_to_target`, and `doc_to_choice`.
- **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate input for the model
- **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate target output for the model. For multiple choice tasks, this should return an index into the list of answer choices given by `doc_to_choice`.
- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `generate_until` tasks.
- **description** (`str`, *optional*) — An optional prepended Jinja2 template or string which will be prepended to the few-shot examples passed into the model, often describing the task or providing instructions to a model, such as `"The following are questions (with answers) about {{subject}}.\n\n"`. No delimiters or spacing are inserted between the description and the first few-shot example.
- **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate input for the model
- **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate target output for the model. For multiple choice tasks, this should return an index into the list of answer choices given by `doc_to_choice`.
- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `generate_until` tasks.
- **fewshot_delimiter** (`str`, *optional*, defaults to "\n\n") — String to insert between few-shot examples.
- **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested.
......
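To make these options concrete, a minimal hypothetical `multiple_choice` task config using them might look like the sketch below (the task name, dataset path, and field names are placeholders, not a real task):

```yaml
task: demo_mc_task                  # hypothetical task name
dataset_path: my_org/my_dataset     # placeholder HF dataset
output_type: multiple_choice
test_split: test
description: "The following are questions (with answers) about {{subject}}.\n\n"
doc_to_text: "{{question}}\nAnswer:"
doc_to_choice: "{{choices}}"        # list of possible answer strings
doc_to_target: "{{label}}"          # index into the list from doc_to_choice
target_delimiter: " "
fewshot_delimiter: "\n\n"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
```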
......@@ -330,6 +330,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
log_samples=args.log_samples,
gen_kwargs=args.gen_kwargs,
task_manager=task_manager,
verbosity=args.verbosity,
predict_only=args.predict_only,
random_seed=args.seed[0],
numpy_random_seed=args.seed[1],
......
......@@ -304,7 +304,9 @@ class TemplateLM(LM):
return context_enc, continuation_enc
def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
def loglikelihood(
self, requests, disable_tqdm: bool = False
) -> List[Tuple[float, bool]]:
new_reqs = []
for context, continuation in [req.args for req in requests]:
if context == "":
......@@ -318,12 +320,14 @@ class TemplateLM(LM):
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
return self._loglikelihood_tokens(new_reqs, disable_tqdm=disable_tqdm)
@abc.abstractmethod
def loglikelihood_rolling(self, requests) -> List[Tuple[float, bool]]:
def loglikelihood_rolling(
self, requests, disable_tqdm: bool = False
) -> List[Tuple[float, bool]]:
pass
@abc.abstractmethod
def generate_until(self, requests) -> List[str]:
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
pass
......@@ -940,14 +940,14 @@ class ConfigurableTask(Task):
:returns: str
The fewshot context.
"""
if description := self.config.description:
description = utils.apply_template(self.config.description, doc)
if num_fewshot == 0:
# always prepend the (possibly empty) task description
labeled_examples = self.config.description
labeled_examples = description
else:
labeled_examples = self.config.description + self.sampler.get_context(
doc, num_fewshot
)
labeled_examples = description + self.sampler.get_context(doc, num_fewshot)
example = self.doc_to_text(doc)
if self.multiple_input:
......
......@@ -147,7 +147,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.")
def generate_until(self, requests) -> List[str]:
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
try:
import anthropic
except ModuleNotFoundError:
......@@ -162,7 +162,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
_requests: List[Tuple[str, dict]] = [req.args for req in requests]
res = []
for request in tqdm(_requests):
for request in tqdm(_requests, disable=disable_tqdm):
try:
inp = request[0]
request_args = request[1]
......@@ -199,8 +199,8 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
# Isn't used because we override generate_until
raise NotImplementedError()
def loglikelihood(self, requests):
def loglikelihood(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.")
def loglikelihood_rolling(self, requests):
def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.")
import random
from tqdm import tqdm
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
......@@ -13,27 +15,27 @@ class DummyLM(LM):
def create_from_arg_string(cls, arg_string, additional_config=None):
return cls()
def loglikelihood(self, requests):
def loglikelihood(self, requests, disable_tqdm: bool = False):
res = []
for _ in requests:
for _ in tqdm(requests, disable=disable_tqdm):
res.append((-random.random(), False))
return res
def generate_until(self, requests):
def generate_until(self, requests, disable_tqdm: bool = False):
res = []
for ctx, _ in requests:
for ctx, _ in tqdm(requests, disable=disable_tqdm):
res.append("lol")
assert ctx.strip() != ""
return res
def loglikelihood_rolling(self, requests):
def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
res = []
for _ in requests:
for _ in tqdm(requests, disable=disable_tqdm):
res.append(-random.random())
return res
......@@ -70,11 +70,13 @@ class GGUFLM(LM):
else:
raise Exception(f"Failed to get a valid response after {retries} retries.")
def loglikelihood(self, requests):
def loglikelihood(self, requests, disable_tqdm: bool = False):
if not requests:
return []
res = []
for context, continuation in tqdm([req.args for req in requests]):
for context, continuation in tqdm(
[req.args for req in requests], disable=disable_tqdm
):
response = self.gguf_completion(context=context, continuation=continuation)
if response and "choices" in response and response["choices"]:
choice = response["choices"][0]
......@@ -97,12 +99,12 @@ class GGUFLM(LM):
assert False
return res
def generate_until(self, requests):
def generate_until(self, requests, disable_tqdm: bool = False):
if not requests:
return []
res = []
for request in tqdm([req.args for req in requests]):
for request in tqdm([req.args for req in requests], disable=disable_tqdm):
inp = request[0]
request_args = request[1]
until = request_args.get("until", ["</s>"])
......@@ -122,7 +124,7 @@ class GGUFLM(LM):
res.append(None) # Add default value in case of error
return res
def loglikelihood_rolling(self, requests):
def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
raise NotImplementedError(
"loglikelihood_rolling not yet supported for GGUF models"
)
......@@ -790,7 +790,9 @@ class HFLM(TemplateLM):
return logits
def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
def loglikelihood_rolling(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[float]:
loglikelihoods = []
adaptive_batch_size = None
......@@ -801,7 +803,9 @@ class HFLM(TemplateLM):
print(f"Determined Largest batch size: {batch_size}")
adaptive_batch_size = batch_size
for (string,) in tqdm([req.args for req in requests], disable=(self.rank != 0)):
for (string,) in tqdm(
[req.args for req in requests], disable=(disable_tqdm or (self.rank != 0))
):
rolling_token_windows = list(
map(
utils.make_disjoint_window,
......@@ -1079,7 +1083,9 @@ class HFLM(TemplateLM):
return re_ord.get_original(res)
def generate_until(self, requests: List[Instance]) -> List[str]:
def generate_until(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[str]:
res = []
def _collate(req: Tuple[str, dict]):
......@@ -1095,7 +1101,7 @@ class HFLM(TemplateLM):
pbar = tqdm(
total=len(requests),
disable=(self.rank != 0),
disable=(disable_tqdm or (self.rank != 0)),
desc="Running generate_until requests",
)
adaptive_batch_size = None
......
......@@ -447,12 +447,14 @@ class NEURON_HF(TemplateLM):
return logits
def loglikelihood_rolling(self, requests):
def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
loglikelihoods = []
adaptive_batch_size = None
for (string,) in tqdm([req.args for req in requests], disable=(self.rank != 0)):
for (string,) in tqdm(
[req.args for req in requests], disable=(disable_tqdm or (self.rank != 0))
):
rolling_token_windows = list(
map(
utils.make_disjoint_window,
......@@ -616,7 +618,7 @@ class NEURON_HF(TemplateLM):
return re_ord.get_original(res)
def generate_until(self, requests):
def generate_until(self, requests, disable_tqdm: bool = False):
res = defaultdict(list)
re_ords = {}
......@@ -638,7 +640,7 @@ class NEURON_HF(TemplateLM):
# within each set of reqs for given kwargs, we reorder by token length, descending.
re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate)
pbar = tqdm(total=len(requests), disable=(self.rank != 0))
pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)))
# for each different set of kwargs, we execute all requests, by batch.
for key, re_ord in re_ords.items():
......
......@@ -231,7 +231,7 @@ class OpenaiCompletionsLM(TemplateLM):
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return re_ord.get_original(res)
def generate_until(self, requests) -> List[str]:
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
if not requests:
return []
res = []
......@@ -258,7 +258,8 @@ class OpenaiCompletionsLM(TemplateLM):
# todo: more intelligent batching for heterogeneous `until`
for chunk, request_args in tqdm(
list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size))
list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size)),
disable=disable_tqdm,
):
inps = []
self._max_gen_toks = request_args.get("max_gen_toks", self.max_gen_toks)
......@@ -308,10 +309,12 @@ class OpenaiCompletionsLM(TemplateLM):
# Isn't used because we override generate_until
raise NotImplementedError()
def loglikelihood_rolling(self, requests) -> List[float]:
def loglikelihood_rolling(
self, requests, disable_tqdm: bool = False
) -> List[float]:
loglikelihoods = []
for (string,) in tqdm([req.args for req in requests]):
for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
rolling_token_windows = list(
map(
utils.make_disjoint_window,
......@@ -398,7 +401,7 @@ class OpenaiChatCompletionsLM(LM):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def generate_until(self, requests) -> List[str]:
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
res = defaultdict(list)
re_ords = {}
......@@ -412,7 +415,7 @@ class OpenaiChatCompletionsLM(LM):
[req.args for req in reqs], lambda x: (-len(x[0]), x[0])
)
pbar = tqdm(total=len(requests), disable=(self.rank != 0))
pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)))
for key, re_ord in re_ords.items():
# n needs to be 1 because messages in
# chat completion are not batch but
......@@ -471,8 +474,8 @@ class OpenaiChatCompletionsLM(LM):
return grouper.get_original(res)
def loglikelihood(self, requests):
def loglikelihood(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.")
def loglikelihood_rolling(self, requests):
def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.")
......@@ -95,9 +95,9 @@ class TextSynthLM(LM):
# Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
raise NotImplementedError()
def loglikelihood(self, requests):
def loglikelihood(self, requests, disable_tqdm: bool = False):
res = []
for context, continuation in tqdm(requests):
for context, continuation in tqdm(requests, disable=disable_tqdm):
response = textsynth_completion(
url=self.api_url + "/v1/engines/" + self.engine + "/logprob",
headers={"Authorization": "Bearer " + self.api_key},
......@@ -119,7 +119,7 @@ class TextSynthLM(LM):
assert False
return res
def loglikelihood_rolling(self, requests):
def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
# TODO: The TextSynth API does not support tokenized inputs so we cannot
# manually partition long contexts into smaller rolling windows as
# done for other models derived from `BaseLM`. Override this method
......@@ -129,12 +129,12 @@ class TextSynthLM(LM):
"input tokenization support from TextSynth."
)
def generate_until(self, requests):
def generate_until(self, requests, disable_tqdm: bool = False):
if not requests:
return []
res = []
for request in tqdm(requests):
for request in tqdm(requests, disable=disable_tqdm):
inp = request[0]
request_args = request[1]
until = request_args["until"]
......
......@@ -215,10 +215,12 @@ class VLLM(TemplateLM):
)
return outputs
def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
def loglikelihood_rolling(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[float]:
loglikelihoods = []
for (string,) in tqdm([req.args for req in requests]):
for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
rolling_token_windows = list(
map(
make_disjoint_window,
......@@ -244,7 +246,9 @@ class VLLM(TemplateLM):
loglikelihoods.append(string_nll)
return loglikelihoods
def generate_until(self, requests: List[Instance]) -> List[str]:
def generate_until(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[str]:
res = []
# batch tokenize contexts
......@@ -273,7 +277,7 @@ class VLLM(TemplateLM):
pbar = tqdm(
total=len(requests),
disable=(self.rank != 0),
disable=(disable_tqdm or (self.rank != 0)),
desc="Running generate_until requests",
)
# for each different set of kwargs, we execute all requests, by batch.
......@@ -411,6 +415,26 @@ class VLLM(TemplateLM):
# The first entry of prompt_logprobs is None because the model has no previous tokens to condition on.
continuation_logprobs_dicts = outputs.prompt_logprobs
def coerce_logprob_to_num(logprob):
# vLLM changed the return type of logprobs from float
# to a Logprob object storing the float value + extra data
# (https://github.com/vllm-project/vllm/pull/3065).
# If we are dealing with vllm's Logprob object, return
# the logprob value stored as an attribute. Otherwise,
# return the object itself (which should be a float
# for older versions of vLLM).
return getattr(logprob, "logprob", logprob)
continuation_logprobs_dicts = [
{
token: coerce_logprob_to_num(logprob)
for token, logprob in logprob_dict.items()
}
if logprob_dict is not None
else None
for logprob_dict in continuation_logprobs_dicts
]
# Calculate continuation_logprobs
# assume ctxlen always >= 1
continuation_logprobs = sum(
......
# Arabic EXAMS
### Paper
EXAMS is a resource specialized in multilingual high school exam questions.
Original paper: [EXAMS](https://aclanthology.org/2020.emnlp-main.438/)
The Arabic EXAMS dataset includes five subjects:
- Islamic studies
- Biology
- Physics
- Science
- Social
The original dataset is available at [EXAMS-QA](https://github.com/mhardalov/exams-qa).
EXAMS is a benchmark dataset for cross-lingual and multilingual question answering for high school examinations, with 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from the Natural Sciences and Social Sciences, among others.
EXAMS offers a unique, fine-grained evaluation framework across multiple languages and subjects.
Homepage for Arabic EXAMS: [EXAMS Arabic Homepage](https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/EXAMS_Arabic)
### Citation
### Groups and Tasks
#### Groups
- `aexams` (EXAMS Arabic): includes the Islamic Studies, Biology, Science, Physics, and Social subjects.
#### Tasks
The following tasks evaluate subjects in the Arabic EXAMS dataset using loglikelihood-based multiple-choice scoring (see the example invocation after this list):
- `aexams_IslamicStudies`
- `aexams_Biology`
- `aexams_Science`
- `aexams_Physics`
- `aexams_Social`
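As a minimal sketch, these tasks can be run programmatically via `lm_eval.simple_evaluate`; the pretrained model below is only a placeholder:

```python
# Minimal sketch: evaluate two Arabic EXAMS subjects with the HF backend.
# The pretrained model is a placeholder; substitute any causal LM.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["aexams_IslamicStudies", "aexams_Biology"],
    num_fewshot=0,
)
print(results["results"])
```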
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation?
* [x] Yes, original implementation contributed by author of the benchmark
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
group: aexams
dataset_path: Hennara/aexams
test_split: test
fewshot_split: dev
fewshot_config:
  sampler: first_n
output_type: multiple_choice
doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nالجواب:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
"dataset_name": "Biology"
"description": "قم بالإجابة على مايلي في مجال العلوم الحيوية\n\n"
"include": "_default_template_yaml"
"task": "aexams_Biology"
"dataset_name": "IslamicStudies"
"description": "قم بالإجابة على مايلي في مجال العلوم الإسلامية \n\n"
"include": "_default_template_yaml"
"task": "aexams_IslamicStudies"
"dataset_name": "Physics"
"description": "قم بالإجابة على مايلي في مجال الفيزياء \n\n"
"include": "_default_template_yaml"
"task": "aexams_Physics"
"dataset_name": "Science"
"description": "قم بالإجابة على مايلي في مجال العلوم \n\n"
"include": "_default_template_yaml"
"task": "aexams_Science"
"dataset_name": "Social"
"description": "قم بالإجابة على مايلي في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "aexams_Social"