"vscode:/vscode.git/clone" did not exist on "6599ffcab619453266fc3630f3471246d2a2881b"
Commit 3e8135ce authored by Baber's avatar Baber
Browse files

Merge branch 'main' into comma

parents 8e560c96 0c134ee9
......@@ -2,7 +2,7 @@ import logging
import os
__version__ = "0.4.9"
__version__ = "0.4.9.1"
# Lazy-load .evaluator module to improve CLI startup
......
......@@ -5,8 +5,9 @@ import traceback
from typing import Iterator, List, Sequence, Tuple, TypeVar
# This is a cpp module. Compile janitor_util.cpp with:
# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
# This is a cpp module.
# See scripts/clean_training_data/README.md for instructions to compile janitor_util.cpp
try:
import janitor_util
......
......@@ -71,13 +71,6 @@ class SteeredModel(HFLM):
"""
HFLM with a steered forward pass.
To derive steering vectors from a sparse model loadable with sparsify or sae_lens,
provide the path to a CSV file with the following columns (example rows are provided below):
loader,action,sparse_model,hookpoint,feature_index,steering_coefficient,sae_id,description,
sparsify,add,EleutherAI/sae-pythia-70m-32k,layers.3,30,10.0,,,
sae_lens,add,gemma-scope-2b-pt-res-canonical,layers.20,12082,240.0,layer_20/width_16k/canonical,increase dogs,
To load steering vectors directly, provide the path to a pytorch (.pt) file with content in the following format:
{
......@@ -86,9 +79,17 @@ class SteeredModel(HFLM):
"steering_coefficient": <float>,
"action": <Literal["add", "clamp"]>,
"bias": <torch.Tensor | None>,
"head_index": <int | None>,
},
...
}
To derive steering vectors from a sparse model loadable with sparsify or sae_lens,
provide the path to a CSV file with the following columns (example rows are provided below):
loader,action,sparse_model,hookpoint,feature_index,steering_coefficient,head_index,sae_id,description,
sparsify,add,EleutherAI/sae-pythia-70m-32k,layers.3,30,10.0,,,,
sae_lens,add,gemma-scope-2b-pt-res-canonical,layers.20,12082,240.0,,layer_20/width_16k/canonical,increase dogs,
"""
super().__init__(pretrained=pretrained, device=device, **kwargs)
......@@ -105,27 +106,31 @@ class SteeredModel(HFLM):
hook_to_steer = {}
for hookpoint, steer_info in steer_config.items():
action = steer_info["action"]
steering_coefficient = steer_info["steering_coefficient"]
steering_vector = (
steer_info["steering_vector"].to(self.device).to(self.model.dtype)
)
bias = (
steer_info["bias"].to(self.device).to(self.model.dtype)
if steer_info["bias"] is not None
else None
)
steering_coefficient = float(steer_info.get("steering_coefficient", 1.0))
head_index = steer_info.get("head_index", None)
bias = steer_info.get("bias", None)
if bias is not None:
bias = bias.to(self.device).to(self.model.dtype)
if action == "add":
# Steers the model by adding some multiple of a steering vector to all sequence positions.
hook_to_steer[hookpoint] = (
lambda acts: acts + steering_coefficient * steering_vector
# Steer the model by adding a multiple of a steering vector to all sequence positions.
assert bias is None, "Bias is not supported for the `add` action."
hook_to_steer[hookpoint] = partial(
self.add,
vector=steering_vector * steering_coefficient,
head_index=head_index,
)
elif action == "clamp":
# Steer the model by clamping the activations to a value in the direction of the steering vector.
hook_to_steer[hookpoint] = partial(
self.clamp,
steering_vector=steering_vector,
direction=steering_vector / torch.norm(steering_vector),
value=steering_coefficient,
bias=bias,
head_index=head_index,
)
else:
raise ValueError(f"Unknown hook type: {action}")
......@@ -195,34 +200,62 @@ class SteeredModel(HFLM):
return steer_data
@classmethod
def add(
cls,
acts: Tensor,
vector: Tensor,
head_index: Optional[int],
):
"""Adds the given vector to the activations.
Args:
acts (Tensor): The activations tensor to edit of shape [batch, pos, ..., features]
vector (Tensor): A vector to add of shape [features]
head_index (int | None): Optional attention head index to add to
"""
if head_index is not None:
acts[:, :, head_index, :] = acts[:, :, head_index, :] + vector
else:
acts = acts + vector
return acts
@classmethod
def clamp(
cls,
acts: Tensor,
steering_vector: Tensor,
direction: Tensor,
value: float,
head_index: Optional[int],
bias: Optional[Tensor] = None,
):
"""Clamps a direction of the activations to be the steering vector * the value.
"""Clamps the activations to a given value in a specified direction. The direction
must be a unit vector.
Args:
acts (Tensor): The activations tensor to edit of shape [batch, pos, features]
steering_vector (Tensor): A direction to clamp of shape [features]
acts (Tensor): The activations tensor to edit of shape [batch, pos, ..., features]
direction (Tensor): A direction to clamp of shape [features]
value (float): Value to clamp the direction to
head_index (int | None): Optional attention head index to clamp
bias (Tensor | None): Optional bias to add to the activations
Returns:
Tensor: The modified activations with the specified direction clamped
"""
if bias is not None:
acts = acts - bias
direction = steering_vector / torch.norm(steering_vector)
proj_magnitude = torch.sum(acts * direction, dim=-1, keepdim=True)
orthogonal_component = acts - proj_magnitude * direction
if head_index is not None:
x = acts[:, :, head_index, :]
proj = (x * direction).sum(dim=-1, keepdim=True)
assert proj == acts @ direction
clamped = orthogonal_component + direction * value
clamped = acts.clone()
clamped[:, :, head_index, :] = x + direction * (value - proj)
else:
proj = torch.sum(acts * direction, dim=-1, keepdim=True)
clamped = acts + direction * (value - proj)
if bias is not None:
return clamped + bias
......
......@@ -680,10 +680,19 @@ class HFLM(TemplateLM):
"0.4.0"
):
raise AssertionError("load_in_4bit requires peft >= 0.4.0")
if self._model.config.vocab_size != len(self.tokenizer):
# Compatible with Gemma3 (multimodal) and old models
if hasattr(self._model.config, "text_config") and hasattr(
self._model.config.text_config, "vocab_size"
):
vocab_size = self._model.config.text_config.vocab_size
else:
vocab_size = self._model.config.vocab_size
if vocab_size != len(self.tokenizer):
# resize model for LoRAs with added tokens
eval_logger.info(
f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
f"Model config indicates vocab_size='{vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
)
self._model.resize_token_embeddings(len(self.tokenizer))
self._model = PeftModel.from_pretrained(
......
......@@ -289,7 +289,7 @@ class OpenAIChatCompletion(LocalChatCompletion):
"seed": seed,
**gen_kwargs,
}
if "o1" in self.model:
if "o1" in self.model or "5" in self.model:
output.pop("stop")
output["temperature"] = 1
elif "o3" in self.model:
......
......@@ -28,9 +28,8 @@ class OptimumLM(HFLM):
**kwargs,
) -> None:
if "backend" in kwargs:
# optimum currently only supports causal models
assert kwargs["backend"] == "causal", (
"Currently, only OVModelForCausalLM is supported."
assert kwargs["backend"] in ["causal", "seq2seq"], (
"Currently, only OVModelForCausalLM or OVModelForSeq2SeqLM are supported."
)
self.openvino_device = device
......@@ -54,7 +53,7 @@ class OptimumLM(HFLM):
"package `optimum` is not installed. Please install it via `pip install optimum[openvino]`"
)
else:
from optimum.intel.openvino import OVModelForCausalLM
from optimum.intel.openvino import OVModelForCausalLM, OVModelForSeq2SeqLM
model_kwargs = kwargs if kwargs else {}
if "ov_config" in model_kwargs:
......@@ -76,17 +75,14 @@ class OptimumLM(HFLM):
model_kwargs["ov_config"]["MODEL_DISTRIBUTION_POLICY"] = (
"PIPELINE_PARALLEL"
)
model_file = Path(pretrained) / "openvino_model.xml"
if model_file.exists():
export = False
else:
export = True
self._model = OVModelForCausalLM.from_pretrained(
model_cls = (
OVModelForCausalLM if self.backend == "causal" else OVModelForSeq2SeqLM
)
self._model = model_cls.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
export=export,
device=self.openvino_device.upper(),
**model_kwargs,
)
......@@ -216,7 +216,7 @@ class SGLangLM(TemplateLM):
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
re_ords = Collator(requests, _collate_gen, group_by="gen_kwargs")
re_ords = Collator(requests, _collate_gen, group_by=None)
chunks = re_ords.get_batched(
n=int(self.batch_size) if self.batch_size != "auto" else 0, batch_fn=None
)
......@@ -232,36 +232,41 @@ class SGLangLM(TemplateLM):
context_and_encoding, all_gen_kwargs = zip(*chunk)
context, context_encoding = zip(*context_and_encoding)
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]
# unpack our keyword arguments.
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
# add EOS token to stop sequences
until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
context_encoding_truncated = []
sampling_params = []
for x, gen_kwargs in zip(context_encoding, all_gen_kwargs):
# unpack our keyword arguments.
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
# add EOS token to stop sequences
until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
# set the max length in tokens of inputs ("context_enc")
# max len for inputs = max length, minus room to generate the max new tokens
max_ctx_len = self.max_length - max_gen_toks
if len(x) > max_ctx_len:
context_encoding_truncated.append(x[-max_ctx_len:])
else:
context_encoding_truncated.append(x)
# create sampling params
kwargs = self.modify_gen_kwargs(kwargs)
sampling_params.append(
kwargs | {"max_tokens": max_gen_toks, "stop": until}
)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
# set the max length in tokens of inputs ("context_enc")
# max len for inputs = max length, minus room to generate the max new tokens
max_ctx_len = self.max_length - max_gen_toks
context_encoding = [x[-max_ctx_len:] for x in context_encoding]
# perform batched generation
# cont is a list of dic. See here https://github.com/sgl-project/sglang/blob/0a6f18f068e4095fc228e798454e8496c9749214/python/sglang/srt/entrypoints/engine.py#L111 .
cont = self._model_generate(
requests=context_encoding,
requests=context_encoding_truncated,
generate=True,
max_tokens=max_gen_toks,
stop=until,
**kwargs,
sampling_params=sampling_params,
)
# cache generations
......@@ -284,28 +289,22 @@ class SGLangLM(TemplateLM):
self,
requests: List[List[int]] = None,
generate: bool = False,
max_tokens: int = None,
stop: Optional[List[str]] = None,
sampling_params: Union[List[Dict], Dict, None] = None,
return_logprob: bool = False,
top_logprobs_num: int = 1,
logprob_start_len: int = -1,
**kwargs,
):
# check sglang sampling parameters: https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/sampling/sampling_params.py#L21 and https://docs.sglang.ai/references/sampling_params.html.
if generate:
kwargs = self.modify_gen_kwargs(kwargs)
sampling_params = {
"max_new_tokens": max_tokens,
"stop": stop,
}
sampling_params.update(kwargs)
else:
sampling_params = {
"temperature": 0,
"max_new_tokens": 1,
}
sampling_params.update(kwargs)
if not generate:
sampling_params = sampling_params if sampling_params else {}
sampling_params.update(
{
"temperature": 0,
"max_new_tokens": 1,
}
)
if not isinstance(sampling_params, List):
sampling_params = [sampling_params] * len(requests)
# Refer to: https://docs.sglang.ai/backend/offline_engine_api.html
outputs = self.model.generate(
input_ids=requests,
......
import copy
import gc
import inspect
import logging
import os
from importlib.metadata import version
......@@ -33,7 +32,7 @@ from lm_eval.utils import (
try:
import ray
from vllm import LLM, SamplingParams
from vllm import LLM, SamplingParams, TokensPrompt
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.utils import get_open_port
......@@ -51,7 +50,7 @@ eval_logger = logging.getLogger(__name__)
def _vllm_mp_worker(
model_args: dict,
sampling_params: "SamplingParams",
sampling_params: list["SamplingParams"],
requests: list[list[int]],
lora_request: "LoRARequest",
result_queue: "Queue",
......@@ -79,7 +78,7 @@ def _vllm_mp_worker(
try:
llm = LLM(**model_args)
res = llm.generate(
prompt_token_ids=requests,
[TokensPrompt(prompt_token_ids=request) for request in requests],
sampling_params=sampling_params,
lora_request=lora_request,
)
......@@ -196,6 +195,12 @@ class VLLM(TemplateLM):
self.batch_size = "auto"
eval_logger.info("Manual batching is not compatible with data parallelism.")
if "gemma" in pretrained.lower():
add_bos_token = True
eval_logger.info(
"Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it."
)
from transformers import AutoConfig
self._config = AutoConfig.from_pretrained(
......@@ -214,11 +219,6 @@ class VLLM(TemplateLM):
"enable_thinking", enable_thinking
)
self.add_bos_token = add_bos_token
if "gemma" in pretrained.lower():
self.add_bos_token = True
eval_logger.info(
"Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it."
)
if parse_version(version("vllm")) >= parse_version("0.8.3"):
kwargs_resolve_hf_chat_template = {
......@@ -239,13 +239,6 @@ class VLLM(TemplateLM):
model_config = engine_args.create_model_config()
kwargs_resolve_hf_chat_template["model_config"] = model_config
# https://github.com/vllm-project/vllm/pull/18259
if (
"trsut_remote_code"
in inspect.signature(resolve_hf_chat_template).parameters
):
kwargs_resolve_hf_chat_template["trsut_remote_code"] = trust_remote_code
else:
kwargs_resolve_hf_chat_template["trust_remote_code"] = trust_remote_code
......@@ -371,17 +364,14 @@ class VLLM(TemplateLM):
self,
requests: List[List[int]] = None,
generate: bool = False,
max_tokens: int = None,
stop: Optional[List[str]] = None,
**kwargs,
sampling_params: Union[List["SamplingParams"], "SamplingParams", None] = None,
):
if generate:
kwargs = self.modify_gen_kwargs(kwargs)
sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
else:
if not generate or sampling_params is None:
sampling_params = SamplingParams(
temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
)
if not isinstance(sampling_params, List):
sampling_params = [sampling_params] * len(requests)
if self.data_parallel_size > 1 and not self.V1:
# vLLM hangs if resources are set in ray.remote
# also seems to only work with decorator and not with ray.remote() fn
......@@ -389,13 +379,13 @@ class VLLM(TemplateLM):
@ray.remote
def run_inference_one_model(
model_args: dict,
sampling_params: SamplingParams,
sampling_params: List["SamplingParams"],
requests: List[List[int]],
lora_request: LoRARequest,
lora_request: "LoRARequest",
):
llm = LLM(**model_args)
return llm.generate(
prompt_token_ids=requests,
[TokensPrompt(prompt_token_ids=request) for request in requests],
sampling_params=sampling_params,
lora_request=lora_request,
)
......@@ -403,9 +393,12 @@ class VLLM(TemplateLM):
# dispatch requests to all self.data_parallel_size workers, in interleaved fashion
# interleaved important to balance context lengths across workers
requests = [list(x) for x in distribute(self.data_parallel_size, requests)]
sampling_params = [
list(sp) for sp in distribute(self.data_parallel_size, sampling_params)
]
inputs = (
(self.model_args, sampling_params, req, self.lora_request)
for req in requests
(self.model_args, sp, req, self.lora_request)
for req, sp in zip(requests, sampling_params)
)
object_refs = [run_inference_one_model.remote(*x) for x in inputs]
results = ray.get(object_refs)
......@@ -420,16 +413,18 @@ class VLLM(TemplateLM):
dp_master_port = os.environ.get("VLLM_DP_MASTER_PORT") or get_open_port()
requests = (list(x) for x in distribute(self.data_parallel_size, requests))
sampling_params = (
list(sp) for sp in distribute(self.data_parallel_size, sampling_params)
)
procs, resq = [], Queue()
# We use Process as it is non-daemonic
try:
for rank, req in enumerate(requests):
for rank, (sp, req) in enumerate(zip(requests, sampling_params)):
proc = Process(
target=_vllm_mp_worker,
args=(
self.model_args.copy(),
sampling_params,
sp,
req,
self.lora_request,
resq,
......@@ -484,7 +479,7 @@ class VLLM(TemplateLM):
else:
outputs = self.model.generate(
prompt_token_ids=requests,
[TokensPrompt(prompt_token_ids=request) for request in requests],
sampling_params=sampling_params,
use_tqdm=True if self.batch_size == "auto" else False,
lora_request=self.lora_request,
......@@ -583,10 +578,11 @@ class VLLM(TemplateLM):
# - any OOMs will happen right away rather than near the end
return -len(_requests[0][1]), _requests[0][0]
# we group requests by their generation_kwargs,
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
# in the same batch.
re_ords = Collator(requests, _collate_gen, group_by="gen_kwargs")
re_ords = Collator(
requests,
_collate_gen,
group_by=None,
)
chunks = re_ords.get_batched(
n=int(self.batch_size) if self.batch_size != "auto" else 0, batch_fn=None
)
......@@ -601,41 +597,44 @@ class VLLM(TemplateLM):
for chunk in chunks:
context_and_encoding, all_gen_kwargs = zip(*chunk)
context, context_encoding = zip(*context_and_encoding)
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]
# unpack our keyword arguments.
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
# add EOS token to stop sequences
until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
# set the max length in tokens of inputs ("context_enc")
# max len for inputs = max length, minus room to generate the max new tokens
max_ctx_len = self.max_length - max_gen_toks
all_lengths = [len(x) for x in context_encoding]
for length in all_lengths:
if length > max_ctx_len:
context_encoding_truncated = []
sampling_params = []
for x, gen_kwargs in zip(context_encoding, all_gen_kwargs):
# unpack our keyword arguments.
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
# add EOS token to stop sequences
until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
# set the max length in tokens of inputs ("context_enc")
# max len for inputs = max length, minus room to generate the max new tokens
max_ctx_len = self.max_length - max_gen_toks
if len(x) > max_ctx_len:
eval_logger.warning(
f"Context length {length} exceeds max length (context + max gen tokens): {max_ctx_len}. Truncating context."
f"Context length {len(x)} exceeds max length (context + max gen tokens): {max_ctx_len}. Truncating context."
)
context_encoding = [x[-max_ctx_len:] for x in context_encoding]
context_encoding_truncated.append(x[-max_ctx_len:])
else:
context_encoding_truncated.append(x)
# create sampling params
kwargs = self.modify_gen_kwargs(kwargs)
sampling_params.append(
SamplingParams(max_tokens=max_gen_toks, stop=until, **kwargs)
)
# perform batched generation
cont = self._model_generate(
requests=context_encoding,
requests=context_encoding_truncated,
generate=True,
max_tokens=max_gen_toks,
stop=until,
**kwargs,
sampling_params=sampling_params,
)
# cache generations
......
# Tasks
A list of supported tasks and task groupings can be viewed with `lm-eval --tasks list`.
A list of supported tasks and task groupings can be viewed with `lm-eval --tasks list`.
For more information, including a full list of task names and their precise meanings or sources, follow the links provided to the individual README.md files for each subfolder.
For more information, including a full list of task names and their precise meanings or sources, follow the links
provided to the individual README.md files for each subfolder.
| Task Family | Description | Language(s) |
|--------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|
......@@ -29,9 +29,12 @@
| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
| [bhs](bhs/README.md) | Grammatical knowledge evaluation for low-resource langauges. | Basque, Hindi, Swahili |
| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
| [blimp_nl](blimp_nl/README.md) | A benchmark evaluating language models' grammatical capabilities in Dutch based on comparing the probabilities of minimal pairs of grammatical and ungrammatical sentences. | Dutch |
| [c4](c4/README.md) | Tasks based on a colossal, cleaned version of Common Crawl's web crawl corpus to assess models' language modeling capabilities. | English |
| [cabbq](cabbq/README.md) | Adaptation of the [BBQ](bbq/README.md) benchmark to the Catalan language and stereotypes prevalent in Spain. | Catalan |
| [careqa](careqa/README.md) | Multiple choice and open-ended medical question answering based on the Spanish Specialised Healthcare Training (MIR) exams. | English, Spanish |
| [catalan_bench](catalan_bench/README.md) | Collection of tasks in Catalan encompassing various evaluation areas. | Catalan |
| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
......@@ -41,14 +44,17 @@
| [copal_id](copal_id/README.md) United States | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
| [click](click/README.md) | A benchmark dataset of Cultural and Linguistic Intelligence in Korean (CLIcK), comprising 1,995 QA pairs sourced from official Korean exams and textbooks to test Korean cultural and linguistic knowledge. | Korean |
| csatqa | Tasks related to SAT and other standardized testing questions for academic assessment. | Korean |
| [darija_bench](darija_bench/README.md) | Traditional NLP tasks (Translation, Summariation, etc..) for Moroccan Darija | Moroccan Darija (some MT) |
| [darijahellaswag](darijahellaswag/README.md) | Moroccan Darija version of HellaSwag. | Moroccan Darija (MT) |
| [darijammlu](darijammlu/README.md) | Multiple-choice QA in Moroccan Darija (an Arabic dialect). | Moroccan Darija (MT) |
| [discrim_eval](discrim_eval/README.md) | Prompts for binary decisions covering 70 scenarios to evaluate demographic bias. | English |
| [drop](drop/README.md) | Tasks requiring numerical reasoning, reading comprehension, and question answering. | English |
| [egyhellaswag](egyhellaswag/README.md) | Egyptian Arabic (Masri) version of HellaSwag. | Egyptian Arabic (MT) |
| [egymmlu](egymmlu/README.md) | Multiple-choice QA in Egyptian Arabic. | Egyptian Arabic (MT) |
| [eq_bench](eq_bench/README.md) | Tasks focused on equality and ethics in question answering and decision-making. | English |
| [esbbq](esbbq/README.md) | Adaptation of the [BBQ](bbq/README.md) benchmark to the Spanish language and stereotypes prevalent in Spain. | Spanish |
| [eus_exams](eus_exams/README.md) | Tasks based on various professional and academic exams in the Basque language. | Basque |
| [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque |
| [eus_reading](eus_reading/README.md) | Reading comprehension tasks specifically designed for the Basque language. | Basque |
......@@ -71,6 +77,7 @@
| [histoires_morales](histoires_morales/README.md) | A dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | French (Some MT) |
| [hrm8k](hrm8k/README.md) | A challenging bilingual math reasoning benchmark for Korean and English. | Korean (Some MT), English (Some MT) |
| [humaneval](humaneval/README.md) | Code generation task that measure functional correctness for synthesizing programs from docstrings. | Python |
| [icelandic_winogrande](icelandic_winogrande/README.md) | Manually translated and localized version of the [WinoGrande](winogrande/README.md) commonsense reasoning benchmark for Icelandic. | Icelandic |
| [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English |
| [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English |
| [japanese_leaderboard](japanese_leaderboard/README.md) | Japanese language understanding tasks to benchmark model performance on various linguistic aspects. | Japanese |
......@@ -85,9 +92,12 @@
| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). Those tasks are static and will not change through time | English |
| [lingoly](lingoly/README.md) | Challenging logical reasoning benchmark in low-resource languages with controls for memorization | English, Multilingual |
| [libra](libra/README.md) | Evaluates long-context understanding in Russian across four complexity levels | Russian (MT) |
| [llama3](llama3/README.md) | Evals reproducing those provided by the LLAMA team in the Hugging Face repo (instruct) | English, Multilingual |
| [libra](libra/README.md) | Evaluates long-context understanding in Russian across four complexity levels | Russian (MT) |
| [lm_syneval](lm_syneval/README.md) | Evaluates the syntactic capabilities of language models. | English |
| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
| [longbench](longbench/README.md) | LongBench evaluates language models' ability to understand lengthy texts across multiple tasks and languages. | English, Chinese |
| [mastermind](mastermind/README.md) | Reasoning benchmark based on the board game of Mastermind. | English |
| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |
| [mbpp](mbpp/README.md) | A benchmark designed to measure the ability to synthesize short Python programs from natural language descriptions. | Python |
......@@ -107,7 +117,7 @@
| [mmlu](mmlu/README.md) | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
| [mmlu_pro](mmlu_pro/README.md) | A refined set of MMLU, integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. | English |
| [mmlu-pro-plus](mmlu-pro-plus/README.md) | A new test set for evaluating shortcut learning and higher-order reasoning of LLMs. | English |
| [mmlu_prox](mmlu_prox/README.md) | A multilingual benchmark that extends MMLU-Pro to multiple typologically diverse languages with human validation. | English, Japanese, Chinese, Korean, French, German, Spanish, Portuguese, Swahili, Thai, Arabic, Hindi, Bengali |
| [mmlu_prox](mmlu_prox/README.md) | A multilingual benchmark that extends MMLU-Pro to multiple typologically diverse languages with human validation. | English, Japanese, Chinese, Korean, French, German, Spanish, Portuguese, Zulu, Swahili, Wolof, Yoruba, Thai, Arabic, Hindi, Bengali, Serbian, Hungarian, Vietnamese, Czech, Marathi, Afrikaans, Nepali, Telugu, Urdu, Russian, Indonesian, Italian, Ukrainian|
| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English |
| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
| [moral_stories](moral_stories/README.md) | A crowd-sourced dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | English |
......@@ -156,6 +166,7 @@
| [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English |
| [truthfulqa-multi](truthfulqa-multi/README.md) | Is a multilingual version of TruthfulQA, a QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English, Spanish, Catalan, Basque, Galician |
| [turkishmmlu](turkishmmlu/README.md) | A multiple-choice QA test modeled after MMLU, written in Turkish based on Turkish high-school level exams. | Turkish |
| [turblimp_core](turblimp/README.md) | A benchmark evaluating language models' grammatical capabilities in Turkish based on comparing the probabilities of minimal pairs of grammatical and ungrammatical sentences. | Turkish |
| [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English |
| [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English |
| [webqs](webqs/README.md) | Web-based question answering tasks designed to evaluate internet search and retrieval. | English |
......@@ -171,8 +182,10 @@
| [xquad](xquad/README.md) | Cross-lingual Question Answering Dataset in multiple languages. | Arabic, German, Greek, English, Spanish, Hindi, Romanian, Russian, Thai, Turkish, Vietnamese, Chinese |
| [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese |
| [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese |
| [zhoblimp](zhoblimp/README.md) | A benchmark evaluating language models' grammatical capabilities in Chinese based on comparing the probabilities of minimal pairs of grammatical and ungrammatical sentences. | Chinese |
## Multimodal Tasks
| Task Family | Description | Modality |
|------------------------------|---------------------------------------------------------------------------------------------------------|-------------|
| [chartqa](chartqa/README.md) | A benchmark for question answering about charts that requires both visual and logical reasoning. | Image, Text |
......
......@@ -81,7 +81,7 @@ class TaskManager:
task_index = {}
for task_dir in all_paths:
tasks = self._get_task_and_group(task_dir)
task_index = {**tasks, **task_index}
task_index = {**task_index, **tasks}
return task_index
......
......@@ -2,7 +2,6 @@ tag:
- adr_tasks
- adr_prompt_1
dataset_path: masakhane/diacritics-restoration
dataset_kwargs: {trust_remote_code: True}
doc_to_target: target
output_type: generate_until
fewshot_split: dev
......
......@@ -2,7 +2,6 @@ tag:
- adr_tasks
- adr_prompt_2
dataset_path: masakhane/diacritics-restoration
dataset_kwargs: {trust_remote_code: True}
doc_to_target: target
output_type: generate_until
fewshot_split: dev
......
......@@ -2,7 +2,6 @@ tag:
- adr_tasks
- adr_prompt_3
dataset_path: masakhane/diacritics-restoration
dataset_kwargs: {trust_remote_code: True}
doc_to_target: target
output_type: generate_until
fewshot_split: dev
......
......@@ -2,7 +2,6 @@ tag:
- adr_tasks
- adr_prompt_4
dataset_path: masakhane/diacritics-restoration
dataset_kwargs: {trust_remote_code: True}
doc_to_target: target
output_type: generate_until
fewshot_split: dev
......
......@@ -2,7 +2,6 @@ tag:
- adr_tasks
- adr_prompt_5
dataset_path: masakhane/diacritics-restoration
dataset_kwargs: {trust_remote_code: True}
doc_to_target: target
output_type: generate_until
fewshot_split: dev
......
......@@ -4,7 +4,6 @@ tag:
task: null
dataset_path: masakhane/afrisenti
dataset_name: null
dataset_kwargs: {trust_remote_code: True}
output_type: multiple_choice
validation_split: validation
test_split: test
......
......@@ -3,7 +3,6 @@ tag:
- afrisent_prompt_2
dataset_path: masakhane/afrisenti
dataset_name: null
dataset_kwargs: {trust_remote_code: True}
output_type: multiple_choice
validation_split: validation
test_split: test
......
......@@ -3,7 +3,6 @@ tag:
- afrisenti_prompt_3
dataset_path: masakhane/afrisenti
dataset_name: null
dataset_kwargs: {trust_remote_code: True}
output_type: multiple_choice
validation_split: validation
test_split: test
......
......@@ -3,7 +3,6 @@ tag:
- afrisenti_prompt_4
dataset_path: masakhane/afrisenti
dataset_name: null
dataset_kwargs: {trust_remote_code: True}
output_type: multiple_choice
validation_split: validation
test_split: test
......
......@@ -3,7 +3,6 @@ tag:
- afrisenti_prompt_5
dataset_path: masakhane/afrisenti
dataset_name: null
dataset_kwargs: {trust_remote_code: True}
output_type: multiple_choice
validation_split: validation
test_split: test
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment