Commit 88486e57 authored by lintangsutawika's avatar lintangsutawika

Merge branch 'group-agg-rework' of https://github.com/EleutherAI/lm-evaluation-harness into multiprompt
parents 5971f2ca ba73d131
......@@ -307,7 +307,7 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
# defaults to os.environ.get("ANTHROPIC_API_KEY")
self.client = anthropic.Anthropic()
self.temperature = temperature
self.max_token = max_tokens
self.max_tokens = max_tokens
self.tokenizer = self.client.get_tokenizer()
self.kwargs = kwargs
......
......@@ -30,6 +30,7 @@ from lm_eval.api.registry import register_model
from lm_eval.models.utils import (
Collator,
clear_torch_cache,
configure_pad_token,
get_dtype,
pad_and_concat,
stop_sequences_criteria,
......@@ -253,32 +254,10 @@ class HFLM(TemplateLM):
self.logits_cache = logits_cache
self.vocab_size = self.tokenizer.vocab_size
# select (or create) a pad token to use
if self.tokenizer.pad_token:
pass
elif self.tokenizer.unk_token:
self.tokenizer.pad_token_id = self.tokenizer.unk_token_id
elif self.tokenizer.eos_token:
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
else:
if getattr(self.config, "model_type", None) == "qwen":
# Qwen's trust_remote_code tokenizer does not allow for adding special tokens
self.tokenizer.pad_token = "<|endoftext|>"
elif (
self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
):
# The RWKV world tokenizer does not allow adding special tokens or setting the pad token (which is fixed at 0)
# The additional tokenizer name check is needed, as there exist rwkv4 models with a neox tokenizer
# ---
# Note that the world tokenizer class name might change in the future for the final huggingface merge
# https://github.com/huggingface/transformers/pull/26963
assert self.tokenizer.pad_token_id == 0
else:
self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config)
# TODO: override this for Gemma
self.add_bos_token = add_bos_token
if getattr(self.config, "model_type", None) == "gemma":
if getattr(self.config, "model_type", None) in ["gemma", "gemma2"]:
self.add_bos_token = True
eval_logger.info(
f"Model type is '{self.config.model_type}', a BOS token will be used as Gemma underperforms without it."
......
......@@ -288,7 +288,7 @@ class NEURON_HF(TemplateLM):
self.vocab_size = self.tokenizer.vocab_size
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self.add_bos_token = self.add_bos_token
self.add_bos_token = add_bos_token
self._max_length = max_length
......
""" TextSynth API
"""TextSynth API
Implementation provided by Fabrice Bellard:
https://github.com/EleutherAI/lm-evaluation-harness/issues/295
......@@ -11,6 +11,7 @@ Example usage:
Homepage: https://textsynth.com/index.html
"""
import logging
import os
......
......@@ -5,6 +5,7 @@ import itertools
import time
from functools import wraps
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
......@@ -24,6 +25,11 @@ import transformers
from lm_eval.utils import eval_logger
if TYPE_CHECKING:
from transformers import PreTrainedTokenizerBase
from transformers.configuration_utils import PretrainedConfig
def chunks(iter, n: int = 0, fn=None):
"""
Divides an iterable into chunks of specified size or based on a given function.
......@@ -613,3 +619,48 @@ class Collator:
if arr:
yield arr
def configure_pad_token(
tokenizer: "PreTrainedTokenizerBase",
model_config: Optional["PretrainedConfig"] = None,
) -> "PreTrainedTokenizerBase":
"""
This function checks if the (Hugging Face) tokenizer has a padding token and sets it if not present.
Some tokenizers require special handling.
Args:
tokenizer: The tokenizer for which the padding token is to be handled.
model_config: The configuration of the model. Default is None.
Returns:
The tokenizer after the padding token has been handled.
Raises:
AssertionError: If the tokenizer is of type RWKVWorldTokenizer or Rwkv5Tokenizer and the padding token id is not 0.
"""
if tokenizer.pad_token:
pass
elif tokenizer.unk_token:
tokenizer.pad_token_id = tokenizer.unk_token_id
elif tokenizer.eos_token:
tokenizer.pad_token_id = tokenizer.eos_token_id
else:
# handle special cases
if model_config and getattr(model_config, "model_type", None) == "qwen":
# Qwen's trust_remote_code tokenizer does not allow for adding special tokens
tokenizer.pad_token = "<|endoftext|>"
elif (
tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
or tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
):
# The RWKV world tokenizer does not allow adding special tokens or setting the pad token (which is fixed at 0)
# The additional tokenizer name check is needed, as there exist rwkv4 models with a neox tokenizer
# ---
# Note that the world tokenizer class name might change in the future for the final huggingface merge
# https://github.com/huggingface/transformers/pull/26963
assert tokenizer.pad_token_id == 0
else:
tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
return tokenizer
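A minimal usage sketch for this helper, assuming a standard Hugging Face checkpoint (the model name below is illustrative):

```python
# Illustrative sketch only; the checkpoint name is an arbitrary example.
from transformers import AutoConfig, AutoTokenizer

from lm_eval.models.utils import configure_pad_token

model_name = "EleutherAI/pythia-70m"
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Falls back to unk/eos, or adds "<|pad|>", with special-casing for
# Qwen and RWKV-world tokenizers as described above.
tokenizer = configure_pad_token(tokenizer, model_config=config)
assert tokenizer.pad_token_id is not None
```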
import copy
from importlib.metadata import version
from importlib.util import find_spec
from typing import List, Literal, Optional, Tuple, Union
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
from more_itertools import distribute
from packaging.version import parse as parse_version
......@@ -10,7 +10,7 @@ from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator, undistribute
from lm_eval.models.utils import Collator, configure_pad_token, undistribute
from lm_eval.utils import (
eval_logger,
get_rolling_token_windows,
......@@ -26,6 +26,8 @@ try:
except ModuleNotFoundError:
pass
if TYPE_CHECKING:
pass
eval_logger = eval_logger
......@@ -118,7 +120,14 @@ class VLLM(TemplateLM):
trust_remote_code=trust_remote_code,
tokenizer_revision=tokenizer_revision,
)
self.tokenizer = configure_pad_token(self.tokenizer)
self.add_bos_token = add_bos_token
if "gemma" in pretrained.lower():
self.add_bos_token = True
eval_logger.info(
"Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it."
)
self.custom_prefix_token_id = prefix_token_id
if prefix_token_id is not None:
eval_logger.info(
......@@ -170,23 +179,46 @@ class VLLM(TemplateLM):
def max_gen_toks(self):
return self._max_gen_toks
def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
"""
Method to apply a chat template to a list of chat history between user and model.
"""
return self.tokenizer.apply_chat_template(
chat_history, tokenize=False, add_generation_prompt=True
)
@property
def chat_template(self) -> str:
if self.tokenizer.chat_template is not None:
return self.tokenizer.chat_template
return self.tokenizer.default_chat_template
@property
def tokenizer_name(self) -> str:
return self.tokenizer.name_or_path.replace("/", "__")
def tok_encode(
self,
string: str,
left_truncate_len=None,
add_special_tokens=None,
truncation=False,
):
""" """
string: Union[str, List[str]],
left_truncate_len: int = None,
add_special_tokens: bool = False,
truncation: bool = False,
) -> Union[List[int], List[List[int]]]:
if not add_special_tokens:
add_special_tokens = False or self.add_bos_token
encoding = self.tokenizer.encode(
string, add_special_tokens=add_special_tokens, truncation=truncation
)
encoding: Union[List[List[int]], List[int]] = self.tokenizer(
string,
add_special_tokens=add_special_tokens,
truncation=truncation,
return_attention_mask=False,
).input_ids
# left-truncate the encoded context to be at most `left_truncate_len` tokens long
if left_truncate_len:
encoding = encoding[-left_truncate_len:]
if not isinstance(string, str):
encoding = [enc[-left_truncate_len:] for enc in encoding]
else:
encoding = encoding[-left_truncate_len:]
return encoding
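A hedged sketch of how the new chat-template helpers and the batched `tok_encode` fit together (assumes `lm` is an instantiated `VLLM` model whose tokenizer ships a chat template; token ids shown are made up):

```python
# Sketch under assumptions: `lm` is a constructed VLLM instance whose
# tokenizer provides a chat template; token ids below are illustrative.
chat_history = [{"role": "user", "content": "What is the capital of France?"}]
prompt = lm.apply_chat_template(chat_history)  # rendered string, with generation prompt appended

single = lm.tok_encode(prompt)                # List[int] for a single string
batch = lm.tok_encode(["hello world", "hi"])  # List[List[int]] for a batch

# left_truncate_len now truncates each sequence in a batch individually:
batch_trunc = lm.tok_encode(["hello world", "hi"], left_truncate_len=1)
# e.g. [[2375], [6151]] -- only the last token of each sequence is kept
```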
......@@ -203,7 +235,7 @@ class VLLM(TemplateLM):
sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
else:
sampling_params = SamplingParams(
temperature=0, prompt_logprobs=1, max_tokens=1
temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
)
if self.data_parallel_size > 1:
# vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
......@@ -284,7 +316,9 @@ class VLLM(TemplateLM):
# batch tokenize contexts
context, all_gen_kwargs = zip(*(req.args for req in requests))
context_encoding = self.tokenizer(context, add_special_tokens=False).input_ids
context_encoding: List[List[int]] = self.tok_encode(
context, add_special_tokens=self.add_bos_token
)
requests = [
((a, b), c) for a, b, c in zip(context, context_encoding, all_gen_kwargs)
]
......@@ -493,7 +527,10 @@ class VLLM(TemplateLM):
def modify_gen_kwargs(kwargs: dict) -> dict:
# sampling_params
do_sample = kwargs.pop("do_sample", None)
if do_sample is False or "temperature" not in kwargs:
if do_sample is False and "temperature" not in kwargs:
eval_logger.debug(
"Got `do_sample=False` and no temperature value, setting VLLM temperature to 0.0 ..."
)
kwargs["temperature"] = 0.0
# hf defaults
kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)
......
......@@ -10,8 +10,8 @@
| [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese |
| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
| [agieval](agieval/README.md) | Tasks drawn from human-centric standardized exams, such as college entrance and law school admission tests. | English, Chinese |
| [ammlu](ammlu/README.md) | Arabic version of MMLU. | Arabic |
| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English |
| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
......@@ -20,11 +20,13 @@
| [bbh](bbh/README.md) | Tasks from BIG-Bench Hard, a suite of challenging BIG-bench tasks requiring multi-step reasoning. | English, German |
| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
| [bertaqa](bertaqa/README.md) | Trivia QA about the Basque Country and local Basque culture, in Basque and English (including a machine-translated variant). | English, Basque, Basque (MT) |
| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate a language model's linguistic capabilities. | English |
| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
| [commonsense_qa](commonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
......@@ -47,12 +49,15 @@
| [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to evaluate the ethical reasoning capabilities of models. | English |
| [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English |
| [ifeval](ifeval/README.md) | Instruction-following evaluation tasks testing whether models satisfy verifiable constraints in their responses. | English |
| [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English |
| [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean |
| [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean |
| [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean |
| [lambada](lambada/README.md) | Tasks designed to predict the endings of text passages, testing language prediction skills. | English |
| [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English |
| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. | German, English, Spanish, French, Italian |
| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian |
| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). These tasks are static and will not change over time. | English |
| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |
......@@ -70,6 +75,7 @@
| okapi/mmlu_multilingual | Machine-translated versions of MMLU for multilingual evaluation. | Multiple (34 languages) |
| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Machine-translated versions of TruthfulQA for multilingual evaluation. | Multiple (31 languages) |
| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English |
| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, from niche artist communities to mental-health forums on Reddit. | English |
| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean |
| [pile](pile/README.md) | Open-source language modelling dataset that consists of 22 smaller, high-quality datasets. | English |
| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
......
......@@ -5,7 +5,9 @@ from functools import partial
from typing import Dict, List, Mapping, Optional, Union
from lm_eval import utils
from lm_eval.api.task import ConfigurableGroup, ConfigurableTask, GroupConfig, Task
from lm_eval.api.group import ConfigurableGroup, GroupConfig
from lm_eval.api.task import ConfigurableTask, Task
from lm_eval.evaluator_utils import get_subtask_list
GROUP_ONLY_KEYS = list(GroupConfig().to_dict().keys())
......@@ -17,27 +19,43 @@ class TaskManager:
"""
def __init__(self, verbosity="INFO", include_path: Optional[str] = None) -> None:
def __init__(
self,
verbosity="INFO",
include_path: Optional[Union[str, List]] = None,
include_defaults: bool = True,
) -> None:
self.verbosity = verbosity
self.include_path = include_path
self.logger = utils.eval_logger
self.logger.setLevel(getattr(logging, f"{verbosity}"))
self._task_index = self.initialize_tasks(include_path=include_path)
self._task_index = self.initialize_tasks(
include_path=include_path, include_defaults=include_defaults
)
self._all_tasks = sorted(list(self._task_index.keys()))
self.task_group_map = collections.defaultdict(list)
def initialize_tasks(self, include_path: Optional[str] = None):
def initialize_tasks(
self,
include_path: Optional[Union[str, List]] = None,
include_defaults: bool = True,
):
"""Creates a dictionary of tasks index.
:param include_path: str = None
An additional path to be searched for tasks
:param include_path: Union[str, List] = None
An additional path to be searched for tasks recursively.
Can provide more than one such path as a list.
:param include_defaults: bool = True
If set to false, default tasks (those in lm_eval/tasks/) are not indexed.
:return
Dictionary with task names as keys and task metadata as values
"""
all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
if include_defaults:
all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
else:
all_paths = []
if include_path is not None:
if isinstance(include_path, str):
include_path = [include_path]
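A hedged usage sketch of the widened constructor (the directory paths are placeholders; `all_tasks` is assumed to be the public property over the sorted index):

```python
# Sketch only; the include_path directories are illustrative.
from lm_eval.tasks import TaskManager

tm = TaskManager(
    include_path=["/srv/eval/custom_tasks", "/srv/eval/private_tasks"],
    include_defaults=False,  # skip indexing the bundled lm_eval/tasks/ YAMLs
)
print(tm.all_tasks)  # assumed property exposing the sorted task index
```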
......@@ -150,9 +168,16 @@ class TaskManager:
**config,
}
if self._config_is_python_task(config):
task_object = config["class"]()
task_object = (
config["class"](config=config)
if issubclass(config["class"], ConfigurableTask)
else config["class"]()
)
# very scuffed: set task name here. TODO: fixme?
task_object.config.task = config["task"]
else:
task_object = ConfigurableTask(config=config)
return {task: task_object}
def _get_group_and_subtask_from_config(config):
......@@ -181,7 +206,9 @@ class TaskManager:
if update_config is not None:
# Process name_or_config as a dict instead
name_or_config = {"task": name_or_config, **update_config}
elif self._name_is_task(name_or_config):
elif self._name_is_task(name_or_config) or self._name_is_python_task(
name_or_config
):
task_config = self._get_config(name_or_config)
doc_to_text = task_config.get("doc_to_text", None)
if isinstance(doc_to_text, list):
......@@ -204,9 +231,20 @@ class TaskManager:
group_config
)
else:
group_name = ConfigurableGroup(
config={"group": name_or_config, "task": subtask_list}
)
if self._name_is_tag(name_or_config):
fn = partial(
self._load_individual_task_or_group,
update_config=name_or_config
if isinstance(name_or_config, dict)
else None,
)
return dict(
collections.ChainMap(*map(fn, reversed(subtask_list)))
)
else:
group_name = ConfigurableGroup(
config={"group": name_or_config, "task": subtask_list}
)
if isinstance(name_or_config, dict):
if self._config_is_task(name_or_config):
......@@ -325,8 +363,13 @@ class TaskManager:
"""
# TODO: remove group in next release
print_info = True
ignore_dirs = [
"__pycache__",
".ipynb_checkpoints",
]
tasks_and_groups = collections.defaultdict()
for root, _, file_list in os.walk(task_dir):
for root, dirs, file_list in os.walk(task_dir):
dirs[:] = [d for d in dirs if d not in ignore_dirs]
for f in file_list:
if f.endswith(".yaml"):
yaml_path = os.path.join(root, f)
......@@ -371,12 +414,13 @@ class TaskManager:
if attr in config:
if attr == "group" and print_info:
self.logger.info(
"`group` and `group_alias` will no longer be used in the next release of lm-eval. "
"`tags` will be used to allow to call a collection of tasks just like `group`. "
"`group` and `group_alias` keys in tasks' configs will no longer be used in the next release of lm-eval. "
"`tag` will be used to allow to call a collection of tasks just like `group`. "
"`group` will be removed in order to not cause confusion with the new ConfigurableGroup "
"which will be the offical way to create groups with addition of group-wide configuations."
)
print_info = False
# attr = "tag"
attr_list = config[attr]
if isinstance(attr_list, str):
......@@ -389,6 +433,12 @@ class TaskManager:
"task": [task],
"yaml_path": -1,
}
elif tasks_and_groups[tag]["type"] != "tag":
self.logger.info(
f"The tag {tag} is already registered as a group, this tag will not be registered. "
"This may affect tasks you want to call."
)
break
else:
tasks_and_groups[tag]["task"].append(task)
else:
......@@ -419,6 +469,33 @@ def get_task_name_from_object(task_object):
)
def _check_duplicates(task_dict: dict) -> List[str]:
"""helper function solely used in validating get_task_dict output.
Takes the output of lm_eval.evaluator_utils.get_subtask_list and
returns a list of all leaf subtasks contained within, and errors if any such leaf subtasks are
"oversubscribed" to several disjoint groups.
"""
subtask_names = []
for key, value in task_dict.items():
subtask_names.extend(value)
duplicate_tasks = {
task_name for task_name in subtask_names if subtask_names.count(task_name) > 1
}
# locate the potentially problematic groups that seem to 'compete' for constituent subtasks
competing_groups = [
group
for group in task_dict.keys()
if len(set(task_dict[group]).intersection(duplicate_tasks)) > 0
]
if len(duplicate_tasks) > 0:
raise ValueError(
f"Found 1 or more tasks while trying to call get_task_dict() that were members of more than 1 called group: {list(duplicate_tasks)}. Offending groups: {competing_groups}. Please call groups which overlap their constituent tasks in separate evaluation runs."
)
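A toy illustration of the oversubscription check; the input shape mirrors what `get_subtask_list` produces (group name mapped to its leaf subtask names):

```python
# Hypothetical input; group/task names are made up for illustration.
subtask_map = {
    "group_a": ["task_1", "task_2"],
    "group_b": ["task_2", "task_3"],  # task_2 is claimed by both groups
}
_check_duplicates(subtask_map)
# raises ValueError, naming task_2 and the competing groups group_a/group_b
```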
def get_task_dict(
task_name_list: Union[str, List[Union[str, Dict, Task]]],
task_manager: Optional[TaskManager] = None,
......@@ -436,6 +513,7 @@ def get_task_dict(
:return
Dictionary of task objects
"""
task_name_from_string_dict = {}
task_name_from_config_dict = {}
task_name_from_object_dict = {}
......@@ -482,8 +560,16 @@ def get_task_dict(
):
raise ValueError
return {
final_task_dict = {
**task_name_from_string_dict,
**task_name_from_config_dict,
**task_name_from_object_dict,
}
# behavior can get odd if one tries to invoke several groups that "compete" for the same task.
# (notably, because one could request several num_fewshot values at once in GroupConfig overrides for the subtask
# and we'd be unsure which to use and report.)
# we explicitly check and error in this case.
_check_duplicates(get_subtask_list(final_task_dict))
return final_task_dict
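End to end, the duplicate check now runs on every `get_task_dict` call; a hedged sketch (the group and task names are examples from this repo, assumed to be registered):

```python
# Sketch; assumes "aclue" (a group) and "anli_r1" (a task) are indexed.
from lm_eval.tasks import TaskManager, get_task_dict

tm = TaskManager()
task_dict = get_task_dict(["aclue", "anli_r1"], task_manager=tm)
# requesting two groups that share a constituent task would instead raise
# the ValueError from _check_duplicates above
```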
......@@ -26,7 +26,7 @@ Homepage: https://github.com/isen-zhang/ACLUE
}
```
### Groups and Tasks
### Groups, Tags, and Tasks
#### Groups
......
group: aclue
task:
- aclue_ancient_chinese_culture
- aclue_ancient_literature
- aclue_ancient_medical
- aclue_ancient_phonetics
- aclue_basic_ancient_chinese
- aclue_couplet_prediction
- aclue_homographic_character_resolution
- aclue_named_entity_recognition
- aclue_poetry_appreciate
- aclue_poetry_context_prediction
- aclue_poetry_quality_assessment
- aclue_poetry_sentiment_analysis
- aclue_polysemy_resolution
- aclue_reading_comprehension
- aclue_sentence_segmentation
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: true
- metric: acc_norm
aggregation: mean
weight_by_size: true
metadata:
version: 0.0
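For intuition, `weight_by_size: true` requests a micro-average over subtasks; a rough sketch of the arithmetic (the helper name is hypothetical, not the harness's internal aggregation code):

```python
# Hypothetical illustration of a size-weighted mean; not the harness's code.
def weighted_mean(scores: list[float], sizes: list[int]) -> float:
    # each subtask contributes in proportion to its number of documents
    return sum(s * n for s, n in zip(scores, sizes)) / sum(sizes)

# e.g. acc 0.50 over 100 docs and 0.80 over 300 docs:
print(weighted_mean([0.50, 0.80], [100, 300]))  # 0.725
```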
group: aclue
dataset_path: tyouisen/aclue
test_split: test
fewshot_split: dev
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
......
......@@ -24,11 +24,11 @@ Homepage for Arabic EXAMS: [EXAMS Arabic Homepage](https://github.com/FreedomInt
### Citation
### Groups and Tasks
### Groups, Tags, and Tasks
#### Groups
- `EXAMS Arabic`: include IslamicStudies, Biology, Science, Physics, Social.
- `aexams`: Arabic EXAMS dataset, covering the IslamicStudies, Biology, Science, Physics, and Social subjects.
#### Tasks
......
group: aexams
task:
- aexams_Biology
- aexams_IslamicStudies
- aexams_Physics
- aexams_Science
- aexams_Social
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: true
- metric: acc_norm
aggregation: mean
weight_by_size: true
metadata:
version: 0.0
group: aexams
dataset_path: Hennara/aexams
test_split: test
fewshot_split: dev
......
......@@ -75,7 +75,7 @@ Please make sure to cite all the individual datasets in your paper when you use
}
```
### Groups and Tasks
### Groups, Tags, and Tasks
#### Groups
......@@ -89,6 +89,10 @@ Please make sure to cite all the individual datasets in your paper when you use
- `agieval_nous`: Evaluates a specific subset of AGIEval tasks (multiple-choice and english-only), namely those in https://github.com/teknium1/LLM-Benchmark-Logs/blob/main/benchmark-logs/Mistral-7B-Base.md
#### Tags
None.
#### Tasks
- `agieval_aqua_rat`
......
group: agieval
task:
- agieval_gaokao_biology
- agieval_gaokao_chemistry
- agieval_gaokao_chinese
- agieval_gaokao_geography
- agieval_gaokao_history
- agieval_gaokao_mathcloze
- agieval_gaokao_mathqa
- agieval_gaokao_physics
- agieval_jec_qa_ca
- agieval_jec_qa_kd
- agieval_logiqa_zh
- agieval_aqua_rat
- agieval_gaokao_english
- agieval_logiqa_en
- agieval_lsat_ar
- agieval_lsat_lr
- agieval_lsat_rc
- agieval_math
- agieval_sat_en_without_passage
- agieval_sat_en
- agieval_sat_math
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: true
metadata:
version: 0.0
group: agieval_cn
task:
- agieval_gaokao_biology
- agieval_gaokao_chemistry
- agieval_gaokao_chinese
- agieval_gaokao_geography
- agieval_gaokao_history
- agieval_gaokao_mathcloze
- agieval_gaokao_mathqa
- agieval_gaokao_physics
- agieval_jec_qa_ca
- agieval_jec_qa_kd
- agieval_logiqa_zh
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: true
metadata:
version: 0.0
group: agieval_en
task:
- agieval_aqua_rat
- agieval_gaokao_english # categorizing as EN because the AGIEval codebase lists this as in `english_qa_tasks`
- agieval_logiqa_en
- agieval_lsat_ar
- agieval_lsat_lr
- agieval_lsat_rc
- agieval_math
- agieval_sat_en_without_passage
- agieval_sat_en
- agieval_sat_math
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: true
metadata:
version: 0.0
group: agieval_nous
task:
- agieval_aqua_rat
- agieval_logiqa_en
- agieval_lsat_ar
- agieval_lsat_lr
- agieval_lsat_rc
- agieval_sat_en_without_passage
- agieval_sat_en
- agieval_sat_math
aggregate_metric_list:
- metric: acc_norm
aggregation: mean
weight_by_size: true
metadata:
version: 0.0