Unverified commit da211969, authored by Jess and committed by GitHub

Merge branch 'EleutherAI:main' into main

parents 1b97e487 801322e0
@@ -307,7 +307,7 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
         # defaults to os.environ.get("ANTHROPIC_API_KEY")
         self.client = anthropic.Anthropic()
         self.temperature = temperature
-        self.max_token = max_tokens
+        self.max_tokens = max_tokens
         self.tokenizer = self.client.get_tokenizer()
         self.kwargs = kwargs
......
@@ -2,7 +2,7 @@ import copy
 import os
 from datetime import timedelta
 from pathlib import Path
-from typing import List, Literal, Optional, Tuple, Union
+from typing import Dict, List, Literal, Optional, Tuple, Union
 import torch
 import torch.nn.functional as F
@@ -44,13 +44,13 @@ def _get_accelerate_args(
     max_memory_per_gpu: Optional[Union[int, str]] = None,
     max_cpu_memory: Optional[Union[int, str]] = None,
     offload_folder: Optional[str] = "./offload",
+    gpus: Optional[int] = None,
 ) -> dict:
     """Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
     max_memory = {}
     if max_memory_per_gpu is not None:
         max_memory_per_gpu_map = {
-            device_idx: max_memory_per_gpu
-            for device_idx in range(torch.cuda.device_count())
+            device_idx: max_memory_per_gpu for device_idx in range(gpus)
         }
         max_memory.update(max_memory_per_gpu_map)
     if max_cpu_memory is not None:
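For reference, a minimal restatement of what the updated helper assembles once an explicit `gpus` count is passed (the memory figures below are illustrative, not from this diff):

```python
# Sketch only: mirrors the max_memory mapping built above for accelerate's
# device_map machinery; the GiB values are made-up example inputs.
def build_max_memory(gpus=2, max_memory_per_gpu="20GiB", max_cpu_memory="64GiB"):
    max_memory = {device_idx: max_memory_per_gpu for device_idx in range(gpus)}
    if max_cpu_memory is not None:
        max_memory["cpu"] = max_cpu_memory
    return max_memory

print(build_max_memory())  # {0: '20GiB', 1: '20GiB', 'cpu': '64GiB'}
```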
@@ -153,12 +153,16 @@ class HFLM(TemplateLM):
         if accelerator.num_processes > 1:
             self.accelerator = accelerator
+        if "npu" in accelerator.device.type:
+            gpus = torch.npu.device_count()
         if not (parallelize or accelerator.num_processes > 1):
             # use user-passed device
             device_list = set(
                 ["cuda", "cpu"]
-                + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
+                + [f"cuda:{i}" for i in range(gpus)]
                 + ["mps", "mps:0"]
+                + [f"npu:{i}" for i in range(gpus)]
             )
             if device and device in device_list:
                 self._device = torch.device(device)
@@ -199,6 +203,15 @@ class HFLM(TemplateLM):
             config=self.config, backend=backend, trust_remote_code=trust_remote_code
         )
+        # load tokenizer so we know tokenizer vocabulary size before loading model and PEFT
+        self._create_tokenizer(
+            pretrained,
+            tokenizer,
+            revision=revision,
+            trust_remote_code=trust_remote_code,
+            use_fast_tokenizer=use_fast_tokenizer,
+        )
         # if we passed `pretrained` as a string, initialize our model now
         if isinstance(pretrained, str):
             self._create_model(
@@ -207,6 +220,7 @@ class HFLM(TemplateLM):
                 dtype=dtype,
                 trust_remote_code=trust_remote_code,
                 parallelize=parallelize,
+                gpus=gpus,
                 device_map_option=device_map_option,
                 max_memory_per_gpu=max_memory_per_gpu,
                 max_cpu_memory=max_cpu_memory,
@@ -235,14 +249,6 @@ class HFLM(TemplateLM):
                     "Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes` or `device_map` is provided. If the desired GPU is being used, this message is safe to ignore."
                 )
-        self._create_tokenizer(
-            pretrained,
-            tokenizer,
-            revision=revision,
-            trust_remote_code=trust_remote_code,
-            use_fast_tokenizer=use_fast_tokenizer,
-        )
         self.truncation = truncation
         self.logits_cache = logits_cache
         self.vocab_size = self.tokenizer.vocab_size
@@ -321,6 +327,7 @@ class HFLM(TemplateLM):
                     in [
                         DistributedType.FSDP,
                         DistributedType.MULTI_GPU,
+                        DistributedType.MULTI_NPU,
                     ]
                 ), "Unsupported distributed type provided. Only DDP and FSDP are supported."
                 if accelerator.distributed_type == DistributedType.FSDP:
@@ -329,9 +336,7 @@ class HFLM(TemplateLM):
                     self._model = accelerator.prepare_model(
                         self.model, evaluation_mode=True
                     )
-                self._device = torch.device(
-                    f"cuda:{accelerator.local_process_index}"
-                )
+                self._device = torch.device(f"{accelerator.device}")
                 self.accelerator = accelerator
                 if self.accelerator.is_local_main_process:
@@ -414,6 +419,16 @@ class HFLM(TemplateLM):
     def world_size(self):
         return self._world_size
+    @property
+    def tokenizer_name(self) -> str:
+        return self.tokenizer.name_or_path.replace("/", "__")
+
+    @property
+    def chat_template(self) -> str:
+        if self.tokenizer.chat_template is not None:
+            return self.tokenizer.chat_template
+        return self.tokenizer.default_chat_template
+
     def _get_backend(
         self,
         config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
@@ -488,6 +503,7 @@ class HFLM(TemplateLM):
         # only used if `parallelize=True`.
         # (accelerate naive PP (device_map) options)
         parallelize: Optional[bool] = False,
+        gpus: Optional[int] = None,
         device_map_option: Optional[str] = "auto",
         max_memory_per_gpu: Optional[Union[int, str]] = None,
         max_cpu_memory: Optional[Union[int, str]] = None,
@@ -519,6 +535,7 @@ class HFLM(TemplateLM):
                     max_memory_per_gpu,
                     max_cpu_memory,
                     offload_folder,
+                    gpus,
                 )
             )
         elif "device_map" not in model_kwargs:
@@ -527,9 +544,7 @@ class HFLM(TemplateLM):
             # for quantized models now seems to be device_map="auto"
             # which breaks data-parallel mode.
             if hasattr(self, "accelerator"):
-                model_kwargs.update(
-                    {"device_map": {"": f"cuda:{self.accelerator.local_process_index}"}}
-                )
+                model_kwargs.update({"device_map": {"": f"{self.accelerator.device}"}})
             else:
                 model_kwargs.update({"device_map": {"": str(self.device)}})
@@ -579,6 +594,12 @@ class HFLM(TemplateLM):
             if model_kwargs.get("load_in_4bit", None):
                 if version.parse(PEFT_VERSION) < version.parse("0.4.0"):
                     raise AssertionError("load_in_4bit requires peft >= 0.4.0")
+            if self._model.config.vocab_size != len(self.tokenizer):
+                # resize model for LoRAs with added tokens
+                self._model.resize_token_embeddings(len(self.tokenizer))
+                eval_logger.info(
+                    f"Model config indicates vocab_size='{self._model.config.vocab_size}', but found tokenizer with vocab size '{len(self.tokenizer)}'. Resizing model embedding layer..."
+                )
             self._model = PeftModel.from_pretrained(
                 self._model, peft, revision=revision
             )
@@ -1279,6 +1300,14 @@ class HFLM(TemplateLM):
         return res
+    def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
+        """
+        Method to apply a chat template to a list of chat history between user and model.
+        """
+        return self.tokenizer.apply_chat_template(
+            chat_history, tokenize=False, add_generation_prompt=True
+        )
+
     def get_model_info(self) -> dict:
         """
         Method to get Hugging Face model information for experiment reproducibility.
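A minimal sketch of how the new chat-template hook might be exercised (the checkpoint name and message are illustrative and not part of this diff; it assumes the loaded tokenizer defines a chat template or falls back to a default one, as in the `chat_template` property above):

```python
# Illustrative only: render a short chat history through the tokenizer's template.
from lm_eval.models.huggingface import HFLM

lm = HFLM(pretrained="gpt2")  # any HF checkpoint; "gpt2" is an arbitrary example
chat_history = [{"role": "user", "content": "What is the capital of France?"}]
prompt = lm.apply_chat_template(chat_history)  # string with the generation prompt appended
print(prompt)
```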
......
@@ -288,7 +288,7 @@ class NEURON_HF(TemplateLM):
         self.vocab_size = self.tokenizer.vocab_size
         self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
-        self.add_bos_token = self.add_bos_token
+        self.add_bos_token = add_bos_token
         self._max_length = max_length
......
""" TextSynth API """TextSynth API
Implementation provided by Fabrice Bellard: Implementation provided by Fabrice Bellard:
https://github.com/EleutherAI/lm-evaluation-harness/issues/295 https://github.com/EleutherAI/lm-evaluation-harness/issues/295
...@@ -11,6 +11,7 @@ Example usage: ...@@ -11,6 +11,7 @@ Example usage:
Homepage: https://textsynth.com/index.html Homepage: https://textsynth.com/index.html
""" """
import logging import logging
import os import os
......
@@ -21,9 +21,7 @@ from lm_eval.utils import (
 try:
     import ray
     from vllm import LLM, SamplingParams
-    if parse_version(version("vllm")) > parse_version("0.3.0"):
-        from vllm.lora.request import LoRARequest
+    from vllm.lora.request import LoRARequest
     from vllm.transformers_utils.tokenizer import get_tokenizer
 except ModuleNotFoundError:
     pass
@@ -102,9 +100,6 @@ class VLLM(TemplateLM):
         if self.data_parallel_size <= 1:
             self.model = LLM(**self.model_args)
         else:
-            assert parse_version(version("vllm")) < parse_version(
-                "0.3.3"
-            ), "data_parallel is only compatible with vllm < v0.3.3."
             eval_logger.warning(
                 "You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached."
             )
@@ -124,6 +119,12 @@ class VLLM(TemplateLM):
             tokenizer_revision=tokenizer_revision,
         )
         self.add_bos_token = add_bos_token
+        if "gemma" in pretrained.lower():
+            self.add_bos_token = True
+            eval_logger.info(
+                "Found 'gemma' in model name, a BOS token will be used as Gemma underperforms without it."
+            )
+
         self.custom_prefix_token_id = prefix_token_id
         if prefix_token_id is not None:
             eval_logger.info(
def modify_gen_kwargs(kwargs: dict) -> dict: def modify_gen_kwargs(kwargs: dict) -> dict:
# sampling_params # sampling_params
do_sample = kwargs.pop("do_sample", None) do_sample = kwargs.pop("do_sample", None)
if do_sample is False or "temperature" not in kwargs: if do_sample is False and "temperature" not in kwargs:
eval_logger.debug(
"Got `do_sample=False` and no temperature value, setting VLLM temperature to 0.0 ..."
)
kwargs["temperature"] = 0.0 kwargs["temperature"] = 0.0
# hf defaults # hf defaults
kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False) kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)
......
# Tasks
A list of supported tasks and task groupings can be viewed with `lm-eval --tasks list`.
For more information, including a full list of task names and their precise meanings or sources, follow the links provided to the individual README.md files for each subfolder.
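The same index can also be inspected programmatically; a small sketch, assuming the `all_tasks` property exposed by `TaskManager` in `lm_eval.tasks`:

```python
# Sketch: list indexed task and group names from Python instead of the CLI.
from lm_eval.tasks import TaskManager

tm = TaskManager()
print(len(tm.all_tasks), "tasks/groups indexed")
print(tm.all_tasks[:5])  # first few names, sorted alphabetically
```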
| Task Family | Description | Language(s) |
|-------------|-------------|-------------|
| [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese |
| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
| [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese |
| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English |
| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
| [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English |
| [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. | Basque |
| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
| [commonsense_qa](commonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
| csatqa | Tasks related to SAT and other standardized testing questions for academic assessment. | Korean |
| [drop](drop/README.md) | Tasks requiring numerical reasoning, reading comprehension, and question answering. | English |
| [eq_bench](eq_bench/README.md) | Tasks focused on equality and ethics in question answering and decision-making. | English |
| [eus_exams](eus_exams/README.md) | Tasks based on various professional and academic exams in the Basque language. | Basque |
| [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque |
| [eus_reading](eus_reading/README.md) | Reading comprehension tasks specifically designed for the Basque language. | Basque |
| [eus_trivia](eus_trivia/README.md) | Trivia and knowledge testing tasks in the Basque language. | Basque |
| [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English |
| [fld](fld/README.md) | Formal Logic Deduction tasks testing multi-step deductive reasoning over synthetic logic problems. | English |
| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French|
| [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
| [gpqa](gpqa/README.md) | Graduate-level, "Google-proof" multiple-choice questions in biology, physics, and chemistry. | English |
| [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
| [haerae](haerae/README.md) | Tasks focused on assessing detailed factual and historical knowledge. | Korean |
| [headqa](headqa/README.md) | A high-level education-based question answering dataset to test specialized knowledge. | Spanish, English |
| [hellaswag](hellaswag/README.md) | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English |
| [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to evaluate the ethical reasoning capabilities of models. | English |
| [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English |
| [ifeval](ifeval/README.md) | Instruction-following evaluation based on verifiable instructions (e.g. length and formatting constraints). | English |
| [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean |
| [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean |
| [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean |
| [lambada](lambada/README.md) | Tasks designed to predict the endings of text passages, testing language prediction skills. | English |
| [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English |
| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian |
| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |
| [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English |
| medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English |
| medqa | Multiple choice question answering based on the United States Medical License Exams. | |
| [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu |
| [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English |
| mmlu | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
| [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English |
| [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English |
| [okapi/arc_multilingual](okapi/arc_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** |
| [okapi/hellaswag_multilingual](okapi/hellaswag_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (30 languages) |
| okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) |
| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) |
| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English |
| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English |
| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean |
| [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English |
| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
| [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English |
| [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish |
| [prost](prost/README.md) | Tasks requiring understanding of professional standards and ethics in various domains. | English |
| [pubmedqa](pubmedqa/README.md) | Question answering tasks based on PubMed research articles for biomedical understanding. | English |
| [qa4mre](qa4mre/README.md) | Question Answering for Machine Reading Evaluation, assessing comprehension and reasoning. | English |
| [qasper](qasper/README.md) | Question Answering dataset based on academic papers, testing in-depth scientific knowledge. | English |
| [race](race/README.md) | Reading comprehension assessment tasks based on English exams in China. | English |
| realtoxicityprompts | Tasks to evaluate language models for generating text with potential toxicity. | |
| [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English |
| [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English |
| [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning. | English |
| [squad_completion](squad_completion/README.md) | A variant of the SQuAD question answering task designed for zero-shot evaluation of small LMs. | English |
| [squadv2](squadv2/README.md) | Stanford Question Answering Dataset version 2, a reading comprehension benchmark. | English |
| [storycloze](storycloze/README.md) | Tasks to predict story endings, focusing on narrative logic and coherence. | English |
| [super_glue](super_glue/README.md) | A suite of challenging tasks designed to test a range of language understanding skills. | English |
| [swag](swag/README.md) | Situations With Adversarial Generations, predicting the next event in videos. | English |
| [swde](swde/README.md) | Information extraction tasks from semi-structured web pages. | English |
| [tinyBenchmarks](tinyBenchmarks/README.md) | Evaluation of large language models with fewer examples using tiny versions of popular benchmarks. | English |
| [tmmluplus](tmmluplus/README.md) | An extended set of tasks under the TMMLU framework for broader academic assessments. | Traditional Chinese |
| [toxigen](toxigen/README.md) | Tasks designed to evaluate language models on their propensity to generate toxic content. | English |
| [translation](translation/README.md) | Tasks focused on evaluating the language translation capabilities of models. | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese |
| [triviaqa](triviaqa/README.md) | A large-scale dataset for trivia question answering to test general knowledge. | English |
| [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English |
| [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English |
| [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English |
| [webqs](webqs/README.md) | Web-based question answering tasks designed to evaluate internet search and retrieval. | English |
| [wikitext](wikitext/README.md) | Tasks based on text from Wikipedia articles to assess language modeling and generation. | English |
| [winogrande](winogrande/README.md) | A large-scale dataset for coreference resolution, inspired by the Winograd Schema Challenge. | English |
| [wmdp](wmdp/README.md) | A benchmark with the objective of minimizing performance, based on potentially-sensitive multiple-choice knowledge questions. | English |
| [wmt2016](wmt2016/README.md) | Tasks from the WMT 2016 shared task, focusing on translation between multiple languages. | English, Czech, German, Finnish, Russian, Romanian, Turkish |
| [wsc273](wsc273/README.md) | The Winograd Schema Challenge, a test of commonsense reasoning and coreference resolution. | English |
| [xcopa](xcopa/README.md) | Cross-lingual Choice of Plausible Alternatives, testing reasoning in multiple languages. | Estonian, Haitian, Indonesian, Italian, Quechua, Swahili, Tamil, Thai, Turkish, Vietnamese, Chinese |
| [xnli](xnli/README.md) | Cross-Lingual Natural Language Inference to test understanding across different languages. | Arabic, Bulgarian, German, Greek, English, Spanish, French, Hindi, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese |
| [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque |
| [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese |
| [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese |
@@ -14,27 +14,43 @@ class TaskManager:
     """
-    def __init__(self, verbosity="INFO", include_path: Optional[str] = None) -> None:
+    def __init__(
+        self,
+        verbosity="INFO",
+        include_path: Optional[Union[str, List]] = None,
+        include_defaults: bool = True,
+    ) -> None:
         self.verbosity = verbosity
         self.include_path = include_path
         self.logger = utils.eval_logger
         self.logger.setLevel(getattr(logging, f"{verbosity}"))
-        self._task_index = self.initialize_tasks(include_path=include_path)
+        self._task_index = self.initialize_tasks(
+            include_path=include_path, include_defaults=include_defaults
+        )
         self._all_tasks = sorted(list(self._task_index.keys()))
         self.task_group_map = collections.defaultdict(list)

-    def initialize_tasks(self, include_path: Optional[str] = None):
+    def initialize_tasks(
+        self,
+        include_path: Optional[Union[str, List]] = None,
+        include_defaults: bool = True,
+    ):
         """Creates a dictionary of tasks index.
-        :param include_path: str = None
-            An additional path to be searched for tasks
+        :param include_path: Union[str, List] = None
+            An additional path to be searched for tasks recursively.
+            Can provide more than one such path as a list.
+        :param include_defaults: bool = True
+            If set to false, default tasks (those in lm_eval/tasks/) are not indexed.
         :return
             Dictionary of task names as key and task metadata
         """
-        all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
+        if include_defaults:
+            all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
+        else:
+            all_paths = []
         if include_path is not None:
             if isinstance(include_path, str):
                 include_path = [include_path]
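A minimal sketch of the new constructor options (the paths below are hypothetical): external YAML task directories can now be passed as a list, and the built-in tasks can be skipped entirely:

```python
# Sketch only: index external task configs without the defaults in lm_eval/tasks/.
from lm_eval.tasks import TaskManager

tm = TaskManager(
    verbosity="INFO",
    include_path=["./my_tasks", "./more_tasks"],  # a single path or a list, searched recursively
    include_defaults=False,                       # skip the tasks shipped with lm-eval
)
```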
@@ -296,8 +312,13 @@ class TaskManager:
         :return
             Dictionary of task names as key and task metadata
         """
+        ignore_dirs = [
+            "__pycache__",
+            ".ipynb_checkpoints",
+        ]
         tasks_and_groups = collections.defaultdict()
-        for root, _, file_list in os.walk(task_dir):
+        for root, dirs, file_list in os.walk(task_dir):
+            dirs[:] = [d for d in dirs if d not in ignore_dirs]
             for f in file_list:
                 if f.endswith(".yaml"):
                     yaml_path = os.path.join(root, f)
@@ -413,7 +434,9 @@ def get_task_dict(
     )
     string_task_name_list = [task for task in task_name_list if isinstance(task, str)]
-    others_task_name_list = [task for task in task_name_list if ~isinstance(task, str)]
+    others_task_name_list = [
+        task for task in task_name_list if not isinstance(task, str)
+    ]
     if len(string_task_name_list) > 0:
         if task_manager is None:
             task_manager = TaskManager()
......
""" """
Take in a YAML, and output all other splits with this YAML Take in a YAML, and output all other splits with this YAML
""" """
import argparse import argparse
import os import os
......
-# ArabicMMLU
+# Arabic COPA
 ### Paper
-ArabicMMLU: Measuring massive multitask language understanding in Arabic
-This dataset has been translated from the original MMLU with the help of GPT-4.
-The original data [MMLU](https://arxiv.org/pdf/2009.03300v3.pdf)
-The translation has been done with AceGPT researchers [AceGPT](https://arxiv.org/abs/2309.12053)
-ArabicMMLU is a comprehensive evaluation benchmark specifically designed to evaluate the knowledge and reasoning abilities of LLMs within the context of Arabic language and culture.
-ArabicMMLU covers a wide range of subjects, comprising 57 topics that span from elementary to advanced professional levels.
-Homepage: [AceGPT Homepage](https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/MMLUArabic)
+Original Title: `COPA`
+The Choice Of Plausible Alternatives (COPA) evaluation provides researchers with a tool for assessing progress in open-domain commonsense causal reasoning.
+[Homepage](https://people.ict.usc.edu/~gordon/copa.html)
+AlGhafa has translated this dataset to Arabic: [AlGhafa](https://aclanthology.org/2023.arabicnlp-1.21.pdf)
+The link to the Arabic version of the dataset: [copa_ar](https://gitlab.com/tiiuae/alghafa/-/tree/main/arabic-eval/copa_ar)
 ### Citation
 ### Groups and Tasks
 #### Groups
-- `ammlu`: All 57 subjects of the ArabicMMLU dataset, evaluated following the methodology in MMLU's original implementation.
+* Not part of a group yet.
 #### Tasks
-The following tasks evaluate subjects in the ArabicMMLU dataset using loglikelihood-based multiple-choice scoring:
-- `ammlu_{subject_english}`
+* `copa_ar`
 ### Checklist
+For adding novel benchmarks/datasets to the library:
 * [x] Is the task an existing benchmark in the literature?
 * [x] Have you referenced the original paper that introduced the task?
-* [x] If yes, does the original paper provide a reference implementation?
-* [x] Yes, original implementation contributed by author of the benchmark
+* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
 If other tasks on this dataset are already supported:
 * [x] Is the "Main" variant of this task clearly denoted?
......
-group: ammlu
-dataset_path: Hennara/ammlu
-test_split: test
-fewshot_split: dev
-fewshot_config:
-  sampler: first_n
+task: copa_ar
+dataset_path: Hennara/copa_ar
+dataset_name: null
 output_type: multiple_choice
-doc_to_text: "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nالجواب:"
-doc_to_choice: ["A", "B", "C", "D"]
-doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer)}}"
+training_split: null
+validation_split: null
+test_split: test
+doc_to_text: "السؤال: {{query}}\nالجواب:"
+doc_to_choice: "{{[sol1, sol2]}}"
+doc_to_target: label
+should_decontaminate: true
+doc_to_decontamination_query: query
 metric_list:
   - metric: acc
     aggregation: mean
@@ -16,4 +18,4 @@ metric_list:
     aggregation: mean
     higher_is_better: true
 metadata:
-  version: 0.0
+  version: 1.0
# Arabic PIQA
### Paper
Original Title: `PIQA: Reasoning about Physical Commonsense in Natural Language`
Original paper: [PIQA](https://arxiv.org/abs/1911.11641)
Physical Interaction: Question Answering (PIQA) is a physical commonsense
reasoning benchmark and corresponding dataset. PIQA was designed to investigate
the physical knowledge of existing models: to what extent are current approaches
actually learning about the world?
[Homepage](https://yonatanbisk.com/piqa)
AlGhafa has translated this dataset to Arabic: [AlGhafa](https://aclanthology.org/2023.arabicnlp-1.21.pdf)
The link to the Arabic version of the dataset: [pica_ar](https://gitlab.com/tiiuae/alghafa/-/tree/main/arabic-eval/pica_ar)
### Citation
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
* `piqa_ar`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
task: piqa_ar
dataset_path: Hennara/pica_ar
dataset_name: null
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
doc_to_text: "السؤال: {{goal}}\nالجواب:"
doc_to_choice: "{{[sol1, sol2]}}"
doc_to_target: label
should_decontaminate: true
doc_to_decontamination_query: goal
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
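A hedged sketch of how one might smoke-test this config once merged (it assumes the `Hennara/pica_ar` dataset is reachable and a small HF model is available; `gpt2` and `limit=8` are arbitrary choices, not part of this change):

```python
# Hypothetical quick check of the piqa_ar task via the Python API.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2",
    tasks=["piqa_ar"],
    limit=8,  # only a handful of documents, just to confirm the config loads
)
print(results["results"]["piqa_ar"])
```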
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
import yaml
from tqdm import tqdm
SUBJECTS = {
"abstract_algebra": "ألعلوم وتقنية المعلومات و الرياضيات",
"anatomy": "ألعلوم وتقنية المعلومات و الرياضيات",
"astronomy": "ألعلوم وتقنية المعلومات و الرياضيات",
"business_ethics": "علوم أخرى",
"clinical_knowledge": "علوم أخرى",
"college_biology": "ألعلوم وتقنية المعلومات و الرياضيات",
"college_chemistry": "ألعلوم وتقنية المعلومات و الرياضيات",
"college_computer_science": "ألعلوم وتقنية المعلومات و الرياضيات",
"college_mathematics": "ألعلوم وتقنية المعلومات و الرياضيات",
"college_medicine": "علوم أخرى",
"college_physics": "ألعلوم وتقنية المعلومات و الرياضيات",
"computer_security": "ألعلوم وتقنية المعلومات و الرياضيات",
"conceptual_physics": "ألعلوم وتقنية المعلومات و الرياضيات",
"econometrics": "العلوم الإجتماعية",
"electrical_engineering": "ألعلوم وتقنية المعلومات و الرياضيات",
"elementary_mathematics": "ألعلوم وتقنية المعلومات و الرياضيات",
"formal_logic": "العلوم الانسانية",
"global_facts": "علوم أخرى",
"high_school_biology": "ألعلوم وتقنية المعلومات و الرياضيات",
"high_school_chemistry": "ألعلوم وتقنية المعلومات و الرياضيات",
"high_school_computer_science": "ألعلوم وتقنية المعلومات و الرياضيات",
"high_school_european_history": "العلوم الانسانية",
"high_school_geography": "العلوم الإجتماعية",
"high_school_government_and_politics": "العلوم الإجتماعية",
"high_school_macroeconomics": "العلوم الإجتماعية",
"high_school_mathematics": "ألعلوم وتقنية المعلومات و الرياضيات",
"high_school_microeconomics": "العلوم الإجتماعية",
"high_school_physics": "ألعلوم وتقنية المعلومات و الرياضيات",
"high_school_psychology": "العلوم الإجتماعية",
"high_school_statistics": "ألعلوم وتقنية المعلومات و الرياضيات",
"high_school_us_history": "العلوم الانسانية",
"high_school_world_history": "العلوم الانسانية",
"human_aging": "علوم أخرى",
"human_sexuality": "العلوم الإجتماعية",
"international_law": "العلوم الانسانية",
"jurisprudence": "العلوم الانسانية",
"logical_fallacies": "العلوم الانسانية",
"machine_learning": "ألعلوم وتقنية المعلومات و الرياضيات",
"management": "علوم أخرى",
"marketing": "علوم أخرى",
"medical_genetics": "علوم أخرى",
"miscellaneous": "علوم أخرى",
"moral_disputes": "العلوم الانسانية",
"moral_scenarios": "العلوم الانسانية",
"nutrition": "علوم أخرى",
"philosophy": "العلوم الانسانية",
"prehistory": "العلوم الانسانية",
"professional_accounting": "علوم أخرى",
"professional_law": "العلوم الانسانية",
"professional_medicine": "علوم أخرى",
"professional_psychology": "العلوم الإجتماعية",
"public_relations": "العلوم الإجتماعية",
"security_studies": "العلوم الإجتماعية",
"sociology": "العلوم الإجتماعية",
"us_foreign_policy": "العلوم الإجتماعية",
"virology": "علوم أخرى",
"world_religions": "العلوم الانسانية",
}
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", required=True)
parser.add_argument("--save_prefix_path", default="ammlu")
parser.add_argument("--cot_prompt_path", default=None)
parser.add_argument("--task_prefix", default="")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our other YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path, encoding="utf-8") as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path, encoding="utf-8") as f:
cot_file = json.load(f)
for subject_eng, category in tqdm(SUBJECTS.items()):
if args.cot_prompt_path is not None:
description = cot_file[subject_eng]
else:
description = f"فم بعملية التقييم في مجال {category} \n\n"
yaml_dict = {
"include": base_yaml_name,
"task": f"ammlu_{args.task_prefix}_{subject_eng}"
if args.task_prefix != ""
else f"ammlu_{subject_eng}",
"dataset_name": subject_eng,
"description": description,
}
file_save_path = args.save_prefix_path + f"_{subject_eng}.yaml"
print(f"Saving yaml for subset {subject_eng} to {file_save_path}")
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
width=float("inf"),
allow_unicode=True,
default_style='"',
)
"dataset_name": "abstract_algebra"
"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
"include": "_default_template_yaml"
"task": "ammlu_abstract_algebra"
"dataset_name": "anatomy"
"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
"include": "_default_template_yaml"
"task": "ammlu_anatomy"
"dataset_name": "astronomy"
"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
"include": "_default_template_yaml"
"task": "ammlu_astronomy"
"dataset_name": "business_ethics"
"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
"include": "_default_template_yaml"
"task": "ammlu_business_ethics"
"dataset_name": "clinical_knowledge"
"description": "فم بعملية التقييم في مجال علوم أخرى \n\n"
"include": "_default_template_yaml"
"task": "ammlu_clinical_knowledge"
"dataset_name": "college_biology"
"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
"include": "_default_template_yaml"
"task": "ammlu_college_biology"
"dataset_name": "college_chemistry"
"description": "فم بعملية التقييم في مجال ألعلوم وتقنية المعلومات و الرياضيات \n\n"
"include": "_default_template_yaml"
"task": "ammlu_college_chemistry"