@@ -17,7 +18,6 @@ from lm_eval.models.utils import (
     undistribute,
 )
 from lm_eval.utils import (
-    eval_logger,
     get_rolling_token_windows,
     make_disjoint_window,
 )
...
...
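For context, `get_rolling_token_windows` and `make_disjoint_window` are the utilities the harness uses to split a long token sequence into overlapping context/prediction windows for rolling loglikelihood scoring. A minimal sketch of the usual call pattern, with illustrative token values (real callers pass tokenizer output and the model's EOT token instead):

```python
from lm_eval.utils import get_rolling_token_windows, make_disjoint_window

# Illustrative input: a 10-token document scored with a 4-token window.
tokens = list(range(10))

windows = list(
    map(
        make_disjoint_window,  # trim overlap so each token is predicted exactly once
        get_rolling_token_windows(
            token_list=tokens,
            prefix_token=-1,  # stand-in for the model's EOT/BOS prefix token
            max_seq_len=4,
            context_len=1,
        ),
    )
)

for context, pred in windows:
    print(context, pred)
```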
@@ -34,7 +34,7 @@ except ModuleNotFoundError:
 if TYPE_CHECKING:
     pass

-eval_logger = eval_logger
+eval_logger = logging.getLogger(__name__)

 @register_model("vllm")
...
...
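The hunk above swaps the shared `eval_logger` for a module-scoped logger. A minimal sketch of that standard-library pattern (the demo function and the `basicConfig` call are illustrative, not part of the harness):

```python
import logging

# Named after the defining module, so records from
# lm_eval.models.vllm_causallms can be filtered or silenced per module.
eval_logger = logging.getLogger(__name__)


def warn_once() -> None:
    # Handlers and levels are inherited from the logging hierarchy,
    # so library code never configures output itself.
    eval_logger.warning("vLLM backend initialized")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)  # illustrative app-side config
    warn_once()
```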
@@ -75,7 +75,6 @@ class VLLM(TemplateLM):
"Please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
)
assert"cuda"indeviceordeviceisNone,"vLLM only supports CUDA"
assertmax_lengthisNoneormax_model_lenisNone,(
"Either max_length or max_model_len may be provided, but not both"
)
...
...
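Since `max_length` and `max_model_len` are mutually exclusive, callers pass at most one of them. A hedged sketch of how this surfaces through the public API (model and task names are illustrative):

```python
import lm_eval

# Valid: only max_model_len is given (max_length alone also works).
results = lm_eval.simple_evaluate(
    model="vllm",
    model_args="pretrained=EleutherAI/pythia-1.4b,max_model_len=2048",
    tasks=["hellaswag"],
)

# Passing both would trip the assertion shown above:
#   model_args="pretrained=...,max_length=2048,max_model_len=2048"  -> AssertionError
```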
@@ -110,7 +109,7 @@ class VLLM(TemplateLM):
         eval_logger.warning(
             "You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached."
         )
| [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque |
| [eus_reading](eus_reading/README.md) | Reading comprehension tasks specifically designed for the Basque language. | Basque |
| [eus_trivia](eus_trivia/README.md) | Trivia and knowledge testing tasks in the Basque language. | Basque |
| [evalita-LLM](evalita-LLM/README.md) | A native Italian benchmark with diverse task formats and multiple prompts. | Italian |
| [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English |
| [fld](fld/README.md) | Formal logic deduction tasks testing multi-step deductive reasoning. | English |
| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French |
...
...
@@ -50,6 +51,7 @@
 | [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
 | [gpqa](gpqa/README.md) | Tasks designed for graduate-level, Google-proof question answering and knowledge verification. | English |
 | [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
+| [groundcocoa](groundcocoa/README.md) | A benchmark evaluating the conditional and compositional reasoning of language models using a grounding task. | English |
 | [haerae](haerae/README.md) | Tasks focused on assessing detailed factual and historical knowledge. | Korean |
 | [headqa](headqa/README.md) | A high-level education-based question answering dataset to test specialized knowledge. | Spanish, English |
 | [hellaswag](hellaswag/README.md) | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English |
| [mmlu](mmlu/README.md) | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
| [mmlu_pro](mmlu_pro/README.md) | A refined set of MMLU, integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. | English |
| [mmlu-pro-plus](mmlu-pro-plus/README.md) | A new test set for evaluating shortcut learning and higher-order reasoning of LLMs. | English |
| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English |
| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
| [moral_stories](moral_stories/README.md) | A crowd-sourced dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | English |
BasqueBench is a benchmark for evaluating language models on tasks in Basque. That is, it evaluates the ability of a language model to understand and generate Basque text. BasqueBench combines pre-existing open datasets with datasets developed exclusively for this benchmark. Full details of BasqueBench will be published in a forthcoming paper.
The new evaluation datasets included in BasqueBench are listed below; a sketch of running one of them follows the table.
| Task | Category | Homepage |
|:-------------:|:-----:|:-----:|
| MGSM_eu | Math | https://huggingface.co/datasets/HiTZ/MGSM-eu |
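A hedged sketch of evaluating a model on one of these tasks through the Python API. The task ID and model are illustrative, not confirmed: check `lm-eval --tasks list` for the exact registered name of the MGSM_eu task in your install.

```python
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=HiTZ/latxa-7b-v1.2",  # arbitrary Basque-capable model
    tasks=["mgsm_eu"],  # hypothetical registered name for MGSM_eu
    num_fewshot=5,
)
print(results["results"])
```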