Commit b2e1bfc6 authored by artemorloff

Merge remote-tracking branch 'origin' into feature/eval_from_config

parents b5d16d61 e4a7b69f
@@ -20,13 +20,12 @@ jobs:
      with:
        fetch-depth: 2 # OR "2" -> To retrieve the preceding commit.
-      # Uses the dorny/paths-filter@v3 action to check for changes.
-      # Outputs provided here: https://github.com/dorny/paths-filter#outputs
+      # Uses the tj-actions/changed-files action to check for changes.
      # The `files_yaml` input optionally takes a yaml string to specify filters,
      # and prepends the filter name to the standard output names.
      - name: Check task folders
        id: changed-tasks
-        uses: dorny/paths-filter@v3
+        uses: tj-actions/changed-files@v46.0.5
        with:
          # tasks checks the tasks folder and api checks the api folder for changes
          files_yaml: |
@@ -20,64 +20,95 @@ jobs:
    timeout-minutes: 5
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Set up Python 3.9
        uses: actions/setup-python@v5
        with:
          python-version: 3.9
          cache: pip
          cache-dependency-path: pyproject.toml
      - name: Pre-Commit
        env:
          SKIP: "no-commit-to-branch,mypy"
        uses: pre-commit/action@v3.0.1
  # Job 2
  testcpu:
    name: CPU Tests
    runs-on: ubuntu-latest
    strategy:
-      fail-fast: true
      matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12" ]
+        python-version: ["3.9", "3.10", "3.11"]
    timeout-minutes: 30
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: pip
          cache-dependency-path: pyproject.toml
+      # Cache HuggingFace cache directory for CPU tests
+      - name: Cache HuggingFace cache (CPU tests)
+        uses: actions/cache@v3
+        id: cache-hf-cpu
+        with:
+          path: ~/.cache/huggingface
+          key: ${{ runner.os }}-hf-cache-cpu
+          restore-keys: |
+            ${{ runner.os }}-hf-cache-cpu
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
-          pip install -e '.[dev,sentencepiece,api]' --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install -e '.[dev]' --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install hf_xet
      - name: Test with pytest
        run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py --ignore=tests/models/test_hf_steered.py
+        continue-on-error: true # Continue workflow even if tests fail
-      - name: Archive artifacts
+      # Save test artifacts
+      - name: Archive test artifacts
        uses: actions/upload-artifact@v4
        with:
          name: output_testcpu${{ matrix.python-version }}
          path: |
            test_logs/*
  testmodels:
    name: External LM Tests
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - name: Checkout Code
        uses: actions/checkout@v4
      - name: Set up Python 3.9
        uses: actions/setup-python@v5
        with:
          python-version: 3.9
          cache: pip
          cache-dependency-path: pyproject.toml
+      # Cache HuggingFace cache directory for External LM tests
+      - name: Cache HuggingFace cache (External LM tests)
+        uses: actions/cache@v3
+        id: cache-hf-lm
+        with:
+          path: ~/.cache/huggingface
+          key: ${{ runner.os }}-hf-cache-external-lm
+          restore-keys: |
+            ${{ runner.os }}-hf-cache-external-lm
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu
-          pip install -U transformers peft
+          pip install -U transformers peft accelerate
      - name: Test with pytest
        run: python -m pytest tests/models --showlocals -s -vv
+        continue-on-error: true # Continue workflow even if tests fail
@@ -113,6 +113,9 @@ class TaskConfig(dict):
                )
            if "until" not in self.generation_kwargs:
+                eval_logger.warning(
+                    f"{self.task}: No `until` specified in `generation_kwargs`! Defaulting to the fewshot_delimiter={repr(self.fewshot_delimiter)}"
+                )
                self.generation_kwargs["until"] = [self.fewshot_delimiter]
        else:
            if self.output_type == "generate_until":
@@ -124,7 +127,11 @@ class TaskConfig(dict):
                    else [self.fewshot_delimiter]
                ),
                "do_sample": False,
+                "temperature": 0,
            }
+            eval_logger.warning(
+                f"{self.task}: No `generation_kwargs` specified in task config, defaulting to {self.generation_kwargs}"
+            )

    def __getitem__(self, item):
        return getattr(self, item)
@@ -928,11 +935,17 @@ class ConfigurableTask(Task):
            num_choice = len(test_choice)

            if isinstance(test_text, int):
+                eval_logger.debug(
+                    "doc_to_text returned an int. Assuming multiple inputs."
+                )
                self.multiple_input = num_choice
        else:
            test_choice = None

        if isinstance(test_target, list):
+            eval_logger.debug(
+                "doc_to_target returned a list. Assuming multiple targets."
+            )
            self.multiple_target = len(test_target)
        else:
            if (isinstance(test_target, int)) and (test_choice is not None):
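With these defaults, a `generate_until` task that omits `generation_kwargs` now ends up with greedy decoding and the few-shot delimiter as its only stop sequence, and the two new warnings point that out. A small standalone sketch of the resulting dictionary (the `"\n\n"` delimiter and the task-config keys are illustrative assumptions, not taken from this diff):

```python
# Illustrative only: mirrors the defaults applied above for a task that sets
# no generation_kwargs. The delimiter value is an assumption ("\n\n" is the
# harness's usual fewshot_delimiter default).
fewshot_delimiter = "\n\n"

defaulted_generation_kwargs = {
    "until": [fewshot_delimiter],  # stop generating at the few-shot delimiter
    "do_sample": False,            # greedy decoding
    "temperature": 0,              # newly added to the defaults in this commit
}

# A task config that spells these out explicitly would trigger neither warning
# (keys shown are hypothetical, matching common task-config fields).
explicit_task_config = {
    "task": "my_task",
    "output_type": "generate_until",
    "generation_kwargs": dict(defaulted_generation_kwargs),
}
print(explicit_task_config)
```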
@@ -49,6 +49,11 @@ class HFMultimodalLM(HFLM):
        max_pixels: Optional[int] = None,
        **kwargs,
    ):
+        # init pixels before calling tokenizer creation to avoid errors
+        self.pixels = ({"min_pixels": min_pixels} if min_pixels else {}) | (
+            {"max_pixels": max_pixels} if max_pixels else {}
+        )
        # We initialize using HFLM's init. Sub-methods like _create_model and _create_tokenizer
        # modify init behavior.
        super().__init__(pretrained, **kwargs)
@@ -65,9 +70,6 @@ class HFMultimodalLM(HFLM):
        self.interleave = interleave
        self.max_images = max_images
        self.rgb = convert_img_format
-        self.pixels = ({"min_pixels": min_pixels} if min_pixels else {}) | (
-            {"max_pixels": max_pixels} if max_pixels else {}
-        )
        # WARNING: improperly set image_token_id can lead to ignored image input or other (potentially silent) errors!
        if not image_string:
            self.image_token_id = (
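Moving the `self.pixels` assignment ahead of `super().__init__(...)` matters because the parent constructor calls overridable hooks (such as tokenizer/processor creation) that may read the attribute. A minimal, generic sketch of the pitfall, with illustrative class names rather than the harness's actual ones:

```python
class Parent:
    def __init__(self):
        # The parent constructor invokes a hook that subclasses override.
        self._create_processor()

    def _create_processor(self):
        pass


class Child(Parent):
    def __init__(self, min_pixels=None, max_pixels=None):
        # Must be set BEFORE super().__init__(): the parent's constructor calls
        # _create_processor(), which reads self.pixels.
        self.pixels = ({"min_pixels": min_pixels} if min_pixels else {}) | (
            {"max_pixels": max_pixels} if max_pixels else {}
        )
        super().__init__()

    def _create_processor(self):
        # Would raise AttributeError if self.pixels were assigned only after
        # the call to super().__init__().
        print("processor kwargs:", self.pixels)


Child(min_pixels=256)
```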
@@ -3,7 +3,7 @@ import logging
import os
from datetime import timedelta
from pathlib import Path
-from typing import Dict, List, Literal, Optional, Tuple, Union
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union

import jinja2
import torch
@@ -74,6 +74,7 @@ class HFLM(TemplateLM):
        max_length: Optional[int] = None,
        device: Optional[str] = "cuda",
        dtype: Optional[Union[str, torch.dtype]] = "auto",
+        softmax_dtype: Optional[Union[str, torch.dtype]] = None,
        batch_size: Optional[Union[int, str]] = 1,
        max_batch_size: Optional[int] = 64,
        trust_remote_code: Optional[bool] = False,
@@ -204,6 +205,7 @@ class HFLM(TemplateLM):
                autogptq=autogptq,
                gptqmodel=gptqmodel,
                gguf_file=gguf_file,
+                quantization_config=getattr(self.config, "quantization_config", None),
                **kwargs,
            )
@@ -233,6 +235,9 @@ class HFLM(TemplateLM):
        self.batch_schedule = 1
        self.batch_sizes = {}
        self.max_batch_size = max_batch_size
+        self.softmax_dtype = (
+            get_dtype(softmax_dtype) if softmax_dtype is not None else None
+        )

        if str(batch_size).startswith("auto"):
            batch_size = batch_size.split(":")
@@ -546,6 +551,7 @@ class HFLM(TemplateLM):
        autogptq: Optional[Union[bool, str]] = False,
        gptqmodel: Optional[bool] = False,
        gguf_file: Optional[str] = None,
+        quantization_config: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> None:
        """
@@ -591,6 +597,7 @@ class HFLM(TemplateLM):
                torch_dtype=get_dtype(dtype),
                trust_remote_code=trust_remote_code,
                gguf_file=gguf_file,
+                quantization_config=quantization_config,
                **model_kwargs,
            )
        else:
@@ -765,7 +772,11 @@ class HFLM(TemplateLM):
                (batch_size, max_length), device=self.device
            ).long()
            for _ in range(5):
-                out = F.log_softmax(self._model_call(test_batch, **call_kwargs), dim=-1)  # noqa: F841
+                out = F.log_softmax(  # noqa: F841
+                    self._model_call(test_batch, **call_kwargs),
+                    dim=-1,
+                    dtype=self.softmax_dtype,
+                )

            return batch_size
@@ -1197,7 +1208,9 @@ class HFLM(TemplateLM):
                }

            multi_logits = F.log_softmax(
-                self._model_call(batched_inps, **call_kwargs), dim=-1
+                self._model_call(batched_inps, **call_kwargs),
+                dim=-1,
+                dtype=self.softmax_dtype,
            )  # [batch, padding_length (inp or cont), vocab]

            for (request_str, ctx_tokens, _), logits, inplen, cont_toks in zip(
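The new `softmax_dtype` option lets the log-softmax over the vocabulary be computed in a wider dtype than the model's activations; `torch.nn.functional.log_softmax` accepts a `dtype` argument for exactly this. A standalone sketch with random logits (not harness code) showing the effect:

```python
import torch
import torch.nn.functional as F

# Fake half-precision logits standing in for a model's output over a large vocab.
logits = torch.randn(2, 8, 32000, dtype=torch.bfloat16)

log_probs_bf16 = F.log_softmax(logits, dim=-1)                       # softmax_dtype left unset
log_probs_fp32 = F.log_softmax(logits, dim=-1, dtype=torch.float32)  # softmax_dtype="float32"

print(log_probs_bf16.dtype, log_probs_fp32.dtype)
# The low-precision path can drift noticeably for large vocabularies.
print((log_probs_bf16.float() - log_probs_fp32).abs().max())
```

Assuming model arguments are forwarded to the constructor as usual, this would presumably be selectable with something like `--model_args pretrained=...,softmax_dtype=float32`; that usage is an inference, not something shown in this diff.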
@@ -28,6 +28,9 @@ try:
    from vllm import LLM, SamplingParams
    from vllm.lora.request import LoRARequest
    from vllm.transformers_utils.tokenizer import get_tokenizer

+    if parse_version(version("vllm")) >= parse_version("0.8.3"):
+        from vllm.entrypoints.chat_utils import resolve_hf_chat_template
except ModuleNotFoundError:
    pass
@@ -133,6 +136,16 @@ class VLLM(TemplateLM):
                "Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it."
            )

+        if parse_version(version("vllm")) >= parse_version("0.8.3"):
+            self.hf_chat_template = resolve_hf_chat_template(
+                tokenizer=self.tokenizer,
+                chat_template=None,
+                tools=None,
+                trust_remote_code=trust_remote_code,
+            )
+        else:
+            self.hf_chat_template = None

        self.custom_prefix_token_id = prefix_token_id
        if prefix_token_id is not None:
            eval_logger.info(
@@ -195,6 +208,7 @@ class VLLM(TemplateLM):
            tokenize=False,
            add_generation_prompt=add_generation_prompt,
            continue_final_message=not add_generation_prompt,
+            chat_template=self.hf_chat_template,
        )
        return chat_templated
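Both the import and the attribute are gated on the installed vLLM version. The same guard can be reproduced standalone with `importlib.metadata` and `packaging`; the helper name below is made up for illustration:

```python
from importlib.metadata import PackageNotFoundError, version

from packaging.version import parse as parse_version


def installed_at_least(package: str, minimum: str) -> bool:
    """Return True only if `package` is installed at version >= `minimum`."""
    try:
        return parse_version(version(package)) >= parse_version(minimum)
    except PackageNotFoundError:
        return False


# Mirrors the guard above: resolve_hf_chat_template only exists in newer vLLM
# releases, so both the import and its use are skipped on older installs.
if installed_at_least("vllm", "0.8.3"):
    from vllm.entrypoints.chat_utils import resolve_hf_chat_template  # noqa: F401
else:
    resolve_hf_chat_template = None
```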
@@ -5,165 +5,167 @@
For more information, including a full list of task names and their precise meanings or sources, follow the links provided to the individual README.md files for each subfolder.

| Task Family | Description | Language(s) |
|-------------|-------------|-------------|
| [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese |
| [acp_bench](acpbench/README.md) | Tasks evaluating the reasoning ability about Action, Change, and Planning | English |
| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
| [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese |
| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English |
| [arabic_leaderboard_complete](arabic_leaderboard_complete/README.md) | A full version of the tasks in the Open Arabic LLM Leaderboard, focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) |
| [arabic_leaderboard_light](arabic_leaderboard_light/README.md) | A light version of the tasks in the Open Arabic LLM Leaderboard (i.e., 10% samples of the test set in the original benchmarks), focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) |
| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic |
| [AraDICE](aradice/README.md) | A collection of multiple tasks carefully designed to evaluate dialectal and cultural capabilities in large language models (LLMs). | Arabic |
| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English |
| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English |
| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English |
| [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English |
| [basque_bench](basque_bench/README.md) | Collection of tasks in Basque encompassing various evaluation areas. | Basque |
| [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. | Basque |
| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German |
| [bbq](bbq/README.md) | A question-answering benchmark designed to measure social biases in language models across various demographic categories and contexts. | English |
| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) |
| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | |
| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) |
| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple |
| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English |
| [careqa](careqa/README.md) | Multiple choice and open-ended medical question answering based on the Spanish Specialised Healthcare Training (MIR) exams. | English, Spanish |
| [catalan_bench](catalan_bench/README.md) | Collection of tasks in Catalan encompassing various evaluation areas. | Catalan |
| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese |
| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese |
| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby |
| [commonsense_qa](commonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English |
| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian |
| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English |
| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French |
| csatqa | Tasks related to SAT and other standardized testing questions for academic assessment. | Korean |
| [darija_bench](darija_bench/README.md) | Traditional NLP tasks (translation, summarization, etc.) for Moroccan Darija | Moroccan Darija (some MT) |
| [darijahellaswag](darijahellaswag/README.md) | Moroccan Darija version of HellaSwag. | Moroccan Darija (MT) |
| [darijammlu](darijammlu/README.md) | Multiple-choice QA in Moroccan Darija (an Arabic dialect). | Moroccan Darija (MT) |
| [drop](drop/README.md) | Tasks requiring numerical reasoning, reading comprehension, and question answering. | English |
| [eq_bench](eq_bench/README.md) | Tasks focused on equality and ethics in question answering and decision-making. | English |
| [eus_exams](eus_exams/README.md) | Tasks based on various professional and academic exams in the Basque language. | Basque |
| [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque |
| [eus_reading](eus_reading/README.md) | Reading comprehension tasks specifically designed for the Basque language. | Basque |
| [eus_trivia](eus_trivia/README.md) | Trivia and knowledge testing tasks in the Basque language. | Basque |
-| [evalita-LLM](evalita-LLM/README.md) | A native Italian benchmark with diverse task formats and multiple prompts. | Italian |
+| [evalita_LLM](evalita_llm/README.md) | A native Italian benchmark with diverse task formats and multiple prompts. | Italian |
| [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English |
| [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English |
| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French |
| [galician_bench](galician_bench/README.md) | Collection of tasks in Galician encompassing various evaluation areas. | Galician |
| [global_mmlu](global_mmlu/README.md) | Collection of culturally sensitive and culturally agnostic MMLU tasks in 15 languages with human translations or post-edits. | Multiple (15 languages) |
| [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
| [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English |
| [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
| [groundcocoa](groundcocoa/README.md) | A benchmark evaluating the conditional and compositional reasoning of language models using a grounding task. | English |
| [haerae](haerae/README.md) | Tasks focused on assessing detailed factual and historical knowledge. | Korean |
| [headqa](headqa/README.md) | A high-level education-based question answering dataset to test specialized knowledge. | Spanish, English |
| [hellaswag](hellaswag/README.md) | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English |
| [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to evaluate the ethical reasoning capabilities of models. | English |
| [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English |
| [histoires_morales](histoires_morales/README.md) | A dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | French (Some MT) |
| [hrm8k](hrm8k/README.md) | A challenging bilingual math reasoning benchmark for Korean and English. | Korean (Some MT), English (Some MT) |
| [humaneval](humaneval/README.md) | Code generation tasks that measure functional correctness for synthesizing programs from docstrings. | Python |
| [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English |
| [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English |
| [japanese_leaderboard](japanese_leaderboard/README.md) | Japanese language understanding tasks to benchmark model performance on various linguistic aspects. | Japanese |
| [jsonschema_bench](jsonschema_bench/README.md) | Evaluate the ability of LLMs to generate JSON objects that conform to a given JSON schema, including API, configuration files, and other structured data formats. | JSON |
| [kbl](kbl/README.md) | Korean Benchmark for Legal Language Understanding. | Korean |
| [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean |
| [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean |
| [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean |
| [lambada](lambada/README.md) | Tasks designed to predict the endings of text passages, testing language prediction skills. | English |
| [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English |
| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian |
| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese |
| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). These tasks are static and will not change over time. | English |
| [lingoly](lingoly/README.md) | Challenging logical reasoning benchmark in low-resource languages with controls for memorization | English, Multilingual |
+| [llama3](llama3/README.md) | Evals reproducing those provided by the LLAMA team in the Hugging Face repo (instruct) | English, Multilingual |
| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese |
| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese |
+| [longbench](longbench/README.md) | LongBench evaluates language models' ability to understand lengthy texts across multiple tasks and languages. | English, Chinese |
| [mastermind](mastermind/README.md) | Reasoning benchmark based on the board game of Mastermind. | English |
| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English |
| [mbpp](mbpp/README.md) | A benchmark designed to measure the ability to synthesize short Python programs from natural language descriptions. | Python |
| [meddialog](meddialog/README.md) | Medical open-ended QA and Question Entailment stemming from the MedDialog dataset. | English |
| [medtext](medtext/README.md) | Medical open-ended QA from the MedText Clinical Notes dataset. | English |
| [mimic_repsum](mimic_repsum/README.md) | Medical report summarization from the MIMIC-III dataset. | English |
| [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English |
| [med_concepts_qa](med_concepts_qa/README.md) | Benchmark for evaluating LLMs on their abilities to interpret medical codes and distinguish between medical concepts. | English |
| [metabench](metabench/README.md) | Distilled versions of six popular benchmarks which are highly predictive of overall benchmark performance and of a single general ability latent trait. | English |
| [mediqa_qa2019](mediqa_qa2019/README.md) | Open-ended healthcare question answering benchmark from the MEDIQA 2019 challenge. | English |
| medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English |
| medqa | Multiple choice question answering based on the United States Medical License Exams. | |
| [meqsum](meqsum/README.md) | Healthcare Question Entailment benchmark from the MeqSum dataset. | |
| [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu |
| [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English |
| [mlqa](mlqa/README.md) | MultiLingual Question Answering benchmark dataset for evaluating cross-lingual question answering performance. | English, Arabic, German, Spanish, Hindi, Vietnamese, Simplified Chinese |
| [mmlu](mmlu/README.md) | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
| [mmlu_pro](mmlu_pro/README.md) | A refined set of MMLU, integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. | English |
| [mmlu-pro-plus](mmlu-pro-plus/README.md) | A new test set for evaluating shortcut learning and higher-order reasoning of LLMs. | English |
| [mmlu_prox](mmlu_prox/README.md) | A multilingual benchmark that extends MMLU-Pro to multiple typologically diverse languages with human validation. | English, Japanese, Chinese, Korean, French, German, Spanish, Portuguese, Swahili, Thai, Arabic, Hindi, Bengali |
| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English |
| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
| [moral_stories](moral_stories/README.md) | A crowd-sourced dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | English |
| [mts_dialog](mts_dialog/README.md) | Open-ended healthcare QA from the MTS-Dialog dataset. | English |
| [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English |
| [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English |
| [okapi/arc_multilingual](okapi/arc_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** |
| [okapi/hellaswag_multilingual](okapi/hellaswag_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (30 languages) **Machine Translated.** |
| okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) **Machine Translated.** |
| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** |
| [olaph](olaph/README.md) | Open-ended medical factuality Question Answering from the OLAPH dataset. | English |
| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English |
| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English |
| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean |
| [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English |
| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English |
| [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English |
| [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish |
| [portuguese_bench](portuguese_bench/README.md) | Collection of tasks in European Portuguese encompassing various evaluation areas. | Portuguese |
| [prost](prost/README.md) | Tasks requiring understanding of professional standards and ethics in various domains. | English |
| [pubmedqa](pubmedqa/README.md) | Question answering tasks based on PubMed research articles for biomedical understanding. | English |
| [qa4mre](qa4mre/README.md) | Question Answering for Machine Reading Evaluation, assessing comprehension and reasoning. | English |
| [qasper](qasper/README.md) | Question Answering dataset based on academic papers, testing in-depth scientific knowledge. | English |
| [race](race/README.md) | Reading comprehension assessment tasks based on English exams in China. | English |
| realtoxicityprompts | Tasks to evaluate language models for generating text with potential toxicity. | |
| [ruler](ruler/README.md) | RULER is a benchmark for testing how well language models handle long pieces of text. Requires custom arg (see readme) | English |
| [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English |
| [score](score/README.md) | Systematic consistency and robustness evaluation for LLMs on 3 datasets (MMLU-Pro, AGIEval and MATH) | English |
| [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English |
| [simple_cooccurrence_bias](simple_cooccurrence_bias/README.md) | A metric that evaluates language models for biases based on stereotypical word associations and co-occurrences in text. | English |
| [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning. | English |
| [spanish_bench](spanish_bench/README.md) | Collection of tasks in Spanish encompassing various evaluation areas. | Spanish |
| [squad_completion](squad_completion/README.md) | A variant of the SQuAD question answering task designed for zero-shot evaluation of small LMs. | English |
| [squadv2](squadv2/README.md) | Stanford Question Answering Dataset version 2, a reading comprehension benchmark. | English |
| [storycloze](storycloze/README.md) | Tasks to predict story endings, focusing on narrative logic and coherence. | English |
| [super_glue](super_glue/README.md) | A suite of challenging tasks designed to test a range of language understanding skills. | English |
| [swag](swag/README.md) | Situations With Adversarial Generations, predicting the next event in videos. | English |
| [swde](swde/README.md) | Information extraction tasks from semi-structured web pages. | English |
| [tinyBenchmarks](tinyBenchmarks/README.md) | Evaluation of large language models with fewer examples using tiny versions of popular benchmarks. | English |
| [tmmluplus](tmmluplus/README.md) | An extended set of tasks under the TMMLU framework for broader academic assessments. | Traditional Chinese |
| [toxigen](toxigen/README.md) | Tasks designed to evaluate language models on their propensity to generate toxic content. | English |
| [translation](translation/README.md) | Tasks focused on evaluating the language translation capabilities of models. | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese |
| [triviaqa](triviaqa/README.md) | A large-scale dataset for trivia question answering to test general knowledge. | English |
| [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English |
| [turkishmmlu](turkishmmlu/README.md) | A multiple-choice QA test modeled after MMLU, written in Turkish based on Turkish high-school level exams. | Turkish |
| [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English |
| [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English |
| [webqs](webqs/README.md) | Web-based question answering tasks designed to evaluate internet search and retrieval. | English |
| [wikitext](wikitext/README.md) | Tasks based on text from Wikipedia articles to assess language modeling and generation. | English |
| [winogender](winogender/README.md) | A diagnostic dataset that tests for gender bias in coreference resolution by measuring how models associate pronouns with different occupations. | English |
| [winogrande](winogrande/README.md) | A large-scale dataset for coreference resolution, inspired by the Winograd Schema Challenge. | English |
| [wmdp](wmdp/README.md) | A benchmark with the objective of minimizing performance, based on potentially-sensitive multiple-choice knowledge questions. | English |
| [wmt2016](wmt2016/README.md) | Tasks from the WMT 2016 shared task, focusing on translation between multiple languages. | English, Czech, German, Finnish, Russian, Romanian, Turkish |
| [wsc273](wsc273/README.md) | The Winograd Schema Challenge, a test of commonsense reasoning and coreference resolution. | English |
| [xcopa](xcopa/README.md) | Cross-lingual Choice of Plausible Alternatives, testing reasoning in multiple languages. | Estonian, Haitian, Indonesian, Italian, Quechua, Swahili, Tamil, Thai, Turkish, Vietnamese, Chinese |
| [xnli](xnli/README.md) | Cross-Lingual Natural Language Inference to test understanding across different languages. | Arabic, Bulgarian, German, Greek, English, Spanish, French, Hindi, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese |
| [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque |
| [xquad](xquad/README.md) | Cross-lingual Question Answering Dataset in multiple languages. | Arabic, German, Greek, English, Spanish, Hindi, Romanian, Russian, Thai, Turkish, Vietnamese, Chinese |
| [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese | | [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. | Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese |
| [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese | | [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese |
## Multilingual Tasks ## Multimodal Tasks
| Task Family | Description | Modality | | Task Family | Description | Modality |
|------------------------------|---------------------------------------------------------------------------------------------------------|-------------| |------------------------------|---------------------------------------------------------------------------------------------------------|-------------|
| [chartqa](chartqa/README.md) | A benchmark for question answering about charts that requires both visual and logical reasoning. | Image, Text | | [chartqa](chartqa/README.md) | A benchmark for question answering about charts that requires both visual and logical reasoning. | Image, Text |
......
...@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench ...@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench
test_split: test test_split: test
dataset_name: 2wikimqa dataset_name: 2wikimqa
doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:' doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:'
doc_to_target: '{{answers}}' doc_to_target: '{{answers[0]}}'
generation_kwargs: generation_kwargs:
max_gen_toks: 32 max_gen_toks: 32
temperature: 1 temperature: 1
do_sample: True do_sample: True
until: []
metric_list: metric_list:
- metric: !function metrics.qa_f1_score - metric: !function metrics.qa_f1_score
aggregation: mean aggregation: mean
higher_is_better: True higher_is_better: True
metadata: metadata:
version: 1.0 version: 2.0
tag: tag:
- longbench_e - longbench_e
task: longbench_2wikimqa_e task: longbench_2wikimqa_e
...@@ -5,14 +6,15 @@ dataset_path: THUDM/LongBench ...@@ -5,14 +6,15 @@ dataset_path: THUDM/LongBench
test_split: test test_split: test
dataset_name: 2wikimqa_e dataset_name: 2wikimqa_e
doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:' doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:'
doc_to_target: '{{answers}}' doc_to_target: '{{answers[0]}}'
generation_kwargs: generation_kwargs:
max_gen_toks: 32 max_gen_toks: 32
temperature: 1 temperature: 1
do_sample: True do_sample: True
until: []
metric_list: metric_list:
- metric: !function metrics.qa_f1_score - metric: !function metrics.qa_f1_score
aggregation: mean aggregation: mean
higher_is_better: True higher_is_better: True
metadata: metadata:
version: 1.0 version: 2.0
...@@ -95,3 +95,4 @@ If other tasks on this dataset are already supported: ...@@ -95,3 +95,4 @@ If other tasks on this dataset are already supported:
* [x] Have you noted which, if any, published evaluation setups are matched by this variant? * [x] Have you noted which, if any, published evaluation setups are matched by this variant?
### Changelog ### Changelog
v2.0: fix doc_to_target; add vcsum
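The `doc_to_target` fix noted above swaps the raw `answers` list for its first element. A minimal sketch of why that matters when the Jinja template is rendered (assuming only that `jinja2` is installed; the example doc is invented):

```python
# Rendering the whole list produces its Python repr, while indexing yields a
# clean reference string for the metric to compare against.
from jinja2 import Template

doc = {"answers": ["Paris", "the capital of France"]}  # hypothetical example doc
print(Template("{{answers}}").render(**doc))     # ['Paris', 'the capital of France']
print(Template("{{answers[0]}}").render(**doc))  # Paris
```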
...@@ -138,7 +138,7 @@ DATASETS = [ ...@@ -138,7 +138,7 @@ DATASETS = [
def parse_args(): def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--save_prefix_path", default="longbench") parser.add_argument("--save_prefix_path", default="")
return parser.parse_args() return parser.parse_args()
...@@ -156,6 +156,7 @@ generation_kwargs: ...@@ -156,6 +156,7 @@ generation_kwargs:
max_gen_toks: {{ generation_kwargs.max_gen_toks }} max_gen_toks: {{ generation_kwargs.max_gen_toks }}
temperature: {{ generation_kwargs.temperature }} temperature: {{ generation_kwargs.temperature }}
do_sample: {{ generation_kwargs.do_sample }} do_sample: {{ generation_kwargs.do_sample }}
until: {{ generation_kwargs.until }}
metric_list: metric_list:
- metric: {{ metric_list[0].metric }} - metric: {{ metric_list[0].metric }}
aggregation: {{ metric_list[0].aggregation }} aggregation: {{ metric_list[0].aggregation }}
...@@ -171,10 +172,21 @@ if __name__ == "__main__": ...@@ -171,10 +172,21 @@ if __name__ == "__main__":
template = env.from_string(template_str) template = env.from_string(template_str)
for ds in DATASETS: for ds in DATASETS:
df = ds[:-2] if ds.endswith("_e") else ds df = ds[:-2] if ds.endswith("_e") else ds
# from https://github.com/THUDM/LongBench/blob/2e00731f8d0bff23dc4325161044d0ed8af94c1e/LongBench/eval.py#L52C25-L52C29
if df in ["trec", "triviaqa", "samsum", "lsht"] + [
"trec_e",
"triviaqa_e",
"samsum_e",
"lsht_e",
]:
until = ["\n"]
else:
until = []
generation_kwargs = { generation_kwargs = {
"max_gen_toks": dataset2maxlen[df], "max_gen_toks": dataset2maxlen[df],
"temperature": 1, "temperature": 1,
"do_sample": True, "do_sample": True,
"until": until,
} }
raw_doc_to_text = ( raw_doc_to_text = (
dataset2prompt[df] dataset2prompt[df]
...@@ -199,10 +211,10 @@ if __name__ == "__main__": ...@@ -199,10 +211,10 @@ if __name__ == "__main__":
"test_split": "test", "test_split": "test",
"dataset_name": ds, "dataset_name": ds,
"doc_to_text": raw_doc_to_text, "doc_to_text": raw_doc_to_text,
"doc_to_target": "{{answers}}", "doc_to_target": "{{answers[0]}}",
"generation_kwargs": generation_kwargs, "generation_kwargs": generation_kwargs,
"metric_list": metric_list, "metric_list": metric_list,
"metadata": {"version": "1.0"}, "metadata": {"version": "2.0"},
} }
# Render template # Render template
...@@ -211,35 +223,3 @@ if __name__ == "__main__": ...@@ -211,35 +223,3 @@ if __name__ == "__main__":
# Save to file # Save to file
with open(args.save_prefix_path + f"{ds}.yaml", "w") as f: with open(args.save_prefix_path + f"{ds}.yaml", "w") as f:
f.write(rendered_yaml) f.write(rendered_yaml)
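For reference, a standalone sketch of the stop-sequence rule the generator now applies (the function name is illustrative; the behaviour mirrors the LongBench eval script referenced in the comment above): few-shot subsets stop at the first newline, all other subsets keep `until` empty.

```python
# Few-shot LongBench subsets terminate generation at a newline; the rest rely
# solely on max_gen_toks.
FEW_SHOT_SUBSETS = {"trec", "triviaqa", "samsum", "lsht"}

def stop_sequences(dataset: str) -> list[str]:
    base = dataset[:-2] if dataset.endswith("_e") else dataset
    return ["\n"] if base in FEW_SHOT_SUBSETS else []

print(stop_sequences("samsum_e"))  # ['\n']
print(stop_sequences("hotpotqa"))  # []
```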
# for ds in DATASETS:
# df = ds[:-2] if ds.endswith("_e") else ds
# generation_kwargs = {"max_gen_toks": dataset2maxlen[df], "temperature": 1, "do_sample": False}
# # Escape newlines and curly braces
# raw_doc_to_text = dataset2prompt[df].replace("\n", "\\n").replace("{", "{{").replace("}", "}}")
# metric_list = [
# {"metric": f"!function metrics.{dataset2metric[df]}", "aggregation": "mean", "higher_is_better": True}]
# yaml_dict = {
# "tag": ["longbench_e" if ds.endswith("_e") else "longbench"],
# "task": f"longbench_{ds}",
# "dataset_path": "THUDM/LongBench",
# "test_split": "test",
# "dataset_name": ds,
# "doc_to_text": raw_doc_to_text,
# "doc_to_target": "{{answers}}",
# "generation_kwargs": generation_kwargs,
# "metric_list": metric_list,
# "metadata": {"version": "1.0"}
# }
# template = env.from_string(yaml_dict)
#
#
# file_save_path = args.save_prefix_path + f"{ds}.yaml"
# with open(file_save_path, "w", encoding="utf-8") as yaml_file:
# yaml.dump(
# yaml_dict,
# yaml_file,
# allow_unicode=True,
# default_flow_style=False,
# sort_keys=False
# )
...@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench ...@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench
test_split: test test_split: test
dataset_name: dureader dataset_name: dureader
doc_to_text: '请基于给定的文章回答下述问题。\n\n文章:{{context}}\n\n请基于上述文章回答下面的问题。\n\n问题:{{input}}\n回答:' doc_to_text: '请基于给定的文章回答下述问题。\n\n文章:{{context}}\n\n请基于上述文章回答下面的问题。\n\n问题:{{input}}\n回答:'
doc_to_target: '{{answers}}' doc_to_target: '{{answers[0]}}'
generation_kwargs: generation_kwargs:
max_gen_toks: 128 max_gen_toks: 128
temperature: 1 temperature: 1
do_sample: True do_sample: True
until: []
metric_list: metric_list:
- metric: !function metrics.rouge_zh_score - metric: !function metrics.rouge_zh_score
aggregation: mean aggregation: mean
higher_is_better: True higher_is_better: True
metadata: metadata:
version: 1.0 version: 2.0
...@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench ...@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench
test_split: test test_split: test
dataset_name: gov_report dataset_name: gov_report
doc_to_text: 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{{context}}\n\nNow, write a one-page summary of the report.\n\nSummary:' doc_to_text: 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{{context}}\n\nNow, write a one-page summary of the report.\n\nSummary:'
doc_to_target: '{{answers}}' doc_to_target: '{{answers[0]}}'
generation_kwargs: generation_kwargs:
max_gen_toks: 512 max_gen_toks: 512
temperature: 1 temperature: 1
do_sample: True do_sample: True
until: []
metric_list: metric_list:
- metric: !function metrics.rouge_score - metric: !function metrics.rouge_score
aggregation: mean aggregation: mean
higher_is_better: True higher_is_better: True
metadata: metadata:
version: 1.0 version: 2.0
...@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench ...@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench
test_split: test test_split: test
dataset_name: gov_report_e dataset_name: gov_report_e
doc_to_text: 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{{context}}\n\nNow, write a one-page summary of the report.\n\nSummary:' doc_to_text: 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{{context}}\n\nNow, write a one-page summary of the report.\n\nSummary:'
doc_to_target: '{{answers}}' doc_to_target: '{{answers[0]}}'
generation_kwargs: generation_kwargs:
max_gen_toks: 512 max_gen_toks: 512
temperature: 1 temperature: 1
do_sample: True do_sample: True
until: []
metric_list: metric_list:
- metric: !function metrics.rouge_score - metric: !function metrics.rouge_score
aggregation: mean aggregation: mean
higher_is_better: True higher_is_better: True
metadata: metadata:
version: 1.0 version: 2.0
...@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench ...@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench
test_split: test test_split: test
dataset_name: hotpotqa dataset_name: hotpotqa
doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:' doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:'
doc_to_target: '{{answers}}' doc_to_target: '{{answers[0]}}'
generation_kwargs: generation_kwargs:
max_gen_toks: 32 max_gen_toks: 32
temperature: 1 temperature: 1
do_sample: True do_sample: True
until: []
metric_list: metric_list:
- metric: !function metrics.qa_f1_score - metric: !function metrics.qa_f1_score
aggregation: mean aggregation: mean
higher_is_better: True higher_is_better: True
metadata: metadata:
version: 1.0 version: 2.0
...@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench ...@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench
test_split: test test_split: test
dataset_name: hotpotqa_e dataset_name: hotpotqa_e
doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:' doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:'
doc_to_target: '{{answers}}' doc_to_target: '{{answers[0]}}'
generation_kwargs: generation_kwargs:
max_gen_toks: 32 max_gen_toks: 32
temperature: 1 temperature: 1
do_sample: True do_sample: True
until: []
metric_list: metric_list:
- metric: !function metrics.qa_f1_score - metric: !function metrics.qa_f1_score
aggregation: mean aggregation: mean
higher_is_better: True higher_is_better: True
metadata: metadata:
version: 1.0 version: 2.0
...@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench ...@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench
test_split: test test_split: test
dataset_name: lcc dataset_name: lcc
doc_to_text: 'Please complete the code given below. \n{{context}}Next line of code:\n' doc_to_text: 'Please complete the code given below. \n{{context}}Next line of code:\n'
doc_to_target: '{{answers}}' doc_to_target: '{{answers[0]}}'
generation_kwargs: generation_kwargs:
max_gen_toks: 64 max_gen_toks: 64
temperature: 1 temperature: 1
do_sample: True do_sample: True
until: []
metric_list: metric_list:
- metric: !function metrics.code_sim_score - metric: !function metrics.code_sim_score
aggregation: mean aggregation: mean
higher_is_better: True higher_is_better: True
metadata: metadata:
version: 1.0 version: 2.0
...@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench ...@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench
test_split: test test_split: test
dataset_name: lcc_e dataset_name: lcc_e
doc_to_text: 'Please complete the code given below. \n{{context}}Next line of code:\n' doc_to_text: 'Please complete the code given below. \n{{context}}Next line of code:\n'
doc_to_target: '{{answers}}' doc_to_target: '{{answers[0]}}'
generation_kwargs: generation_kwargs:
max_gen_toks: 64 max_gen_toks: 64
temperature: 1 temperature: 1
do_sample: True do_sample: True
until: []
metric_list: metric_list:
- metric: !function metrics.code_sim_score - metric: !function metrics.code_sim_score
aggregation: mean aggregation: mean
higher_is_better: True higher_is_better: True
metadata: metadata:
version: 1.0 version: 2.0
...@@ -6,14 +6,16 @@ dataset_path: THUDM/LongBench ...@@ -6,14 +6,16 @@ dataset_path: THUDM/LongBench
test_split: test test_split: test
dataset_name: lsht dataset_name: lsht
doc_to_text: '请判断给定新闻的类别,下面是一些例子。\n\n{{context}}\n{{input}}' doc_to_text: '请判断给定新闻的类别,下面是一些例子。\n\n{{context}}\n{{input}}'
doc_to_target: '{{answers}}' doc_to_target: '{{answers[0]}}'
process_results: !function metrics.classification_score
generation_kwargs: generation_kwargs:
max_gen_toks: 64 max_gen_toks: 64
temperature: 1 temperature: 1
do_sample: True do_sample: True
until: ['\n']
metric_list: metric_list:
- metric: !function metrics.classification_score - metric: "classification_score"
aggregation: mean aggregation: mean
higher_is_better: True higher_is_better: True
metadata: metadata:
version: 1.0 version: 2.0
...@@ -124,12 +124,10 @@ def code_sim_score(predictions: list[str], references: list[str], **kwargs) -> f ...@@ -124,12 +124,10 @@ def code_sim_score(predictions: list[str], references: list[str], **kwargs) -> f
return fuzz.ratio(prediction, ground_truth) / 100 return fuzz.ratio(prediction, ground_truth) / 100
def classification_score( def classification_score(doc: dict, results: list[str], **kwargs) -> dict:
predictions: list[str], references: list[str], **kwargs prediction, ground_truth = results[0], doc["answers"][0]
) -> float:
prediction, ground_truth = predictions[0], references[0]
em_match_list = [] em_match_list = []
all_classes = kwargs["all_classes"] all_classes = doc["all_classes"]
for class_name in all_classes: for class_name in all_classes:
if class_name in prediction: if class_name in prediction:
em_match_list.append(class_name) em_match_list.append(class_name)
...@@ -140,12 +138,14 @@ def classification_score( ...@@ -140,12 +138,14 @@ def classification_score(
score = 1.0 / len(em_match_list) score = 1.0 / len(em_match_list)
else: else:
score = 0.0 score = 0.0
return score return {"classification_score": score}
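With the hunk above, `classification_score` becomes a `process_results`-style hook: it receives the document and the model outputs directly and returns a dict keyed by the metric name declared in the lsht YAML. A minimal standalone call (the example doc and prediction are invented; the expected score assumes exactly one class name matches the gold label):

```python
# Hypothetical check of the new (doc, results) -> dict signature; assumes it is
# run from the task folder so that metrics.py is importable.
from metrics import classification_score

doc = {"answers": ["体育"], "all_classes": ["体育", "财经", "教育"]}
results = ["这条新闻应归类为体育。"]

print(classification_score(doc, results))
# expected: {'classification_score': 1.0} when exactly one listed class appears
```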
def rouge_score(predictions: list[str], references: list[str], **kwargs) -> float: def rouge_score(predictions: list[str], references: list[str], **kwargs) -> float:
global rouge
if "rouge" not in globals():
rouge = Rouge()
prediction, ground_truth = predictions[0], references[0] prediction, ground_truth = predictions[0], references[0]
rouge = Rouge()
try: try:
scores = rouge.get_scores([prediction], [ground_truth], avg=True) scores = rouge.get_scores([prediction], [ground_truth], avg=True)
# ruff: noqa # ruff: noqa
...@@ -162,7 +162,7 @@ def rouge_zh_score(predictions: list[str], references: list[str], **kwargs) -> f ...@@ -162,7 +162,7 @@ def rouge_zh_score(predictions: list[str], references: list[str], **kwargs) -> f
return score return score
def f1_score(predictions: list[str], references: list[str], **kwargs): def f1_score(predictions: list[str], references: list[str], **kwargs) -> float:
try: try:
prediction, ground_truth = predictions[0], references[0] prediction, ground_truth = predictions[0], references[0]
except: except:
......