Commit efb46937 authored by Baber

Merge branch 'main' into convert_gen

# Conflicts:
#	lm_eval/__main__.py
#	lm_eval/evaluator.py
parents 7fbf899c ade01428
import copy
import logging
import os
from datetime import timedelta
from pathlib import Path
......@@ -39,7 +40,7 @@ from lm_eval.models.utils import (
)
-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("hf-auto", "hf", "huggingface")
......
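Across the model backends touched by this merge, the recurring change is the same: the shared `eval_logger` imported from `lm_eval.utils` is replaced by a module-level logger created with `logging.getLogger(__name__)`. A minimal sketch of the pattern (module and function names here are illustrative, not taken from the diff):

```python
import logging

# Old pattern (removed): one logger object shared by every module.
# from lm_eval.utils import eval_logger

# New pattern: each module owns a logger named after the module, so records
# carry their origin (e.g. "lm_eval.models.huggingface") and can be filtered
# or silenced per module by the host application.
eval_logger = logging.getLogger(__name__)


def load_backend(name: str) -> None:
    # Illustrative helper, not part of the harness.
    eval_logger.info("loading backend %s", name)
```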
import copy
import json
import logging
import os
from functools import lru_cache
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Type, cast
......@@ -10,7 +11,10 @@ from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.api_models import JsonChatStr
-from lm_eval.utils import eval_logger, simple_parse_args_string
+from lm_eval.utils import simple_parse_args_string
+eval_logger = logging.getLogger(__name__)
class LogLikelihoodResult(NamedTuple):
......
......@@ -13,6 +13,7 @@
# limitations under the License.
import importlib
import logging
import pathlib
from copy import deepcopy
from typing import List, Literal
......@@ -27,13 +28,15 @@ from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator
from lm_eval.utils import (
-eval_logger,
get_rolling_token_windows,
make_disjoint_window,
simple_parse_args_string,
)
+eval_logger = logging.getLogger(__name__)
def _patch_pretrained_cfg(
pretrained_cfg, trainer, tensor_model_parallel_size, pipeline_model_parallel_size
):
......
import copy
import logging
from typing import List, Optional, Tuple, Union
import numpy
......@@ -13,7 +14,7 @@ from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("sparseml")
......
import logging
import os
from functools import cached_property
from operator import itemgetter
......@@ -6,7 +7,9 @@ from typing import Any, Dict, List, Optional, Tuple, Union
from lm_eval.api.registry import register_model
from lm_eval.models.api_models import TemplateAPI
from lm_eval.models.utils import handle_stop_sequences
-from lm_eval.utils import eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("local-completions")
......@@ -288,4 +291,6 @@ class OpenAIChatCompletion(LocalChatCompletion):
if "o1" in self.model:
output.pop("stop")
output["temperature"] = 1
elif "o3" in self.model:
output.pop("temperature")
return output
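The hunk above special-cases OpenAI reasoning models when building the request payload: `o1` models get their stop sequences dropped and temperature pinned to 1, and the newly added `o3` branch drops the temperature key entirely. A standalone sketch of that adjustment (the function name and the shape of `output` are assumptions for illustration):

```python
def adjust_sampling_kwargs(output: dict, model: str) -> dict:
    """Mirror of the per-model tweaks shown in the diff above (sketch only)."""
    if "o1" in model:
        # o1 endpoints reject custom stop sequences and non-default temperature.
        output.pop("stop", None)
        output["temperature"] = 1
    elif "o3" in model:
        # o3 endpoints do not accept a temperature parameter.
        output.pop("temperature", None)
    return output


# Example: a payload destined for an o3 model loses its temperature key.
print(adjust_sampling_kwargs({"temperature": 0, "stop": ["\n"]}, "o3-mini"))
# -> {'stop': ['\n']}
```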
import logging
from importlib.util import find_spec
from lm_eval import utils
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
from lm_eval.models.utils import get_dtype
-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("ipex")
......
import json
import logging
from importlib.util import find_spec
from pathlib import Path
from lm_eval import utils
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("openvino")
......
This diff is collapsed.
......@@ -2,6 +2,7 @@ import collections
import fnmatch
import gc
import itertools
import logging
import time
from functools import wraps
from typing import (
......@@ -22,7 +23,8 @@ from typing import (
import torch
import transformers
-from lm_eval.utils import eval_logger
+eval_logger = logging.getLogger(__name__)
if TYPE_CHECKING:
......
import copy
import logging
from importlib.metadata import version
from importlib.util import find_spec
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
......@@ -17,7 +18,6 @@ from lm_eval.models.utils import (
undistribute,
)
from lm_eval.utils import (
-eval_logger,
get_rolling_token_windows,
make_disjoint_window,
)
......@@ -34,7 +34,7 @@ except ModuleNotFoundError:
if TYPE_CHECKING:
pass
-eval_logger = eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("vllm")
......@@ -75,7 +75,6 @@ class VLLM(TemplateLM):
"Please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
)
assert "cuda" in device or device is None, "vLLM only supports CUDA"
assert max_length is None or max_model_len is None, (
"Either max_length or max_model_len may be provided, but not both"
)
......@@ -110,7 +109,7 @@ class VLLM(TemplateLM):
eval_logger.warning(
"You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached."
)
self.model_args["worker_use_ray"] = True
self.model_args["distributed_executor_backend"] = "ray"
self.batch_size = "auto"
eval_logger.info("Manual batching is not compatible with data parallelism.")
......@@ -244,15 +243,13 @@ class VLLM(TemplateLM):
temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
)
if self.data_parallel_size > 1:
-# vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
+# vLLM hangs if resources are set in ray.remote
# also seems to only work with decorator and not with ray.remote() fn
# see https://github.com/vllm-project/vllm/issues/973
-# note: this has changed on 0.3.3, and it only works now if num_gpus are set.
-# but then tensor_parallel breaks
@ray.remote
def run_inference_one_model(
model_args: dict,
-sampling_params,
+sampling_params: SamplingParams,
requests: List[List[int]],
lora_request: LoRARequest,
):
......
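For data-parallel inference, the code above wraps the per-replica worker in the `@ray.remote` decorator (the comment notes the `ray.remote()` function form does not work here, see vllm-project/vllm#973). A simplified, self-contained sketch of the dispatch pattern with the vLLM engine stubbed out:

```python
from typing import List

import ray

ray.init(ignore_reinit_error=True)


@ray.remote  # decorator form, matching the diff above
def run_inference_one_model(model_args: dict, requests: List[List[int]]) -> List[int]:
    # The real worker builds an LLM(**model_args) and calls generate();
    # here we just return the request lengths so the sketch stays runnable.
    return [len(req) for req in requests]


# Split requests across two data-parallel replicas and gather the results.
chunks = [[[1, 2, 3], [4, 5]], [[6], [7, 8, 9, 10]]]
futures = [run_inference_one_model.remote({"model": "stub"}, chunk) for chunk in chunks]
print(ray.get(futures))  # [[3, 2], [1, 4]]
```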
import copy
import logging
from typing import Dict, List, Optional
import transformers
......@@ -14,7 +15,9 @@ from lm_eval.models.utils import (
undistribute,
)
from lm_eval.models.vllm_causallms import VLLM
-from lm_eval.utils import eval_logger
+eval_logger = logging.getLogger(__name__)
try:
......@@ -106,11 +109,9 @@ class VLLM_VLM(VLLM):
temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
)
if self.data_parallel_size > 1:
-# vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
+# vLLM hangs if resources are set in ray.remote
# also seems to only work with decorator and not with ray.remote() fn
# see https://github.com/vllm-project/vllm/issues/973
-# note: this has changed on 0.3.3, and it only works now if num_gpus are set.
-# but then tensor_parallel breaks
@ray.remote
def run_inference_one_model(
model_args: dict, sampling_params, requests: List[List[dict]]
......
import ast
import logging
import os
from typing import Dict
from lm_eval import utils
-from lm_eval.utils import eval_logger
+eval_logger = logging.getLogger(__name__)
# Prompt library.
# Stores prompts in a dictionary indexed by 2 levels:
# prompt category name, and prompt name.
......
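As the comment above describes, the prompt library is a two-level dictionary keyed first by prompt category name and then by prompt name. A toy illustration of that shape (the variable name and entry are made up, not taken from the registry):

```python
# Two-level prompt store: category name -> prompt name -> template string.
PROMPT_LIBRARY = {
    "qa-basic": {
        "question-newline-answer": "Question: {{question}}\nAnswer:",
    },
}

# Lookup mirrors the indexing scheme: first the category, then the prompt.
template = PROMPT_LIBRARY["qa-basic"]["question-newline-answer"]
print(template)
```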
......@@ -42,6 +42,7 @@
| [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque |
| [eus_reading](eus_reading/README.md) | Reading comprehension tasks specifically designed for the Basque language. | Basque |
| [eus_trivia](eus_trivia/README.md) | Trivia and knowledge testing tasks in the Basque language. | Basque |
| [evalita-LLM](evalita-LLM/README.md) | A native Italian benchmark with diverse task formats and multiple prompts. | Italian |
| [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English |
| [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English |
| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French |
......@@ -50,6 +51,7 @@
| [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
| [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English |
| [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
| [groundcocoa](groundcocoa/README.md) | A benchmark evaluating the conditional and compositional reasoning of language models using a grounding task. | English |
| [haerae](haerae/README.md) | Tasks focused on assessing detailed factual and historical knowledge. | Korean |
| [headqa](headqa/README.md) | A high-level education-based question answering dataset to test specialized knowledge. | Spanish, English |
| [hellaswag](hellaswag/README.md) | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English |
......@@ -85,7 +87,7 @@
| [mlqa](mlqa/README.md) | MultiLingual Question Answering benchmark dataset for evaluating cross-lingual question answering performance. | English, Arabic, German, Spanish, Hindi, Vietnamese, Simplified Chinese |
| [mmlu](mmlu/README.md) | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
| [mmlu_pro](mmlu_pro/README.md) | A refined set of MMLU, integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. | English |
| [mmlu-pro-plus](mmlu-pro-plus/README.md) | A new test set for evaluating shortcut learning and higher-order reasoning of LLMs. | English |
| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English |
| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
| [moral_stories](moral_stories/README.md) | A crowd-sourced dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | English
......
......@@ -14,6 +14,8 @@ from lm_eval.tasks.mmlu_pro.utils import doc_to_text
GROUP_ONLY_KEYS = list(GroupConfig().to_dict().keys())
eval_logger = logging.getLogger(__name__)
def convert_mcq_to_generative(cfg: dict):
prompt = """Given the following question and candidate answers, choose the correct answer."""
......@@ -71,15 +73,14 @@ class TaskManager:
def __init__(
self,
verbosity="INFO",
verbosity: Optional[str] = None,
include_path: Optional[Union[str, List]] = None,
include_defaults: bool = True,
mcq_to_generative: bool = False,
) -> None:
-self.verbosity = verbosity
+if verbosity is not None:
+    utils.setup_logging(verbosity)
self.include_path = include_path
-self.logger = utils.eval_logger
-self.logger.setLevel(getattr(logging, f"{verbosity}"))
self._task_index = self.initialize_tasks(
include_path=include_path, include_defaults=include_defaults
......@@ -513,7 +514,7 @@ class TaskManager:
"yaml_path": -1,
}
elif tasks_and_groups[tag]["type"] != "tag":
-self.logger.info(
+eval_logger.info(
f"The tag '{tag}' is already registered as a group, this tag will not be registered. "
"This may affect tasks you want to call."
)
......@@ -576,7 +577,7 @@ class TaskManager:
config, task, tasks_and_groups, print_info
)
else:
self.logger.debug(f"File {f} in {root} could not be loaded")
eval_logger.debug(f"File {f} in {root} could not be loaded")
return tasks_and_groups
......
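With the change above, `TaskManager` no longer configures logging unconditionally: `verbosity` defaults to `None`, and only an explicit value triggers `utils.setup_logging(verbosity)`; the per-instance `self.logger` is replaced by the module-level `eval_logger`. A usage sketch (the import path is the usual one for the harness, but treat it as an assumption):

```python
from lm_eval.tasks import TaskManager

# Default: verbosity=None, so the constructor leaves the global logging
# configuration alone and the host application stays in control of it.
tm = TaskManager()

# Opting in: passing a level routes through utils.setup_logging("DEBUG").
tm_debug = TaskManager(verbosity="DEBUG")
```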
......@@ -10,7 +10,7 @@ import yaml
from tqdm import tqdm
eval_logger = logging.getLogger("lm-eval")
eval_logger = logging.getLogger(__name__)
SUBJECTS = {
......
......@@ -58,3 +58,6 @@ If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
### Changelog
version 2.0 (2025-Feb-14): set target_delimiter to "" because the targets already start with a space.
......@@ -8,11 +8,12 @@ validation_split: validation
test_split: null
doc_to_text: "{{context}}"
doc_to_target: "{{completion}}"
target_delimiter: ""
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
-version: 1.0
+version: 2.0
dataset_kwargs:
trust_remote_code: true
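The task config above bumps to version 2.0 because the evaluated string is effectively `doc_to_text`, then `target_delimiter`, then the target: since the `completion` targets already begin with a space, keeping the harness's default single-space delimiter would produce a doubled space. A string-level illustration (the document contents are invented):

```python
# doc_to_text + target_delimiter + doc_to_target, with an invented document.
doc = {"context": "The capital of France is", "completion": " Paris."}

v1 = doc["context"] + " " + doc["completion"]   # version 1.0: default delimiter " "
v2 = doc["context"] + "" + doc["completion"]    # version 2.0: delimiter ""

print(repr(v1))  # 'The capital of France is  Paris.'  <- doubled space
print(repr(v2))  # 'The capital of France is Paris.'
```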
......@@ -5,14 +5,16 @@
BasqueBench is a benchmark for evaluating language models on Basque tasks. That is, it evaluates the ability of a language model to understand and generate Basque text. BasqueBench offers a combination of pre-existing, open datasets and datasets developed exclusively for this benchmark. All the details of BasqueBench will be published in a paper soon.
The new evaluation datasets included in BasqueBench are:
-| Task | Category | Homepage |
-|:-------------:|:-----:|:-----:|
-| MGSM_eu | Math | https://huggingface.co/datasets/HiTZ/MGSM-eu |
-| PIQA_eu | Question Answering | https://huggingface.co/datasets/HiTZ/PIQA-eu |
-| WNLI_eu | Natural Language Inference | https://huggingface.co/datasets/HiTZ/wnli-eu |
-| XCOPA_eu | Commonsense Reasoning | https://huggingface.co/datasets/HiTZ/XCOPA-eu |
+| Task | Category | Homepage |
+|:--------:|:--------------------------:|:---------------------------------------------:|
+| ARC_eu | Question Answering | https://huggingface.co/datasets/HiTZ/ARC-eu |
+| MGSM_eu | Math | https://huggingface.co/datasets/HiTZ/MGSM-eu |
+| PAWS_eu | Paraphrasing | https://huggingface.co/datasets/HiTZ/PAWS-eu |
+| PIQA_eu | Question Answering | https://huggingface.co/datasets/HiTZ/PIQA-eu |
+| WNLI_eu | Natural Language Inference | https://huggingface.co/datasets/HiTZ/WNLI-eu |
+| XCOPA_eu | Commonsense Reasoning | https://huggingface.co/datasets/HiTZ/XCOPA-eu |
-The datasets included in BasqueBench that have been made public in previous pubications are:
+The datasets included in BasqueBench that have been made public in previous publications are:
| Task | Category | Paper title | Homepage |
|:-------------:|:-----:|:-------------:|:-----:|
......@@ -28,7 +30,40 @@ The datasets included in BasqueBench that have been made public in previous pubi
### Citation
Paper for BasqueBench coming soon.
```
@inproceedings{baucells-etal-2025-iberobench,
title = "{I}bero{B}ench: A Benchmark for {LLM} Evaluation in {I}berian Languages",
author = "Baucells, Irene and
Aula-Blasco, Javier and
de-Dios-Flores, Iria and
Paniagua Su{\'a}rez, Silvia and
Perez, Naiara and
Salles, Anna and
Sotelo Docio, Susana and
Falc{\~a}o, J{\'u}lia and
Saiz, Jose Javier and
Sepulveda Torres, Robiert and
Barnes, Jeremy and
Gamallo, Pablo and
Gonzalez-Agirre, Aitor and
Rigau, German and
Villegas, Marta",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.699/",
pages = "10491--10519",
}
```
### Groups and Tasks
......@@ -40,6 +75,8 @@ Paper for BasqueBench coming soon.
#### Tasks
The following tasks evaluate language models on the BasqueBench datasets using various scoring methods.
- `arc_eu_challenge`
- `arc_eu_easy`
- `belebele_eus_Latn`
- `eus_exams_eu`
- `eus_proficiency`
......@@ -64,6 +101,7 @@ The following tasks evaluate tasks on BasqueBench dataset using various scoring
- `flores_pt-eu`
- `mgsm_direct_eu`
- `mgsm_native_cot_eu`
- `paws_eu`
- `piqa_eu`
- `qnlieu`
- `wnli_eu`
......
include: arc_eu_easy.yaml
task: arc_eu_challenge
dataset_name: ARC-Challenge
task: arc_eu_easy
dataset_path: HiTZ/ARC-eu
dataset_name: ARC-Easy
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
doc_to_text: "Galdera: {{question}}\nErantzuna:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: "Galdera: {{question}}\nErantzuna:"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
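The two new YAML files above add `arc_eu_challenge` (which includes `arc_eu_easy.yaml` and overrides the dataset name with `ARC-Challenge`) and `arc_eu_easy` for the `HiTZ/ARC-eu` dataset. A hedged sketch of running them through the harness's Python entry point (arguments are illustrative and "gpt2" is only a stand-in model):

```python
import lm_eval

# simple_evaluate drives the same pipeline as the CLI.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2",
    tasks=["arc_eu_easy", "arc_eu_challenge"],
    num_fewshot=0,
)
print(results["results"])
```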