Commit efb46937 authored by Baber

Merge branch 'main' into convert_gen

# Conflicts:
#	lm_eval/__main__.py
#	lm_eval/evaluator.py
parents 7fbf899c ade01428
import copy
import logging
import os
from datetime import timedelta
from pathlib import Path
......@@ -39,7 +40,7 @@ from lm_eval.models.utils import (
)
-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("hf-auto", "hf", "huggingface")
......
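Across the model backends touched by this merge, the recurring change is the same: the shared `eval_logger` imported from `lm_eval.utils` is replaced by a module-level logger created with `logging.getLogger(__name__)`. A minimal sketch of the pattern (module and function names here are illustrative, not taken from the diff):

```python
import logging

# Old pattern (removed): one logger object shared by every module.
# from lm_eval.utils import eval_logger

# New pattern: each module owns a logger named after the module, so records
# carry their origin (e.g. "lm_eval.models.huggingface") and can be filtered
# or silenced per module by the host application.
eval_logger = logging.getLogger(__name__)


def load_backend(name: str) -> None:
    # Illustrative helper, not part of the harness.
    eval_logger.info("loading backend %s", name)
```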
import copy
import json
import logging
import os
from functools import lru_cache
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Type, cast
......@@ -10,7 +11,10 @@ from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.api_models import JsonChatStr
-from lm_eval.utils import eval_logger, simple_parse_args_string
+from lm_eval.utils import simple_parse_args_string
+eval_logger = logging.getLogger(__name__)
class LogLikelihoodResult(NamedTuple):
......
......@@ -13,6 +13,7 @@
# limitations under the License.
import importlib
import logging
import pathlib
from copy import deepcopy
from typing import List, Literal
......@@ -27,13 +28,15 @@ from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator
from lm_eval.utils import (
-eval_logger,
get_rolling_token_windows,
make_disjoint_window,
simple_parse_args_string,
)
+eval_logger = logging.getLogger(__name__)
def _patch_pretrained_cfg(
pretrained_cfg, trainer, tensor_model_parallel_size, pipeline_model_parallel_size
):
......
import copy
import logging
from typing import List, Optional, Tuple, Union
import numpy
......@@ -13,7 +14,7 @@ from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("sparseml")
......
import logging
import os
from functools import cached_property
from operator import itemgetter
......@@ -6,7 +7,9 @@ from typing import Any, Dict, List, Optional, Tuple, Union
from lm_eval.api.registry import register_model
from lm_eval.models.api_models import TemplateAPI
from lm_eval.models.utils import handle_stop_sequences
-from lm_eval.utils import eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("local-completions")
......@@ -288,4 +291,6 @@ class OpenAIChatCompletion(LocalChatCompletion):
if "o1" in self.model:
output.pop("stop")
output["temperature"] = 1
elif "o3" in self.model:
output.pop("temperature")
return output
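The hunk above special-cases OpenAI reasoning models when building the request payload: `o1` models get their stop sequences dropped and temperature pinned to 1, and the newly added `o3` branch drops the temperature key entirely. A standalone sketch of that adjustment (the function name and the shape of `output` are assumptions for illustration):

```python
def adjust_sampling_kwargs(output: dict, model: str) -> dict:
    """Mirror of the per-model tweaks shown in the diff above (sketch only)."""
    if "o1" in model:
        # o1 endpoints reject custom stop sequences and non-default temperature.
        output.pop("stop", None)
        output["temperature"] = 1
    elif "o3" in model:
        # o3 endpoints do not accept a temperature parameter.
        output.pop("temperature", None)
    return output


# Example: a payload destined for an o3 model loses its temperature key.
print(adjust_sampling_kwargs({"temperature": 0, "stop": ["\n"]}, "o3-mini"))
# -> {'stop': ['\n']}
```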
import logging
from importlib.util import find_spec
from lm_eval import utils
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
from lm_eval.models.utils import get_dtype
-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("ipex")
......
import json
import logging
from importlib.util import find_spec
from pathlib import Path
from lm_eval import utils
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("openvino")
......
This diff is collapsed.
......@@ -2,6 +2,7 @@ import collections
import fnmatch
import gc
import itertools
import logging
import time
from functools import wraps
from typing import (
......@@ -22,7 +23,8 @@ from typing import (
import torch
import transformers
-from lm_eval.utils import eval_logger
+eval_logger = logging.getLogger(__name__)
if TYPE_CHECKING:
......
import copy
import logging
from importlib.metadata import version
from importlib.util import find_spec
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
......@@ -17,7 +18,6 @@ from lm_eval.models.utils import (
undistribute,
)
from lm_eval.utils import (
-eval_logger,
get_rolling_token_windows,
make_disjoint_window,
)
......@@ -34,7 +34,7 @@ except ModuleNotFoundError:
if TYPE_CHECKING:
pass
-eval_logger = eval_logger
+eval_logger = logging.getLogger(__name__)
@register_model("vllm")
......@@ -75,7 +75,6 @@ class VLLM(TemplateLM):
"Please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
)
assert "cuda" in device or device is None, "vLLM only supports CUDA"
assert max_length is None or max_model_len is None, (
"Either max_length or max_model_len may be provided, but not both"
)
......@@ -110,7 +109,7 @@ class VLLM(TemplateLM):
eval_logger.warning(
"You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached."
)
self.model_args["worker_use_ray"] = True
self.model_args["distributed_executor_backend"] = "ray"
self.batch_size = "auto"
eval_logger.info("Manual batching is not compatible with data parallelism.")
......@@ -244,15 +243,13 @@ class VLLM(TemplateLM):
temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
)
if self.data_parallel_size > 1:
-# vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
+# vLLM hangs if resources are set in ray.remote
# also seems to only work with decorator and not with ray.remote() fn
# see https://github.com/vllm-project/vllm/issues/973
-# note: this has changed on 0.3.3, and it only works now if num_gpus are set.
-# but then tensor_parallel breaks
@ray.remote
def run_inference_one_model(
model_args: dict,
-sampling_params,
+sampling_params: SamplingParams,
requests: List[List[int]],
lora_request: LoRARequest,
):
......
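For data-parallel inference, the code above wraps the per-replica worker in the `@ray.remote` decorator (the comment notes the `ray.remote()` function form does not work here, see vllm-project/vllm#973). A simplified, self-contained sketch of the dispatch pattern with the vLLM engine stubbed out:

```python
from typing import List

import ray

ray.init(ignore_reinit_error=True)


@ray.remote  # decorator form, matching the diff above
def run_inference_one_model(model_args: dict, requests: List[List[int]]) -> List[int]:
    # The real worker builds an LLM(**model_args) and calls generate();
    # here we just return the request lengths so the sketch stays runnable.
    return [len(req) for req in requests]


# Split requests across two data-parallel replicas and gather the results.
chunks = [[[1, 2, 3], [4, 5]], [[6], [7, 8, 9, 10]]]
futures = [run_inference_one_model.remote({"model": "stub"}, chunk) for chunk in chunks]
print(ray.get(futures))  # [[3, 2], [1, 4]]
```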
import copy
import logging
from typing import Dict, List, Optional
import transformers
......@@ -14,7 +15,9 @@ from lm_eval.models.utils import (
undistribute,
)
from lm_eval.models.vllm_causallms import VLLM
-from lm_eval.utils import eval_logger
+eval_logger = logging.getLogger(__name__)
try:
......@@ -106,11 +109,9 @@ class VLLM_VLM(VLLM):
temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False
)
if self.data_parallel_size > 1:
-# vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote
+# vLLM hangs if resources are set in ray.remote
# also seems to only work with decorator and not with ray.remote() fn
# see https://github.com/vllm-project/vllm/issues/973
-# note: this has changed on 0.3.3, and it only works now if num_gpus are set.
-# but then tensor_parallel breaks
@ray.remote
def run_inference_one_model(
model_args: dict, sampling_params, requests: List[List[dict]]
......
import ast
import logging
import os
from typing import Dict
from lm_eval import utils
-from lm_eval.utils import eval_logger
+eval_logger = logging.getLogger(__name__)
# Prompt library.
# Stores prompts in a dictionary indexed by 2 levels:
# prompt category name, and prompt name.
......
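As the comment above describes, the prompt library is a two-level dictionary keyed first by prompt category name and then by prompt name. A toy illustration of that shape (the variable name and entry are made up, not taken from the registry):

```python
# Two-level prompt store: category name -> prompt name -> template string.
PROMPT_LIBRARY = {
    "qa-basic": {
        "question-newline-answer": "Question: {{question}}\nAnswer:",
    },
}

# Lookup mirrors the indexing scheme: first the category, then the prompt.
template = PROMPT_LIBRARY["qa-basic"]["question-newline-answer"]
print(template)
```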
......@@ -42,6 +42,7 @@
| [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque |
| [eus_reading](eus_reading/README.md) | Reading comprehension tasks specifically designed for the Basque language. | Basque |
| [eus_trivia](eus_trivia/README.md) | Trivia and knowledge testing tasks in the Basque language. | Basque |
| [evalita-LLM](evalita-LLM/README.md) | A native Italian benchmark with diverse task formats and multiple prompts. | Italian |
| [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English |
| [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English |
| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French |
......@@ -50,6 +51,7 @@
| [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
| [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English |
| [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
| [groundcocoa](groundcocoa/README.md) | A benchmark evaluating the conditional and compositional reasoning of language models using a grounding task. | English |
| [haerae](haerae/README.md) | Tasks focused on assessing detailed factual and historical knowledge. | Korean |
| [headqa](headqa/README.md) | A high-level education-based question answering dataset to test specialized knowledge. | Spanish, English |
| [hellaswag](hellaswag/README.md) | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English |
......@@ -85,7 +87,7 @@
| [mlqa](mlqa/README.md) | MultiLingual Question Answering benchmark dataset for evaluating cross-lingual question answering performance. | English, Arabic, German, Spanish, Hindi, Vietnamese, Simplified Chinese |
| [mmlu](mmlu/README.md) | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
| [mmlu_pro](mmlu_pro/README.md) | A refined set of MMLU, integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. | English |
| [mmlu-pro-plus](mmlu-pro-plus/README.md) | A new test set for evaluating shortcut learning and higher-order reasoning of LLMs. | English |
| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English |
| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
| [moral_stories](moral_stories/README.md) | A crowd-sourced dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | English
......
......@@ -14,6 +14,8 @@ from lm_eval.tasks.mmlu_pro.utils import doc_to_text
GROUP_ONLY_KEYS = list(GroupConfig().to_dict().keys())
eval_logger = logging.getLogger(__name__)
def convert_mcq_to_generative(cfg: dict):
prompt = """Given the following question and candidate answers, choose the correct answer."""
......@@ -71,15 +73,14 @@ class TaskManager:
def __init__(
self,
verbosity="INFO",
verbosity: Optional[str] = None,
include_path: Optional[Union[str, List]] = None,
include_defaults: bool = True,
mcq_to_generative: bool = False,
) -> None:
-self.verbosity = verbosity
+if verbosity is not None:
+    utils.setup_logging(verbosity)
self.include_path = include_path
-self.logger = utils.eval_logger
-self.logger.setLevel(getattr(logging, f"{verbosity}"))
self._task_index = self.initialize_tasks(
include_path=include_path, include_defaults=include_defaults
......@@ -513,7 +514,7 @@ class TaskManager:
"yaml_path": -1,
}
elif tasks_and_groups[tag]["type"] != "tag":
-self.logger.info(
+eval_logger.info(
f"The tag '{tag}' is already registered as a group, this tag will not be registered. "
"This may affect tasks you want to call."
)
......@@ -576,7 +577,7 @@ class TaskManager:
config, task, tasks_and_groups, print_info
)
else:
self.logger.debug(f"File {f} in {root} could not be loaded")
eval_logger.debug(f"File {f} in {root} could not be loaded")
return tasks_and_groups
......
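With the change above, `TaskManager` no longer configures logging unconditionally: `verbosity` defaults to `None`, and only an explicit value triggers `utils.setup_logging(verbosity)`; the per-instance `self.logger` is replaced by the module-level `eval_logger`. A usage sketch (the import path is the usual one for the harness, but treat it as an assumption):

```python
from lm_eval.tasks import TaskManager

# Default: verbosity=None, so the constructor leaves the global logging
# configuration alone and the host application stays in control of it.
tm = TaskManager()

# Opting in: passing a level routes through utils.setup_logging("DEBUG").
tm_debug = TaskManager(verbosity="DEBUG")
```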
......@@ -10,7 +10,7 @@ import yaml
from tqdm import tqdm
eval_logger = logging.getLogger("lm-eval")
eval_logger = logging.getLogger(__name__)
SUBJECTS = {
......
......@@ -58,3 +58,6 @@ If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
### Changelog
version 2.0 (2025-Feb-14): set target_delimiter to "" because the targets already start with a space.
......@@ -8,11 +8,12 @@ validation_split: validation
test_split: null
doc_to_text: "{{context}}"
doc_to_target: "{{completion}}"
target_delimiter: ""
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
-version: 1.0
+version: 2.0
dataset_kwargs:
trust_remote_code: true
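The task config above bumps to version 2.0 because the evaluated string is effectively `doc_to_text`, then `target_delimiter`, then the target: since the `completion` targets already begin with a space, keeping the harness's default single-space delimiter would produce a doubled space. A string-level illustration (the document contents are invented):

```python
# doc_to_text + target_delimiter + doc_to_target, with an invented document.
doc = {"context": "The capital of France is", "completion": " Paris."}

v1 = doc["context"] + " " + doc["completion"]   # version 1.0: default delimiter " "
v2 = doc["context"] + "" + doc["completion"]    # version 2.0: delimiter ""

print(repr(v1))  # 'The capital of France is  Paris.'  <- doubled space
print(repr(v2))  # 'The capital of France is Paris.'
```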
......@@ -5,14 +5,16 @@
BasqueBench is a benchmark for evaluating language models on Basque tasks. That is, it evaluates the ability of a language model to understand and generate Basque text. BasqueBench offers a combination of pre-existing, open datasets and datasets developed exclusively for this benchmark. All the details of BasqueBench will be published in a paper soon.
The new evaluation datasets included in BasqueBench are:
-| Task | Category | Homepage |
-|:-------------:|:-----:|:-----:|
-| MGSM_eu | Math | https://huggingface.co/datasets/HiTZ/MGSM-eu |
-| PIQA_eu | Question Answering | https://huggingface.co/datasets/HiTZ/PIQA-eu |
-| WNLI_eu | Natural Language Inference | https://huggingface.co/datasets/HiTZ/wnli-eu |
-| XCOPA_eu | Commonsense Reasoning | https://huggingface.co/datasets/HiTZ/XCOPA-eu |
+| Task | Category | Homepage |
+|:--------:|:--------------------------:|:---------------------------------------------:|
+| ARC_eu | Question Answering | https://huggingface.co/datasets/HiTZ/ARC-eu |
+| MGSM_eu | Math | https://huggingface.co/datasets/HiTZ/MGSM-eu |
+| PAWS_eu | Paraphrasing | https://huggingface.co/datasets/HiTZ/PAWS-eu |
+| PIQA_eu | Question Answering | https://huggingface.co/datasets/HiTZ/PIQA-eu |
+| WNLI_eu | Natural Language Inference | https://huggingface.co/datasets/HiTZ/WNLI-eu |
+| XCOPA_eu | Commonsense Reasoning | https://huggingface.co/datasets/HiTZ/XCOPA-eu |
-The datasets included in BasqueBench that have been made public in previous pubications are:
+The datasets included in BasqueBench that have been made public in previous publications are:
| Task | Category | Paper title | Homepage |
|:-------------:|:-----:|:-------------:|:-----:|
......@@ -28,7 +30,40 @@ The datasets included in BasqueBench that have been made public in previous pubi
### Citation
Paper for BasqueBench coming soon.
```
@inproceedings{baucells-etal-2025-iberobench,
title = "{I}bero{B}ench: A Benchmark for {LLM} Evaluation in {I}berian Languages",
author = "Baucells, Irene and
Aula-Blasco, Javier and
de-Dios-Flores, Iria and
Paniagua Su{\'a}rez, Silvia and
Perez, Naiara and
Salles, Anna and
Sotelo Docio, Susana and
Falc{\~a}o, J{\'u}lia and
Saiz, Jose Javier and
Sepulveda Torres, Robiert and
Barnes, Jeremy and
Gamallo, Pablo and
Gonzalez-Agirre, Aitor and
Rigau, German and
Villegas, Marta",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.699/",
pages = "10491--10519",
}
```
### Groups and Tasks
......@@ -40,6 +75,8 @@ Paper for BasqueBench coming soon.
#### Tasks
The following tasks evaluate language models on the BasqueBench datasets using various scoring methods.
- `arc_eu_challenge`
- `arc_eu_easy`
- `belebele_eus_Latn`
- `eus_exams_eu`
- `eus_proficiency`
......@@ -64,6 +101,7 @@ The following tasks evaluate tasks on BasqueBench dataset using various scoring
- `flores_pt-eu`
- `mgsm_direct_eu`
- `mgsm_native_cot_eu`
- `paws_eu`
- `piqa_eu`
- `qnlieu`
- `wnli_eu`
......
include: arc_eu_easy.yaml
task: arc_eu_challenge
dataset_name: ARC-Challenge
task: arc_eu_easy
dataset_path: HiTZ/ARC-eu
dataset_name: ARC-Easy
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
doc_to_text: "Galdera: {{question}}\nErantzuna:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: "Galdera: {{question}}\nErantzuna:"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
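The two new YAML files above add `arc_eu_challenge` (which includes `arc_eu_easy.yaml` and overrides the dataset name with `ARC-Challenge`) and `arc_eu_easy` for the `HiTZ/ARC-eu` dataset. A hedged sketch of running them through the harness's Python entry point (arguments are illustrative and "gpt2" is only a stand-in model):

```python
import lm_eval

# simple_evaluate drives the same pipeline as the CLI.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2",
    tasks=["arc_eu_easy", "arc_eu_challenge"],
    num_fewshot=0,
)
print(results["results"])
```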