@@ -17,7 +18,6 @@ from lm_eval.models.utils import (
     undistribute,
 )
 from lm_eval.utils import (
-    eval_logger,
     get_rolling_token_windows,
     make_disjoint_window,
 )
...
@@ -34,7 +34,7 @@ except ModuleNotFoundError:
 if TYPE_CHECKING:
     pass

-eval_logger = eval_logger
+eval_logger = logging.getLogger(__name__)


 @register_model("vllm")
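The change above swaps the shared `eval_logger` import for a per-module logger, which is the standard library's recommended pattern. A minimal sketch of that pattern (the helper function below is illustrative, not from the diff):

```python
import logging

# One logger per module, named after the module, so handlers and levels
# can be configured centrally through the logging hierarchy.
eval_logger = logging.getLogger(__name__)


def warn_once(message: str) -> None:
    # Illustrative helper: all output goes through the module's own logger.
    eval_logger.warning(message)
```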
...
@@ -75,7 +75,6 @@ class VLLM(TemplateLM):
"Please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
"Please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
)
)
assert"cuda"indeviceordeviceisNone,"vLLM only supports CUDA"
assertmax_lengthisNoneormax_model_lenisNone,(
assertmax_lengthisNoneormax_model_lenisNone,(
"Either max_length or max_model_len may be provided, but not both"
"Either max_length or max_model_len may be provided, but not both"
)
)
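The retained assertion makes `max_length` and `max_model_len` mutually exclusive. A minimal, hypothetical helper (not part of the diff; the default value is an assumption) showing how such a guard typically collapses the two options into one effective context length:

```python
from typing import Optional


def resolve_context_length(
    max_length: Optional[int] = None,
    max_model_len: Optional[int] = None,
    default: int = 2048,  # assumed fallback, not taken from the diff
) -> int:
    # Same guard as in the diff: at most one of the two may be provided.
    assert max_length is None or max_model_len is None, (
        "Either max_length or max_model_len may be provided, but not both"
    )
    return max_model_len or max_length or default
```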
...
@@ -110,7 +109,7 @@ class VLLM(TemplateLM):
             eval_logger.warning(
                 "You might experience occasional issues with model weight downloading when data_parallel is in use. To ensure stable performance, run with data_parallel_size=1 until the weights are downloaded and cached."
...
 For more information, including a full list of task names and their precise meanings or sources, follow the links provided to the individual README.md files for each subfolder.
 | [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese |
 | [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic |
 | [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese |
...
@@ -42,6 +42,7 @@
 | [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque |
 | [eus_reading](eus_reading/README.md) | Reading comprehension tasks specifically designed for the Basque language. | Basque |
 | [eus_trivia](eus_trivia/README.md) | Trivia and knowledge testing tasks in the Basque language. | Basque |
+| [evalita-LLM](evalita-LLM/README.md) | A native Italian benchmark with diverse task formats and multiple prompts. | Italian |
 | [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English |
 | [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English |
 | [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French |
...
@@ -50,11 +51,13 @@
 | [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English |
 | [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English |
 | [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English |
+| [groundcocoa](groundcocoa/README.md) | A benchmark evaluating the conditional and compositional reasoning of language models using a grounding task. | English |
 | [haerae](haerae/README.md) | Tasks focused on assessing detailed factual and historical knowledge. | Korean |
 | [headqa](headqa/README.md) | A high-level education-based question answering dataset to test specialized knowledge. | Spanish, English |
 | [hellaswag](hellaswag/README.md) | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English |
 | [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to evaluate the ethical reasoning capabilities of models. | English |
 | [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English |
+| [histoires_morales](histoires_morales/README.md) | A dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | French (Some MT) |
 | [hrm8k](hrm8k/README.md) | A challenging bilingual math reasoning benchmark for Korean and English. | Korean (Some MT), English (Some MT) |
 | [humaneval](humaneval/README.md) | Code generation task that measures functional correctness for synthesizing programs from docstrings. | Python |
 | [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English |
...
 | [mmlu](mmlu/README.md) | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English |
 | [mmlu_pro](mmlu_pro/README.md) | A refined set of MMLU, integrating more challenging, reasoning-focused questions and expanding the choice set from four to ten options. | English |
+| [mmlu-pro-plus](mmlu-pro-plus/README.md) | A new test set for evaluating shortcut learning and higher-order reasoning of LLMs. | English |
 | [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English |
 | model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | |
 | [moral_stories](moral_stories/README.md) | A crowd-sourced dataset of structured narratives that describe normative and norm-divergent actions taken by individuals to accomplish certain intentions in concrete situations. | English |
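Any task name in the table can be passed to the harness directly. A minimal sketch using the Python API (model choice, task, and the small `limit` are illustrative; `simple_evaluate` is assumed to behave as in current releases):

```python
import lm_eval

# Evaluate one of the tasks listed above on a small Hugging Face model.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["gsm8k"],
    num_fewshot=5,
    limit=10,  # quick smoke test rather than a full evaluation
)
print(results["results"]["gsm8k"])
```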