Commit bf11ac93 authored by Baber's avatar Baber
Browse files

Merge branch 'main' into llama

parents 83b1c564 ade01428
description: "The following are multiple choice questions (with answers) about psychology. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n"
include: "_default_template_yaml"
task: "mmlu_pro_plus_psychology"
task_alias: "psychology"
process_docs: !function utils.process_psychology
from functools import partial
# Option labels A..P — MMLU-Pro(+) questions carry up to 16 answer options.
choices = [chr(ord("A") + offset) for offset in range(16)]


def format_cot_example(example, including_answer=True):
    """Render one MMLU-Pro example as a chain-of-thought prompt.

    Args:
        example: mapping with "question" (str), "options" (list of option
            strings, at most 16), and — when ``including_answer`` is True —
            "cot_content" (the worked chain-of-thought answer).
        including_answer: if True, append the example's chain-of-thought
            answer (few-shot rendering); if False, end the prompt with the
            "Answer: Let's think step by step." cue for the model to complete.

    Returns:
        The fully formatted prompt string.
    """
    lines = ["Question:", example["question"], "Options:"]
    for idx, option in enumerate(example["options"]):
        lines.append(f"{choices[idx]}. {option}")
    prompt = "\n".join(lines) + "\n"
    if including_answer:
        # Dataset stores the worked solution prefixed "A: ..."; relabel it so
        # the prompt reads "Answer: ..." consistently with the no-answer cue.
        worked = example["cot_content"].replace(
            "A: Let's think step by step.", "Answer: Let's think step by step."
        )
        prompt += worked + "\n\n"
    else:
        prompt += "Answer: Let's think step by step."
    return prompt
# Prompt builders referenced by the task YAML configs: `doc_to_text` renders
# the question ending in the "Answer: Let's think step by step." cue, while
# `fewshot_to_text` also appends the worked chain-of-thought answer.
doc_to_text = partial(format_cot_example, including_answer=False)
fewshot_to_text = partial(format_cot_example, including_answer=True)
def process_docs(dataset, subject):
    """Return *dataset* restricted to rows whose "category" equals *subject*."""

    def _has_subject(doc):
        # Exact string match against the dataset's "category" column.
        return doc["category"] == subject

    return dataset.filter(_has_subject)
# One pre-bound filter per MMLU-Pro category. Each callable takes a dataset
# and keeps only the rows whose "category" field matches the bound subject;
# task YAML files reference them via `process_docs: !function utils.process_<subject>`
# (e.g. L11 above uses `utils.process_psychology`). Kept as explicit
# assignments so each name is statically discoverable by the YAML loader.
process_biology = partial(process_docs, subject="biology")
process_business = partial(process_docs, subject="business")
process_chemistry = partial(process_docs, subject="chemistry")
process_computer_science = partial(process_docs, subject="computer science")
process_economics = partial(process_docs, subject="economics")
process_engineering = partial(process_docs, subject="engineering")
process_health = partial(process_docs, subject="health")
process_history = partial(process_docs, subject="history")
process_law = partial(process_docs, subject="law")
process_math = partial(process_docs, subject="math")
process_other = partial(process_docs, subject="other")
process_philosophy = partial(process_docs, subject="philosophy")
process_physics = partial(process_docs, subject="physics")
process_psychology = partial(process_docs, subject="psychology")
......@@ -11,7 +11,7 @@ import yaml
from tqdm import tqdm
eval_logger = logging.getLogger("lm-eval")
eval_logger = logging.getLogger(__name__)
SUBJECTS = {
......
......@@ -10,7 +10,7 @@ import yaml
from tqdm import tqdm
eval_logger = logging.getLogger("lm-eval")
eval_logger = logging.getLogger(__name__)
SUBJECTS = {
......
tag:
- moral_stories
task: moral_stories
dataset_path: demelin/moral_stories
dataset_name: full
......
......@@ -14,7 +14,40 @@ The datasets included in PortugueseBench are:
### Citation
Paper for PortugueseBench coming soon.
```
@inproceedings{baucells-etal-2025-iberobench,
title = "{I}bero{B}ench: A Benchmark for {LLM} Evaluation in {I}berian Languages",
author = "Baucells, Irene and
Aula-Blasco, Javier and
de-Dios-Flores, Iria and
Paniagua Su{\'a}rez, Silvia and
Perez, Naiara and
Salles, Anna and
Sotelo Docio, Susana and
Falc{\~a}o, J{\'u}lia and
Saiz, Jose Javier and
Sepulveda Torres, Robiert and
Barnes, Jeremy and
Gamallo, Pablo and
Gonzalez-Agirre, Aitor and
Rigau, German and
Villegas, Marta",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.699/",
pages = "10491--10519",
}
```
### Groups and Tasks
......
......@@ -15,6 +15,7 @@ The datasets included in SpanishBench that have been made public in previous pub
| Task | Category | Paper title | Homepage |
|:-------------:|:-----:|:-------------:|:-----:|
| Belebele_es | Reading Comprehension | [The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants](https://arxiv.org/abs/2308.16884) | https://huggingface.co/datasets/facebook/belebele |
| Cocoteros_es | Commonsense Reasoning | [COCOTEROS: A Spanish Corpus with Contextual Knowledge for Natural Language Generation](https://besaya.infor.uva.es/sepln24/paper04.pdf) | https://huggingface.co/datasets/gplsi/cocoteros |
| EsCoLA | Linguistic Acceptability | [EsCoLA: Spanish Corpus of Linguistic Acceptability](https://aclanthology.org/2024.lrec-main.554/) | https://huggingface.co/datasets/nbel/EsCoLA |
| FLORES_es | Translation | [The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation](https://arxiv.org/abs/2106.03193) | https://huggingface.co/datasets/facebook/flores |
| MGSM_es | Math | [Language Models are Multilingual Chain-of-Thought Reasoners](https://arxiv.org/abs/2210.03057) | https://huggingface.co/datasets/juletxara/mgsm |
......@@ -28,7 +29,40 @@ The datasets included in SpanishBench that have been made public in previous pub
### Citation
Paper for SpanishBench coming soon.
```
@inproceedings{baucells-etal-2025-iberobench,
title = "{I}bero{B}ench: A Benchmark for {LLM} Evaluation in {I}berian Languages",
author = "Baucells, Irene and
Aula-Blasco, Javier and
de-Dios-Flores, Iria and
Paniagua Su{\'a}rez, Silvia and
Perez, Naiara and
Salles, Anna and
Sotelo Docio, Susana and
Falc{\~a}o, J{\'u}lia and
Saiz, Jose Javier and
Sepulveda Torres, Robiert and
Barnes, Jeremy and
Gamallo, Pablo and
Gonzalez-Agirre, Aitor and
Rigau, German and
Villegas, Marta",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.699/",
pages = "10491--10519",
}
```
### Groups and Tasks
......@@ -44,6 +78,7 @@ Paper for SpanishBench coming soon.
The following tasks evaluate tasks on SpanishBench dataset using various scoring methods.
- `belebele_spa_Latn`
- `cocoteros_es`
- `copa_es`
- `escola`
- `flores_es`
......
task: cocoteros_es
dataset_path: gplsi/cocoteros
dataset_name: null
output_type: generate_until
doc_to_text: "Genera una frase corta con estas palabras: {{keywords}}. El contexto es: {{context}} \n\nRespuesta:"
doc_to_target: "{{text}}"
training_split: train
test_split: test
target_delimiter: ' '
generation_kwargs:
max_gen_toks: 40
until:
- "\n"
metric_list:
- metric: bleu
aggregation: bleu
higher_is_better: true
- metric: !function utils.rouge1
aggregation: !function utils.rouge1_agg
higher_is_better: true
metadata:
version: 1.0
......@@ -13,5 +13,6 @@ task:
- mgsm_direct_es_spanish_bench
- flores_es
- phrases_es
- cocoteros_es
metadata:
version: 1.0
......@@ -33,7 +33,9 @@ class SQUADCompletion(ConfigurableTask):
def doc_to_target(self, doc):
return doc["value"]
def construct_requests(self, doc, ctx, **kwargs):
def construct_requests(
self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
......
......@@ -105,7 +105,9 @@ class SQuAD2(ConfigurableTask):
answer = "unanswerable"
return " " + answer
def construct_requests(self, doc, ctx, **kwargs):
def construct_requests(
self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
......
......@@ -33,7 +33,9 @@ class SWDE(ConfigurableTask):
def doc_to_target(self, doc):
return doc["value"]
def construct_requests(self, doc, ctx, **kwargs):
def construct_requests(
self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
......
......@@ -9,7 +9,7 @@ fewshot_config:
output_type: multiple_choice
doc_to_text: "Soru: {{ question.strip() }}\nA. {{ choices[0] }}\nB. {{ choices[1] }}\nC. {{ choices[2] }}\nD. {{ choices[3] }}\nE. {{ choices[4] }}\nCevap:"
doc_to_choice: ["A", "B", "C", "D", "E"]
doc_to_target: "{{['A', 'B', 'C', 'D', 'E'].index(answer)}}"
doc_to_target: "{{ answer.strip() }}"
metric_list:
- metric: acc
aggregation: mean
......
......@@ -109,6 +109,7 @@ class Unitxt(ConfigurableTask):
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
chat_template: Optional[Callable] = None,
gen_prefix: Optional[str] = None,
) -> str:
source = self.doc_to_text(doc)
if isinstance(source, list):
......@@ -134,6 +135,7 @@ class Unitxt(ConfigurableTask):
part of the document for `doc`.
"""
kwargs.pop("apply_chat_template", False) # Not used by unitxt
kwargs.pop("chat_template", False) # Not used by unitxt
return [
Instance(
request_type="generate_until",
......
......@@ -17,13 +17,6 @@ import yaml
from jinja2 import BaseLoader, Environment, StrictUndefined
logging.basicConfig(
format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
datefmt="%Y-%m-%d:%H:%M:%S",
level=logging.INFO,
)
eval_logger = logging.getLogger("lm-eval")
SPACING = " " * 47
HIGHER_IS_BETTER_SYMBOLS = {
......@@ -32,6 +25,33 @@ HIGHER_IS_BETTER_SYMBOLS = {
}
def setup_logging(verbosity=logging.INFO):
# Configure the root logger
log_level = os.environ.get("LOGLEVEL", verbosity) or verbosity
level_map = {
"DEBUG": logging.DEBUG,
"INFO": logging.INFO,
"WARNING": logging.WARNING,
"ERROR": logging.ERROR,
"CRITICAL": logging.CRITICAL,
}
log_level = level_map.get(str(log_level).upper(), logging.INFO)
if not logging.root.handlers:
logging.basicConfig(
format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(name)s:%(lineno)d] %(message)s",
datefmt="%Y-%m-%d:%H:%M:%S",
level=log_level,
)
if log_level == logging.DEBUG:
third_party_loggers = ["urllib3", "filelock", "fsspec"]
for logger_name in third_party_loggers:
logging.getLogger(logger_name).setLevel(logging.INFO)
else:
logging.getLogger().setLevel(log_level)
def hash_string(string: str) -> str:
return hashlib.sha256(string.encode("utf-8")).hexdigest()
......
......@@ -66,7 +66,7 @@ ibm_watsonx_ai = ["ibm_watsonx_ai>=1.1.22"]
ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
neuronx = ["optimum[neuronx]"]
mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
math = ["sympy>=1.12", "antlr4-python3-runtime==4.11", "math_verify[antlr4_11_0]"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
optimum = ["optimum[openvino]"]
promptsource = ["promptsource>=0.2.3"]
......
import argparse
import logging
import os
import yaml
from promptsource.templates import DatasetTemplates
from tqdm import tqdm
# from lm_eval.api.registry import ALL_TASKS
from lm_eval.logger import eval_logger
eval_logger = logging.getLogger(__name__)
# from lm_eval.tasks import include_task_folder
......
......@@ -10,7 +10,6 @@ import os
from pytablewriter import LatexTableWriter, MarkdownTableWriter
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
......
......@@ -11,7 +11,6 @@ from pytablewriter import MarkdownTableWriter
from lm_eval import tasks
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
......
import argparse
import logging
import os
from typing import Dict, List, Tuple
......@@ -8,11 +9,11 @@ import torch
import lm_eval.evaluator
import lm_eval.models.utils
from lm_eval import tasks, utils
from lm_eval import tasks
os.environ["TOKENIZERS_PARALLELISM"] = "false"
eval_logger = utils.eval_logger
eval_logger = logging.getLogger(__name__)
def memory_stats():
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment