Commit efb46937 authored by Baber

Merge branch 'main' into convert_gen

# Conflicts:
#	lm_eval/__main__.py
#	lm_eval/evaluator.py
parents 7fbf899c ade01428
task: cocoteros_es
dataset_path: gplsi/cocoteros
dataset_name: null
output_type: generate_until
doc_to_text: "Genera una frase corta con estas palabras: {{keywords}}. El contexto es: {{context}} \n\nRespuesta:"
doc_to_target: "{{text}}"
training_split: train
test_split: test
target_delimiter: ' '
generation_kwargs:
  max_gen_toks: 40
  until:
    - "\n"
metric_list:
  - metric: bleu
    aggregation: bleu
    higher_is_better: true
  - metric: !function utils.rouge1
    aggregation: !function utils.rouge1_agg
    higher_is_better: true
metadata:
  version: 1.0
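The two `!function` hooks above point to helpers in the task's utils.py, which is not included in this excerpt. The sketch below is illustrative only: it assumes the common harness pattern in which the per-sample metric is a pass-through and the actual scoring happens in the aggregation function, and it assumes the rouge_score package is available.

```python
# Illustrative sketch of a utils.py for cocoteros_es -- not code from this
# commit. Assumes the aggregation receives the list of (reference, prediction)
# pairs collected by the pass-through metric.
from rouge_score import rouge_scorer


def rouge1(items):
    # Per-sample hook: defer scoring to the aggregation step.
    return items


def rouge1_agg(items):
    # Mean ROUGE-1 F1 over all (reference, prediction) pairs.
    refs, preds = zip(*items)
    scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=False)
    scores = [scorer.score(r, p)["rouge1"].fmeasure for r, p in zip(refs, preds)]
    return sum(scores) / len(scores)
```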
@@ -13,5 +13,6 @@ task:
   - mgsm_direct_es_spanish_bench
   - flores_es
   - phrases_es
+  - cocoteros_es
 metadata:
   version: 1.0
@@ -33,7 +33,9 @@ class SQUADCompletion(ConfigurableTask):
     def doc_to_target(self, doc):
         return doc["value"]

-    def construct_requests(self, doc, ctx, **kwargs):
+    def construct_requests(
+        self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
+    ):
         """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.
...
@@ -105,7 +105,9 @@ class SQuAD2(ConfigurableTask):
             answer = "unanswerable"
         return " " + answer

-    def construct_requests(self, doc, ctx, **kwargs):
+    def construct_requests(
+        self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
+    ):
         """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.
...
@@ -33,7 +33,9 @@ class SWDE(ConfigurableTask):
     def doc_to_target(self, doc):
         return doc["value"]

-    def construct_requests(self, doc, ctx, **kwargs):
+    def construct_requests(
+        self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
+    ):
         """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.
...
@@ -9,7 +9,7 @@ fewshot_config:
 output_type: multiple_choice
 doc_to_text: "Soru: {{ question.strip() }}\nA. {{ choices[0] }}\nB. {{ choices[1] }}\nC. {{ choices[2] }}\nD. {{ choices[3] }}\nE. {{ choices[4] }}\nCevap:"
 doc_to_choice: ["A", "B", "C", "D", "E"]
-doc_to_target: "{{['A', 'B', 'C', 'D', 'E'].index(answer)}}"
+doc_to_target: "{{ answer.strip() }}"
 metric_list:
   - metric: acc
     aggregation: mean
...
@@ -109,6 +109,7 @@ class Unitxt(ConfigurableTask):
         apply_chat_template: bool = False,
         fewshot_as_multiturn: bool = False,
         chat_template: Optional[Callable] = None,
+        gen_prefix: Optional[str] = None,
     ) -> str:
         source = self.doc_to_text(doc)
         if isinstance(source, list):
@@ -134,6 +135,7 @@ class Unitxt(ConfigurableTask):
         part of the document for `doc`.
         """
         kwargs.pop("apply_chat_template", False)  # Not used by unitxt
+        kwargs.pop("chat_template", False)  # Not used by unitxt
         return [
             Instance(
                 request_type="generate_until",
...
@@ -17,13 +17,6 @@ import yaml
 from jinja2 import BaseLoader, Environment, StrictUndefined

-logging.basicConfig(
-    format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
-    datefmt="%Y-%m-%d:%H:%M:%S",
-    level=logging.INFO,
-)
-eval_logger = logging.getLogger("lm-eval")
-
 SPACING = " " * 47

 HIGHER_IS_BETTER_SYMBOLS = {
@@ -32,6 +25,33 @@ HIGHER_IS_BETTER_SYMBOLS = {
 }


+def setup_logging(verbosity=logging.INFO):
+    # Configure the root logger
+    log_level = os.environ.get("LOGLEVEL", verbosity) or verbosity
+    level_map = {
+        "DEBUG": logging.DEBUG,
+        "INFO": logging.INFO,
+        "WARNING": logging.WARNING,
+        "ERROR": logging.ERROR,
+        "CRITICAL": logging.CRITICAL,
+    }
+    log_level = level_map.get(str(log_level).upper(), logging.INFO)
+    if not logging.root.handlers:
+        logging.basicConfig(
+            format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(name)s:%(lineno)d] %(message)s",
+            datefmt="%Y-%m-%d:%H:%M:%S",
+            level=log_level,
+        )
+        if log_level == logging.DEBUG:
+            third_party_loggers = ["urllib3", "filelock", "fsspec"]
+            for logger_name in third_party_loggers:
+                logging.getLogger(logger_name).setLevel(logging.INFO)
+    else:
+        logging.getLogger().setLevel(log_level)
+
+
 def hash_string(string: str) -> str:
     return hashlib.sha256(string.encode("utf-8")).hexdigest()
...
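With the import-time basicConfig call removed, the scripts patched below switch to plain module-level loggers. The snippet that follows is a minimal usage sketch, not part of this commit, and assumes setup_logging is exported from lm_eval.utils (the module this hunk appears to touch).

```python
# Usage sketch, not part of this commit. Assumes `setup_logging` lives in
# lm_eval.utils; configure logging once at the entry point, then create
# ordinary module-level loggers everywhere else.
import logging

from lm_eval.utils import setup_logging

setup_logging(verbosity=logging.DEBUG)  # or set the LOGLEVEL env var instead

eval_logger = logging.getLogger(__name__)
eval_logger.info("logging configured once; no basicConfig at import time")
```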
@@ -66,7 +66,7 @@ ibm_watsonx_ai = ["ibm_watsonx_ai>=1.1.22"]
 ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
 neuronx = ["optimum[neuronx]"]
 mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
-math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
+math = ["sympy>=1.12", "antlr4-python3-runtime==4.11", "math_verify[antlr4_11_0]"]
 multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
 optimum = ["optimum[openvino]"]
 promptsource = ["promptsource>=0.2.3"]
...
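The math extra now additionally pulls in Math-Verify. The snippet below is a hedged illustration of that library's documented top-level API (parse/verify); it is not code from this commit, and the harness call sites are not shown in this diff.

```python
# Hedged illustration of the newly added math_verify dependency; not code
# from this commit.
from math_verify import parse, verify

gold = parse("$\\frac{1}{2}$")  # parse the gold target expression
answer = parse("0.5")           # parse the model's answer

# verify() checks mathematical equivalence across notations
# (fractions, decimals, sets, intervals, ...).
print(verify(gold, answer))  # expected: True
```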
 import argparse
+import logging
 import os

 import yaml
 from promptsource.templates import DatasetTemplates
 from tqdm import tqdm

 # from lm_eval.api.registry import ALL_TASKS
-from lm_eval.logger import eval_logger
+eval_logger = logging.getLogger(__name__)

 # from lm_eval.tasks import include_task_folder
...
@@ -10,7 +10,6 @@ import os

 from pytablewriter import LatexTableWriter, MarkdownTableWriter

-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
...
@@ -11,7 +11,6 @@ from pytablewriter import MarkdownTableWriter

 from lm_eval import tasks

-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
...
 import argparse
+import logging
 import os
 from typing import Dict, List, Tuple
@@ -8,11 +9,11 @@ import torch

 import lm_eval.evaluator
 import lm_eval.models.utils
-from lm_eval import tasks, utils
+from lm_eval import tasks

 os.environ["TOKENIZERS_PARALLELISM"] = "false"

-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)


 def memory_stats():
...
@@ -4,6 +4,7 @@ Usage:
 """

 import argparse
+import logging
 import os
 from typing import List
@@ -14,7 +15,9 @@ from transformers import (

 from lm_eval import simple_evaluate
 from lm_eval.evaluator import request_caching_arg_to_dict
-from lm_eval.utils import eval_logger
+
+eval_logger = logging.getLogger(__name__)

 MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
...
 import argparse
+import logging
 import os
 import random
@@ -7,7 +8,10 @@ import numpy as np

 from lm_eval import tasks
 from lm_eval.evaluator_utils import get_task_list
 from lm_eval.tasks import TaskManager
-from lm_eval.utils import eval_logger, join_iters
+from lm_eval.utils import join_iters
+
+eval_logger = logging.getLogger(__name__)

 EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n"
...
 import argparse
 import json
+import logging
 import os
 import re
 from pathlib import Path
@@ -8,13 +9,15 @@ import pandas as pd

 from zeno_client import ZenoClient, ZenoMetric

 from lm_eval.utils import (
-    eval_logger,
     get_latest_filename,
     get_results_filenames,
     get_sample_results_filenames,
 )

+eval_logger = logging.getLogger(__name__)
+

 def parse_args():
     parser = argparse.ArgumentParser(
         description="Upload your data to the Zeno AI evaluation platform to visualize results. This requires a ZENO_API_KEY in your environment variables. The eleuther harness must be run with log_samples=True and an output_path set for data to be written to disk."
...
from typing import List

import pytest
import torch

from lm_eval import evaluate, simple_evaluate, tasks
from lm_eval.api.instance import Instance
from lm_eval.tasks import get_task_dict

task_manager = tasks.TaskManager()


# We refer to vLLM's test but modify the trigger condition.
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
# @pytest.mark.skip(reason="requires CUDA")
class Test_SGlang:
    sglang = pytest.importorskip("sglang")
    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])

    multiple_choice_task = task_list["arc_easy"]  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: List[Instance] = multiple_choice_task.instances

    generate_until_task = task_list["gsm8k"]  # type: ignore
    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    generate_until: List[Instance] = generate_until_task.instances

    rolling_task = task_list["wikitext"]  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
    ROLLING: List[Instance] = rolling_task.instances

    @classmethod
    def setup_class(cls):
        try:
            from lm_eval.models.sglang_causallms import SGLangLM

            # NOTE(jinwei): EleutherAI/pythia-70m is not supported by SGLang yet; we use Qwen models instead.
            cls.LM = SGLangLM(
                pretrained="Qwen/Qwen2-1.5B-Instruct",
                batch_size=1,
                tp_size=1,
                max_model_len=1024,
            )
        except Exception as e:
            pytest.fail(f"🔥 SGLangLM failed to initialize: {e}")

    def test_logliklihood(self) -> None:
        res = self.LM.loglikelihood(self.MULTIPLE_CH)
        assert len(res) == len(self.MULTIPLE_CH)
        for x in res:
            assert isinstance(x[0], float)

    def test_generate_until(self) -> None:
        res = self.LM.generate_until(self.generate_until)
        assert len(res) == len(self.generate_until)
        for x in res:
            assert isinstance(x, str)

    # NOTE(Jinwei): An A100 80GB is enough for these tests. If the last test,
    # "test_logliklihood_rolling", runs out of memory, reduce "max_model_len".
    def test_logliklihood_rolling(self) -> None:
        res = self.LM.loglikelihood_rolling(self.ROLLING)
        for x in res:
            assert isinstance(x, float)

    # def test_simple_evaluate(self) -> None:
    #     results = simple_evaluate(
    #         model=self.LM,
    #         tasks=["arc_easy"],
    #         # num_fewshot=0,
    #         task_manager=task_manager,
    #         limit=10,
    #     )
    #     print(results)
    #     accuracy = results["results"]["arc_easy"]["acc,none"]
    #     print(f"Accuracy: {accuracy}")

    # def test_evaluate(self) -> None:
    #     tasks = ["arc_easy"]
    #     task_dict = get_task_dict(tasks, task_manager)
    #     results = evaluate(
    #         lm=self.LM,
    #         task_dict=task_dict,
    #         limit=10,
    #     )
    #     print(results)
    #     accuracy = results["results"]["arc_easy"]["acc,none"]
    #     print(f"Accuracy: {accuracy}")

    # TODO(jinwei): find out the output differences for "gsm8k" between simple_evaluate() and evaluate(). There are some errors in the parser as well.
    def test_evaluator(self) -> None:
        simple_results = simple_evaluate(
            model=self.LM,
            tasks=["arc_easy"],
            task_manager=task_manager,
            limit=10,
        )
        assert simple_results is not None, "simple_evaluate returned None"
        # The accuracy on 10 data points is 0.7; a threshold of 0.5 leaves a buffer for fluctuations.
        assert simple_results["results"]["arc_easy"]["acc,none"] >= 0.5, (
            "The accuracy for simple_evaluate() is below 0.5!"
        )

        task_dict = get_task_dict(["arc_easy"], task_manager)
        evaluate_results = evaluate(
            lm=self.LM,
            task_dict=task_dict,
            limit=10,
        )
        assert evaluate_results is not None, "evaluate returned None"
        # The accuracy on 10 data points is 0.7; a threshold of 0.5 leaves a buffer for fluctuations.
        assert evaluate_results["results"]["arc_easy"]["acc,none"] >= 0.5, (
            "The accuracy for evaluate() is below 0.5!"
        )

        assert set(simple_results["results"].keys()) == set(
            evaluate_results["results"].keys()
        ), "Mismatch in task keys between simple_evaluate and evaluate"

        for task in simple_results["results"]:
            assert (
                simple_results["results"][task] == evaluate_results["results"][task]
            ), f"Mismatch in results for {task}"
        print(
            "✅ test_evaluator passed: simple_evaluate and evaluate results are identical."
        )