Merge branch 'main' into metrics

fb72009f · Baber · b0aca59b · ff41a856 · fb72009f · fb72009f
Commit fb72009f authored Jul 04, 2025 by Baber
20 changed files
--- a/README.md
+++ b/README.md
@@ -110,6 +110,28 @@ lm_eval --model hf \
 > [!Note]
 > Just like you can provide a local path to `transformers.AutoModel`, you can also provide a local path to `lm_eval` via `--model_args pretrained=/path/to/model`

+#### Evaluating GGUF Models
+
+`lm-eval` supports evaluating models in GGUF format using the Hugging Face (`hf`) backend. This allows you to use quantized models compatible with `transformers`, `AutoModel`, and llama.cpp conversions.
+
+To evaluate a GGUF model, use the `--model_args` flag to pass the path to the directory containing the model weights (`pretrained`), the name of the GGUF file (`gguf_file`), and optionally the path to a separate tokenizer (`tokenizer`).
+
+**🚨 Important Note:**  
+If no separate tokenizer is provided, Hugging Face will attempt to reconstruct the tokenizer from the GGUF file — this can take **hours** or even hang indefinitely. Passing a separate tokenizer avoids this issue and can reduce tokenizer loading time from hours to seconds.
+
+**✅ Recommended usage:**
+
+```bash
+lm_eval --model hf \
+    --model_args pretrained=/path/to/gguf_folder,gguf_file=model-name.gguf,tokenizer=/path/to/tokenizer \
+    --tasks hellaswag \
+    --device cuda:0 \
+    --batch_size 8
+```
+
+> [!Tip]
+> Ensure the tokenizer path points to a valid Hugging Face tokenizer directory (e.g., containing tokenizer_config.json, vocab.json, etc.).
+
 #### Multi-GPU Evaluation with Hugging Face `accelerate`

 We support three main ways of using Hugging Face's [accelerate 🚀](https://github.com/huggingface/accelerate) library for multi-GPU evaluation.

--- a/lm_eval/__init__.py
+++ b/lm_eval/__init__.py
 import logging
 import os

-from .evaluator import evaluate, simple_evaluate
-

 __version__ = "0.4.9"
+
+
+# Lazy-load .evaluator module to improve CLI startup
+def __getattr__(name):
+    if name == "evaluate":
+        from .evaluator import evaluate
+
+        return evaluate
+    elif name == "simple_evaluate":
+        from .evaluator import simple_evaluate
+
+        return simple_evaluate
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+
+__all__ = ["evaluate", "simple_evaluate", "__version__"]
--- a/lm_eval/__main__.py
+++ b/lm_eval/__main__.py
@@ -7,16 +7,6 @@ from functools import partial
 from pathlib import Path
 from typing import Union

-from lm_eval import evaluator, utils
-from lm_eval.evaluator import request_caching_arg_to_dict
-from lm_eval.loggers import EvaluationTracker, WandbLogger
-from lm_eval.tasks import TaskManager
-from lm_eval.utils import (
-    handle_non_serializable,
-    make_table,
-    simple_parse_args_string,
-)
-

 def try_parse_json(value: str) -> Union[str, dict, None]:
    if value is None:
@@ -314,6 +304,17 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
        parser = setup_parser()
        args = parse_eval_args(parser)

+    # defer loading `lm_eval` submodules for faster CLI load
+    from lm_eval import evaluator, utils
+    from lm_eval.evaluator import request_caching_arg_to_dict
+    from lm_eval.loggers import EvaluationTracker, WandbLogger
+    from lm_eval.tasks import TaskManager
+    from lm_eval.utils import (
+        handle_non_serializable,
+        make_table,
+        simple_parse_args_string,
+    )
+
    if args.wandb_args:
        wandb_args_dict = simple_parse_args_string(args.wandb_args)
        wandb_config_args_dict = simple_parse_args_string(args.wandb_config_args)

--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -3,7 +3,7 @@ import logging
 import os
 from datetime import timedelta
 from pathlib import Path
-from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union

 import jinja2
 import torch
@@ -17,8 +17,6 @@ from accelerate import (
 from accelerate.utils import get_max_memory
 from huggingface_hub import HfApi
 from packaging import version
-from peft import PeftModel
-from peft import __version__ as PEFT_VERSION
 from tqdm import tqdm
 from transformers.models.auto.modeling_auto import (
    MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
@@ -40,6 +38,9 @@ from lm_eval.models.utils import (
 )


+if TYPE_CHECKING:
+    from transformers.quantizers import AutoQuantizationConfig
+
 eval_logger = logging.getLogger(__name__)


@@ -188,6 +189,13 @@ class HFLM(TemplateLM):
            add_bos_token=add_bos_token,
        )

+        if (
+            quantization_config := getattr(self.config, "quantization_config", None)
+        ) is not None and isinstance(quantization_config, dict):
+            from transformers.quantizers import AutoQuantizationConfig
+
+            quantization_config = AutoQuantizationConfig.from_dict(quantization_config)
+
        # if we passed `pretrained` as a string, initialize our model now
        if isinstance(pretrained, str):
            self._create_model(
@@ -205,7 +213,7 @@ class HFLM(TemplateLM):
                autogptq=autogptq,
                gptqmodel=gptqmodel,
                gguf_file=gguf_file,
-                quantization_config=getattr(self.config, "quantization_config", None),
+                quantization_config=quantization_config,
                subfolder=subfolder,
                **kwargs,
            )
@@ -554,7 +562,7 @@ class HFLM(TemplateLM):
        autogptq: Optional[Union[bool, str]] = False,
        gptqmodel: Optional[bool] = False,
        gguf_file: Optional[str] = None,
-        quantization_config: Optional[Dict[str, Any]] = None,
+        quantization_config: Optional["AutoQuantizationConfig"] = None,
        subfolder: str = "",
        **kwargs,
    ) -> None:
@@ -649,6 +657,9 @@ class HFLM(TemplateLM):
            )

        if peft:
+            from peft import PeftModel
+            from peft import __version__ as PEFT_VERSION
+
            if model_kwargs.get("load_in_4bit", None):
                if version.parse(PEFT_VERSION) < version.parse("0.4.0"):
                    raise AssertionError("load_in_4bit requires peft >= 0.4.0")
@@ -716,7 +727,7 @@ class HFLM(TemplateLM):
        }

        # gguf format embeds tokenizer and is not compatible with hf tokenizer `use_fast` param
-        if gguf_file is not None:
+        if not tokenizer and gguf_file is not None:
            kwargs["gguf_file"] = gguf_file
        else:
            kwargs["use_fast"] = use_fast_tokenizer

--- a/lm_eval/tasks/README.md
+++ b/lm_eval/tasks/README.md
@@ -150,6 +150,7 @@
 | [translation](translation/README.md)                                     | Tasks focused on evaluating the language translation capabilities of models.                                                                                                                                                                                                                                                           | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese                       |
 | [triviaqa](triviaqa/README.md)                                           | A large-scale dataset for trivia question answering to test general knowledge.                                                                                                                                                                                                                                                         | English                                                                                                               |
 | [truthfulqa](truthfulqa/README.md)                                       | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses.                                                                                                                                                                                                                                                | English                                                                                                               |
+| [truthfulqa-multi](truthfulqa-multi/README.md) | A multilingual version of TruthfulQA, a QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English, Spanish, Catalan, Basque, Galician |
 | [turkishmmlu](turkishmmlu/README.md)                                     | A multiple-choice QA test modeled after MMLU, written in Turkish based on Turkish high-school level exams.                                                                                                                                                                                                                             | Turkish                                                                                                               |
 | [unitxt](unitxt/README.md)                                               | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI.                                                                                                                                                                                        | English                                                                                                               |
 | [unscramble](unscramble/README.md)                                       | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding.                                                                                                                                                                                                                                              | English                                                                                                               |

--- a/lm_eval/tasks/humaneval/README.md
+++ b/lm_eval/tasks/humaneval/README.md
@@ -50,3 +50,5 @@ If other tasks on this dataset are already supported:

 ### Changelog
 v2 20-MAR-2025: `humaneval_instruct`, `humaneval_instruct_64`: fixed typo in gen_prefix
+
+v3 30-JUN-2025: Updated prompt generation and output parsing to align with the official `Llama-3.1-70B-Instruct-evals`. This corrects the prompt format and fixes a bug in locating the code block. See PR [#3092](https://github.com/EleutherAI/lm-evaluation-harness/pull/3092).
--- a/lm_eval/tasks/humaneval/humaneval_instruct.yaml
+++ b/lm_eval/tasks/humaneval/humaneval_instruct.yaml
 include: humaneval.yaml
 task: humaneval_instruct
-doc_to_text: "Write a solution to the following problem and make sure that it passes the tests:\n```{{prompt}}"
-gen_prefix: "Here is the completed function:\n```python\n{{prompt}}\n"
+doc_to_text: "Write a solution to the following problem and make sure that it passes the tests:\n```python\n{{ prompt }}\n```\n "
+gen_prefix: "Here is the completed function:\n```python\n{{ prompt }}\n "
 filter_list:
  - name: "create_test"
    filter:
      - function: "custom"
        filter_fn: !function utils.build_predictions_instruct
 metadata:
-  version: 2.0
+  version: 3.0
--- a/lm_eval/tasks/humaneval/utils.py
+++ b/lm_eval/tasks/humaneval/utils.py
@@ -32,7 +32,7 @@ def build_predictions_instruct(
 ) -> list[list[str]]:
    return [
        [
-            doc["prompt"] + (r if r.rfind("```") == -1 else r[: r.rfind("```")])
+            doc["prompt"] + (r if r.find("```") == -1 else r[: r.find("```")])
            for r in resp
        ]
        for resp, doc in zip(resps, docs)

--- a/lm_eval/tasks/truthfulqa-multi/README.md
+++ b/lm_eval/tasks/truthfulqa-multi/README.md
+# TruthfulQA-Multi
+
+## Paper
+
+Title: `Truth Knows No Language: Evaluating Truthfulness Beyond English`
+
+Abstract: [https://arxiv.org/abs/2502.09387v1](https://arxiv.org/abs/2502.09387v1)
+
+We introduce a professionally translated extension of the TruthfulQA benchmark designed to evaluate truthfulness in Basque, Catalan, Galician, and Spanish. Truthfulness evaluations of large language models (LLMs) have primarily been conducted in English. However, the ability of LLMs to maintain truthfulness across languages remains under-explored. Our study evaluates 12 state-of-the-art open LLMs, comparing base and instruction-tuned models using human evaluation, multiple-choice metrics, and LLM-as-a-Judge scoring. Our findings reveal that, while LLMs perform best in English and worst in Basque (the lowest-resourced language), overall truthfulness discrepancies across languages are smaller than anticipated. Furthermore, we show that LLM-as-a-Judge correlates more closely with human judgments than multiple-choice metrics, and that informativeness plays a critical role in truthfulness assessment. Our results also indicate that machine translation provides a viable approach for extending truthfulness benchmarks to additional languages, offering a scalable alternative to professional translation. Finally, we observe that universal knowledge questions are better handled across languages than context- and time-dependent ones, highlighting the need for truthfulness evaluations that account for cultural and temporal variability. Dataset and code are publicly available under open licenses.
+
+### Citation
+
+```text
+@misc{figueras2025truthknowslanguageevaluating,
+      title={Truth Knows No Language: Evaluating Truthfulness Beyond English},
+      author={Blanca Calvo Figueras and Eneko Sagarzazu and Julen Etxaniz and Jeremy Barnes and Pablo Gamallo and Iria De Dios Flores and Rodrigo Agerri},
+      year={2025},
+      eprint={2502.09387},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2502.09387},
+}
+```
+
+### Groups, Tags, and Tasks
+
+#### Groups
+
+* `truthfulqa_multi` (tag): These tasks follow the [TruthfulQA dataset](https://arxiv.org/abs/2109.07958), extending it to new languages.
+
+#### Tasks
+
+* `truthfulqa-multi_mc2_es`: `Multiple-choice, multiple answers in Spanish`
+* `truthfulqa-multi_gen_es`: `Answer generation in Spanish`
+* `truthfulqa-multi_mc2_ca`: `Multiple-choice, multiple answers in Catalan`
+* `truthfulqa-multi_gen_ca`: `Answer generation in Catalan`
+* `truthfulqa-multi_mc2_eu`: `Multiple-choice, multiple answers in Basque`
+* `truthfulqa-multi_gen_eu`: `Answer generation in Basque`
+* `truthfulqa-multi_mc2_gl`: `Multiple-choice, multiple answers in Galician`
+* `truthfulqa-multi_gen_gl`: `Answer generation in Galician`
+* `truthfulqa-multi_mc2_en`: `Multiple-choice, multiple answers in English`
+* `truthfulqa-multi_gen_en`: `Answer generation in English`
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+
+* [X] Is the task an existing benchmark in the literature?
+  * [X] Have you referenced the original paper that introduced the task?
+  * [X] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+If other tasks on this dataset are already supported:
+
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
+
+### Changelog
--- a/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_gen_ca.yaml
+++ b/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_gen_ca.yaml
+include: truthfulqa-multi_gen_common
+task: truthfulqa-multi_gen_ca
+dataset_name: ca
--- a/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_gen_common
+++ b/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_gen_common
+tag:
+  - truthfulqa_multi
+dataset_path: HiTZ/truthfulqa-multi
+output_type: generate_until
+generation_kwargs:
+  until:
+    - "!\n\n"
+    - "Q:"
+    - ".\n\n"
+training_split: train
+validation_split: validation
+test_split: null
+doc_to_target: "{{'A: ' + best_answer}}"
+fewshot_split: train
+fewshot_config:
+  sampler: first_n
+process_docs: !function utils.process_docs_gen
+process_results: !function utils.process_results_gen
+doc_to_text: "{{'Q: ' + question}}"
+should_decontaminate: True
+doc_to_decontamination_query: question
+metric_list:
+  # - metric: bleurt_max
+  #   aggregation: mean
+  #   higher_is_better: true
+  # - metric: bleurt_acc
+  #   aggregation: mean
+  #   higher_is_better: true
+  # - metric: bleurt_diff
+  #   aggregation: mean
+  #   higher_is_better: true
+  - metric: bleu_max
+    aggregation: mean
+    higher_is_better: true
+  - metric: bleu_acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: bleu_diff
+    aggregation: mean
+    higher_is_better: true
+  # - metric: rouge1_max
+  #   aggregation: mean
+  #   higher_is_better: true
+  # - metric: rouge1_acc
+  #   aggregation: mean
+  #   higher_is_better: true
+  # - metric: rouge1_diff
+  #   aggregation: mean
+  #   higher_is_better: true
+  # - metric: rouge2_max
+  #   aggregation: mean
+  #   higher_is_better: true
+  # - metric: rouge2_acc
+  #   aggregation: mean
+  #   higher_is_better: true
+  # - metric: rouge2_diff
+  #   aggregation: mean
+  #   higher_is_better: true
+  # - metric: rougeL_max
+  #   aggregation: mean
+  #   higher_is_better: true
+  # - metric: rougeL_acc
+  #   aggregation: mean
+  #   higher_is_better: true
+  # - metric: rougeL_diff
+  #   aggregation: mean
+  #   higher_is_better: true
+metadata:
+  version: 3.0
--- a/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_gen_en.yaml
+++ b/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_gen_en.yaml
+include: truthfulqa-multi_gen_common
+task: truthfulqa-multi_gen_en
+dataset_name: en
--- a/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_gen_es.yaml
+++ b/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_gen_es.yaml
+include: truthfulqa-multi_gen_common
+task: truthfulqa-multi_gen_es
+dataset_name: es
--- a/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_gen_eu.yaml
+++ b/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_gen_eu.yaml
+include: truthfulqa-multi_gen_common
+task: truthfulqa-multi_gen_eu
+dataset_name: eu
--- a/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_gen_gl.yaml
+++ b/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_gen_gl.yaml
+include: truthfulqa-multi_gen_common
+task: truthfulqa-multi_gen_gl
+dataset_name: gl
--- a/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc1_ca.yaml
+++ b/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc1_ca.yaml
+include: truthfulqa-multi_mc_common
+task: truthfulqa-multi_mc1_ca
+dataset_name: ca
--- a/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc1_en.yaml
+++ b/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc1_en.yaml
+include: truthfulqa-multi_mc_common
+task: truthfulqa-multi_mc1_en
+dataset_name: en
--- a/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc1_es.yaml
+++ b/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc1_es.yaml
+include: truthfulqa-multi_mc_common
+task: truthfulqa-multi_mc1_es
+dataset_name: es
--- a/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc1_eu.yaml
+++ b/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc1_eu.yaml
+include: truthfulqa-multi_mc_common
+task: truthfulqa-multi_mc1_eu
+dataset_name: eu
--- a/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc1_gl.yaml
+++ b/lm_eval/tasks/truthfulqa-multi/truthfulqa-multi_mc1_gl.yaml
+include: truthfulqa-multi_mc_common
+task: truthfulqa-multi_mc1_gl
+dataset_name: gl