"tests/vscode:/vscode.git/clone" did not exist on "1e6c547a7ecde3c9e01d6bcaa33cf6836a90ed3a"
Commit 2ebef470 authored by Baber

Merge branch 'main' into feature/eval_from_config

# Conflicts:
#	lm_eval/__main__.py
parents d816f64a ff41a856
@@ -110,6 +110,28 @@ lm_eval --model hf \
> [!Note]
> Just like you can provide a local path to `transformers.AutoModel`, you can also provide a local path to `lm_eval` via `--model_args pretrained=/path/to/model`
#### Evaluating GGUF Models
`lm-eval` supports evaluating models in GGUF format using the Hugging Face (`hf`) backend. This lets you run quantized models produced by llama.cpp conversions, provided they can be loaded through `transformers`' `AutoModel` classes.
To evaluate a GGUF model, pass the path to the directory containing the model weights, the `gguf_file`, and optionally a separate `tokenizer` path using the `--model_args` flag.
**🚨 Important Note:**
If no separate tokenizer is provided, Hugging Face will attempt to reconstruct the tokenizer from the GGUF file — this can take **hours** or even hang indefinitely. Passing a separate tokenizer avoids this issue and can reduce tokenizer loading time from hours to seconds.
**✅ Recommended usage:**
```bash
lm_eval --model hf \
--model_args pretrained=/path/to/gguf_folder,gguf_file=model-name.gguf,tokenizer=/path/to/tokenizer \
--tasks hellaswag \
--device cuda:0 \
--batch_size 8
```
> [!Tip]
> Ensure the tokenizer path points to a valid Hugging Face tokenizer directory (e.g., containing `tokenizer_config.json`, `vocab.json`, etc.).
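The same setup can also be driven from Python. A minimal sketch (the paths are the same placeholders as in the CLI example above, not real files), using the `lm_eval.simple_evaluate` entry point:

```python
import lm_eval

# Mirrors the recommended CLI invocation above; the separate `tokenizer` path
# avoids the slow tokenizer reconstruction from the GGUF file.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args=(
        "pretrained=/path/to/gguf_folder,"
        "gguf_file=model-name.gguf,"
        "tokenizer=/path/to/tokenizer"
    ),
    tasks=["hellaswag"],
    device="cuda:0",
    batch_size=8,
)
print(results["results"]["hellaswag"])
```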
#### Multi-GPU Evaluation with Hugging Face `accelerate`
We support three main ways of using Hugging Face's [accelerate 🚀](https://github.com/huggingface/accelerate) library for multi-GPU evaluation.
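For reference, one of those ways (sharding a single copy of the model across all visible GPUs) can also be requested from Python. A minimal sketch, assuming the `parallelize=True` model argument of the `hf` backend and a placeholder checkpoint:

```python
import lm_eval

# `parallelize=True` asks accelerate to spread one model copy across the
# visible GPUs (naive model parallelism) instead of running data-parallel.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-2.8b,parallelize=True",
    tasks=["hellaswag"],
    batch_size=8,
)
```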
...
import logging
import os
from .evaluator import evaluate, simple_evaluate
__version__ = "0.4.9" __version__ = "0.4.9"
# Lazy-load .evaluator module to improve CLI startup
def __getattr__(name):
if name == "evaluate":
from .evaluator import evaluate
return evaluate
elif name == "simple_evaluate":
from .evaluator import simple_evaluate
return simple_evaluate
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
__all__ = ["evaluate", "simple_evaluate", "__version__"]
@@ -7,24 +7,6 @@ from functools import partial
from pathlib import Path
from typing import Union
from lm_eval import evaluator, utils
from lm_eval.api.eval_config import (
EvaluationConfig,
TrackExplicitAction,
TrackExplicitStoreTrue,
)
# from lm_eval.evaluator import request_caching_arg_to_dict
from lm_eval.loggers import EvaluationTracker, WandbLogger
from lm_eval.tasks import TaskManager
from lm_eval.utils import (
handle_non_serializable,
make_table,
request_caching_arg_to_dict,
# non_default_update,
# parse_namespace,
)
def try_parse_json(value: str) -> Union[str, dict, None]:
if value is None:
@@ -359,6 +341,17 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
cfg = EvaluationConfig.from_cli(args)
# defer loading `lm_eval` submodules for faster CLI load
from lm_eval import evaluator, utils
from lm_eval.evaluator import request_caching_arg_to_dict
from lm_eval.loggers import EvaluationTracker, WandbLogger
from lm_eval.tasks import TaskManager
from lm_eval.utils import (
handle_non_serializable,
make_table,
simple_parse_args_string,
)
if args.wandb_args:
wandb_logger = WandbLogger(cfg.wandb_args, cfg.wandb_config_args)
...
@@ -458,11 +458,13 @@ class Task(abc.ABC):
# sample fewshot context #TODO: need to offset doc_id by rank now!
fewshot_ctx = self.fewshot_context(
doc,
num_fewshot=0
if self.config.num_fewshot is None
else self.config.num_fewshot,
system_instruction=system_instruction,
apply_chat_template=apply_chat_template,
fewshot_as_multiturn=fewshot_as_multiturn,
chat_template=chat_template,
gen_prefix=self.doc_to_prefix(doc),
)
...
@@ -323,14 +323,29 @@ class AnthropicChat(LocalCompletionsAPI):
)
if system:
messages = messages[1:]
cleaned_messages = []
for msg in messages:
cleaned_msg = {
"role": msg["role"],
"content": [
{"type": msg["type"], msg["type"]: msg["content"]},
],
}
cleaned_messages.append(cleaned_msg)
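# Illustration with hypothetical values: an incoming message such as
#   {"role": "user", "type": "text", "content": "What is 2+2?"}
# is rewritten to the content-block shape the Anthropic Messages API expects:
#   {"role": "user", "content": [{"type": "text", "text": "What is 2+2?"}]}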
gen_kwargs.pop("do_sample", False) gen_kwargs.pop("do_sample", False)
max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks) max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
temperature = gen_kwargs.pop("temperature", 0) temperature = gen_kwargs.pop("temperature", 0)
stop = handle_stop_sequences(gen_kwargs.pop("until", ["\n\nHuman:"]), eos=eos) stop = handle_stop_sequences(gen_kwargs.pop("until", ["\n\nHuman:"]), eos=eos)
if not isinstance(stop, list): if not isinstance(stop, list):
stop = [stop] stop = [stop]
# Filter out empty or whitespace-only stop sequences for Anthropic API
stop = [s for s in stop if s and s.strip()]
out = {
"messages": cleaned_messages,
"model": self.model,
"max_tokens": max_tokens,
"temperature": temperature,
...
@@ -3,7 +3,7 @@ import logging
import os
from datetime import timedelta
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
import jinja2
import torch
@@ -17,8 +17,6 @@ from accelerate import (
from accelerate.utils import get_max_memory
from huggingface_hub import HfApi
from packaging import version
from peft import PeftModel
from peft import __version__ as PEFT_VERSION
from tqdm import tqdm
from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
@@ -40,6 +38,9 @@ from lm_eval.models.utils import (
)
if TYPE_CHECKING:
from transformers.quantizers import AutoQuantizationConfig
eval_logger = logging.getLogger(__name__)
@@ -61,7 +62,7 @@ class HFLM(TemplateLM):
backend: Literal["default", "causal", "seq2seq"] = "default",
# override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq)
revision: Optional[str] = "main",
subfolder: str = "",
tokenizer: Optional[
Union[
str,
@@ -162,14 +163,13 @@ class HFLM(TemplateLM):
)
revision = str(revision)  # cast to string if not already one
# TODO: update this to be less of a hack once subfolder is fixed in HF
revision = revision + ("/" + subfolder if subfolder is not None else "")
self._get_config(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
gguf_file=gguf_file,
subfolder=subfolder,
)
# determine which of 'causal' and 'seq2seq' backends to use for HF models
@@ -182,12 +182,20 @@ class HFLM(TemplateLM):
pretrained,
tokenizer,
revision=revision,
subfolder=subfolder,
trust_remote_code=trust_remote_code,
use_fast_tokenizer=use_fast_tokenizer,
gguf_file=gguf_file,
add_bos_token=add_bos_token,
)
if (
quantization_config := getattr(self.config, "quantization_config", None)
) is not None and isinstance(quantization_config, dict):
from transformers.quantizers import AutoQuantizationConfig
quantization_config = AutoQuantizationConfig.from_dict(quantization_config)
# if we passed `pretrained` as a string, initialize our model now
if isinstance(pretrained, str):
self._create_model(
@@ -205,7 +213,8 @@ class HFLM(TemplateLM):
autogptq=autogptq,
gptqmodel=gptqmodel,
gguf_file=gguf_file,
quantization_config=quantization_config,
subfolder=subfolder,
**kwargs,
)
@@ -522,6 +531,7 @@ class HFLM(TemplateLM):
revision: str = "main",
trust_remote_code: bool = False,
gguf_file: Optional[str] = None,
subfolder: str = "",
) -> None:
"""Return the model config for HuggingFace models"""
self._config = transformers.AutoConfig.from_pretrained(
@@ -529,6 +539,7 @@ class HFLM(TemplateLM):
revision=revision,
trust_remote_code=trust_remote_code,
gguf_file=gguf_file,
subfolder=subfolder,
)
def _create_model(
@@ -551,7 +562,8 @@ class HFLM(TemplateLM):
autogptq: Optional[Union[bool, str]] = False,
gptqmodel: Optional[bool] = False,
gguf_file: Optional[str] = None,
quantization_config: Optional["AutoQuantizationConfig"] = None,
subfolder: str = "",
**kwargs,
) -> None:
"""
@@ -598,6 +610,7 @@ class HFLM(TemplateLM):
trust_remote_code=trust_remote_code,
gguf_file=gguf_file,
quantization_config=quantization_config,
subfolder=subfolder,
**model_kwargs,
)
else:
@@ -644,6 +657,9 @@ class HFLM(TemplateLM):
)
if peft:
from peft import PeftModel
from peft import __version__ as PEFT_VERSION
if model_kwargs.get("load_in_4bit", None):
if version.parse(PEFT_VERSION) < version.parse("0.4.0"):
raise AssertionError("load_in_4bit requires peft >= 0.4.0")
@@ -697,6 +713,7 @@ class HFLM(TemplateLM):
use_fast_tokenizer: Optional[bool] = True,
gguf_file: Optional[str] = None,
add_bos_token: Optional[bool] = False,
subfolder: Optional[str] = "",
) -> None:
"""
Helper method during initialization.
@@ -710,7 +727,7 @@ class HFLM(TemplateLM):
}
# gguf format embeds tokenizer and is not compatible with hf tokenizer `use_fast` param
if not tokenizer and gguf_file is not None:
kwargs["gguf_file"] = gguf_file
else:
kwargs["use_fast"] = use_fast_tokenizer
@@ -718,6 +735,9 @@ class HFLM(TemplateLM):
if add_bos_token:
kwargs["add_bos_token"] = True
if subfolder:
kwargs["subfolder"] = subfolder
if tokenizer:
if isinstance(tokenizer, str):
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
...
@@ -10,6 +10,7 @@ from queue import Empty
from time import sleep
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
import jinja2
from more_itertools import distribute
from packaging.version import parse as parse_version
from tqdm import tqdm
@@ -300,14 +301,27 @@ class VLLM(TemplateLM):
"""
Method to apply a chat template to a list of chat history between user and model.
"""
try:
chat_templated = self.tokenizer.apply_chat_template(
chat_history,
tokenize=False,
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
chat_template=self.hf_chat_template,
enable_thinking=self.enable_thinking,
)
except jinja2.exceptions.TemplateError:
eval_logger.warning(
"Failed to apply chat template. removing the system role in chat history."
)
chat_templated = self.tokenizer.apply_chat_template(
[msg for msg in chat_history if msg["role"] != "system"],
tokenize=False,
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
chat_template=self.hf_chat_template,
enable_thinking=self.enable_thinking,
)
return chat_templated
...
@@ -150,6 +150,7 @@
| [translation](translation/README.md) | Tasks focused on evaluating the language translation capabilities of models. | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese |
| [triviaqa](triviaqa/README.md) | A large-scale dataset for trivia question answering to test general knowledge. | English |
| [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English |
| [truthfulqa-multi](truthfulqa-multi/README.md) | A multilingual version of TruthfulQA, a QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English, Spanish, Catalan, Basque, Galician |
| [turkishmmlu](turkishmmlu/README.md) | A multiple-choice QA test modeled after MMLU, written in Turkish based on Turkish high-school level exams. | Turkish |
| [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English |
| [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English |
...
@@ -50,3 +50,5 @@ If other tasks on this dataset are already supported:
### Changelog
v2 20-MAR-2025: `humaneval_instruct`, `humaneval_instruct_64`: fixed typo in gen_prefix
v3 30-JUN-2025: Updated prompt generation and output parsing to align with the official `Llama-3.1-70B-Instruct-evals`. This corrects the prompt format and fixes a bug in locating the code block. See PR [#3092](https://github.com/EleutherAI/lm-evaluation-harness/pull/3092).
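To make the v3 fix concrete: a toy illustration (not real model output) of why the updated `utils.build_predictions_instruct` shown further down cuts the response at the first closing code fence with `str.find` rather than the last one with `str.rfind`:

```python
# Toy response: generated code, then explanatory prose with a second fenced block.
resp = "    return sorted(nums)\n```\n\nExplanation:\n```text\nsorts the list\n```"

first_cut = resp[: resp.find("```")]   # v3: keeps only the code completion
last_cut = resp[: resp.rfind("```")]   # v2: also keeps the prose after the code

print(repr(first_cut))  # '    return sorted(nums)\n'
print(repr(last_cut))   # still contains "Explanation:" and the second fenced block
```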
include: humaneval.yaml
task: humaneval_instruct
doc_to_text: "Write a solution to the following problem and make sure that it passes the tests:\n```python\n{{ prompt }}\n```\n "
gen_prefix: "Here is the completed function:\n```python\n{{ prompt }}\n "
filter_list:
- name: "create_test"
filter:
- function: "custom"
filter_fn: !function utils.build_predictions_instruct
metadata:
version: 3.0
@@ -32,7 +32,7 @@ def build_predictions_instruct(
) -> list[list[str]]:
return [
[
doc["prompt"] + (r if r.find("```") == -1 else r[: r.find("```")])
for r in resp
]
for resp, doc in zip(resps, docs)
...
# TruthfulQA-Multi
## Paper
Title: `Truth Knows No Language: Evaluating Truthfulness Beyond English`
Abstract: [https://arxiv.org/abs/2502.09387v1](https://arxiv.org/abs/2502.09387v1)
We introduce a professionally translated extension of the TruthfulQA benchmark designed to evaluate truthfulness in Basque, Catalan, Galician, and Spanish. Truthfulness evaluations of large language models (LLMs) have primarily been conducted in English. However, the ability of LLMs to maintain truthfulness across languages remains under-explored. Our study evaluates 12 state-of-the-art open LLMs, comparing base and instruction-tuned models using human evaluation, multiple-choice metrics, and LLM-as-a-Judge scoring. Our findings reveal that, while LLMs perform best in English and worst in Basque (the lowest-resourced language), overall truthfulness discrepancies across languages are smaller than anticipated. Furthermore, we show that LLM-as-a-Judge correlates more closely with human judgments than multiple-choice metrics, and that informativeness plays a critical role in truthfulness assessment. Our results also indicate that machine translation provides a viable approach for extending truthfulness benchmarks to additional languages, offering a scalable alternative to professional translation. Finally, we observe that universal knowledge questions are better handled across languages than context- and time-dependent ones, highlighting the need for truthfulness evaluations that account for cultural and temporal variability. Dataset and code are publicly available under open licenses.
### Citation
```text
@misc{figueras2025truthknowslanguageevaluating,
title={Truth Knows No Language: Evaluating Truthfulness Beyond English},
author={Blanca Calvo Figueras and Eneko Sagarzazu and Julen Etxaniz and Jeremy Barnes and Pablo Gamallo and Iria De Dios Flores and Rodrigo Agerri},
year={2025},
eprint={2502.09387},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2502.09387},
}
```
### Groups, Tags, and Tasks
#### Groups
* `truthfulqa`: This task follows the [TruthfulQA dataset](https://arxiv.org/abs/2109.07958), but expands it to new languages.
#### Tasks
* `truthfulqa-multi_mc2_es`: Multiple-choice, multiple answers, in Spanish
* `truthfulqa-multi_gen_es`: Answer generation in Spanish
* `truthfulqa-multi_mc2_ca`: Multiple-choice, multiple answers, in Catalan
* `truthfulqa-multi_gen_ca`: Answer generation in Catalan
* `truthfulqa-multi_mc2_eu`: Multiple-choice, multiple answers, in Basque
* `truthfulqa-multi_gen_eu`: Answer generation in Basque
* `truthfulqa-multi_mc2_gl`: Multiple-choice, multiple answers, in Galician
* `truthfulqa-multi_gen_gl`: Answer generation in Galician
* `truthfulqa-multi_mc2_en`: Multiple-choice, multiple answers, in English
* `truthfulqa-multi_gen_en`: Answer generation in English
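The task names above plug directly into the harness. A minimal sketch (the checkpoint path is a placeholder) running the Spanish generation and multiple-choice variants:

```python
import lm_eval
from lm_eval.utils import make_table

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=/path/to/model",  # placeholder checkpoint
    tasks=["truthfulqa-multi_gen_es", "truthfulqa-multi_mc2_es"],
)
print(make_table(results))
```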
### Checklist
For adding novel benchmarks/datasets to the library:
* [X] Is the task an existing benchmark in the literature?
* [X] Have you referenced the original paper that introduced the task?
* [X] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
### Changelog
include: truthfulqa-multi_gen_common
task: truthfulqa-multi_gen_ca
dataset_name: ca
tag:
- truthfulqa_multi
dataset_path: HiTZ/truthfulqa-multi
output_type: generate_until
generation_kwargs:
until:
- "!\n\n"
- "Q:"
- ".\n\n"
training_split: train
validation_split: validation
test_split: null
doc_to_target: "{{'A: ' + best_answer}}"
fewshot_split: train
fewshot_config:
sampler: first_n
process_docs: !function utils.process_docs_gen
process_results: !function utils.process_results_gen
doc_to_text: "{{'Q: ' + question}}"
should_decontaminate: True
doc_to_decontamination_query: question
metric_list:
# - metric: bleurt_max
# aggregation: mean
# higher_is_better: true
# - metric: bleurt_acc
# aggregation: mean
# higher_is_better: true
# - metric: bleurt_diff
# aggregation: mean
# higher_is_better: true
- metric: bleu_max
aggregation: mean
higher_is_better: true
- metric: bleu_acc
aggregation: mean
higher_is_better: true
- metric: bleu_diff
aggregation: mean
higher_is_better: true
#- metric: rouge1_max
# aggregation: mean
# higher_is_better: true
#- metric: rouge1_acc
# aggregation: mean
# higher_is_better: true
# - metric: rouge1_diff
# aggregation: mean
# higher_is_better: true
# - metric: rouge2_max
# aggregation: mean
# higher_is_better: true
# - metric: rouge2_acc
# aggregation: mean
# higher_is_better: true
# - metric: rouge2_diff
# aggregation: mean
# higher_is_better: true
# - metric: rougeL_max
# aggregation: mean
# higher_is_better: true
# - metric: rougeL_acc
# aggregation: mean
# higher_is_better: true
# - metric: rougeL_diff
# aggregation: mean
# higher_is_better: true
metadata:
version: 3.0
include: truthfulqa-multi_gen_common
task: truthfulqa-multi_gen_en
dataset_name: en
include: truthfulqa-multi_gen_common
task: truthfulqa-multi_gen_es
dataset_name: es
include: truthfulqa-multi_gen_common
task: truthfulqa-multi_gen_eu
dataset_name: eu
include: truthfulqa-multi_gen_common
task: truthfulqa-multi_gen_gl
dataset_name: gl
include: truthfulqa-multi_mc_common
task: truthfulqa-multi_mc1_ca
dataset_name: ca
include: truthfulqa-multi_mc_common
task: truthfulqa-multi_mc1_en
dataset_name: en