Commit bf11ac93 authored by Baber's avatar Baber
Browse files

Merge branch 'main' into llama

parents 83b1c564 ade01428
description: "The following are multiple choice questions (with answers) about psychology. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n"
include: "_default_template_yaml"
task: "mmlu_pro_plus_psychology"
task_alias: "psychology"
process_docs: !function utils.process_psychology
from functools import partial
# Option labels A..P — MMLU-Pro(+) questions carry up to 16 answer options.
choices = [chr(ord("A") + offset) for offset in range(16)]


def format_cot_example(example, including_answer=True):
    """Render one MMLU-Pro example as a chain-of-thought prompt.

    Args:
        example: mapping with "question" (str), "options" (list of option
            strings, at most 16), and — when ``including_answer`` is True —
            "cot_content" (the worked chain-of-thought answer).
        including_answer: if True, append the example's chain-of-thought
            answer (few-shot rendering); if False, end the prompt with the
            "Answer: Let's think step by step." cue for the model to complete.

    Returns:
        The fully formatted prompt string.
    """
    lines = ["Question:", example["question"], "Options:"]
    for idx, option in enumerate(example["options"]):
        lines.append(f"{choices[idx]}. {option}")
    prompt = "\n".join(lines) + "\n"
    if including_answer:
        # Dataset stores the worked solution prefixed "A: ..."; relabel it so
        # the prompt reads "Answer: ..." consistently with the no-answer cue.
        worked = example["cot_content"].replace(
            "A: Let's think step by step.", "Answer: Let's think step by step."
        )
        prompt += worked + "\n\n"
    else:
        prompt += "Answer: Let's think step by step."
    return prompt
# Prompt builders referenced by the task YAML configs: `doc_to_text` renders
# the question ending in the "Answer: Let's think step by step." cue, while
# `fewshot_to_text` also appends the worked chain-of-thought answer.
doc_to_text = partial(format_cot_example, including_answer=False)
fewshot_to_text = partial(format_cot_example, including_answer=True)
def process_docs(dataset, subject):
    """Return *dataset* restricted to rows whose "category" equals *subject*."""

    def _has_subject(doc):
        # Exact string match against the dataset's "category" column.
        return doc["category"] == subject

    return dataset.filter(_has_subject)
# One pre-bound filter per MMLU-Pro category. Each callable takes a dataset
# and keeps only the rows whose "category" field matches the bound subject;
# task YAML files reference them via `process_docs: !function utils.process_<subject>`
# (e.g. L11 above uses `utils.process_psychology`). Kept as explicit
# assignments so each name is statically discoverable by the YAML loader.
process_biology = partial(process_docs, subject="biology")
process_business = partial(process_docs, subject="business")
process_chemistry = partial(process_docs, subject="chemistry")
process_computer_science = partial(process_docs, subject="computer science")
process_economics = partial(process_docs, subject="economics")
process_engineering = partial(process_docs, subject="engineering")
process_health = partial(process_docs, subject="health")
process_history = partial(process_docs, subject="history")
process_law = partial(process_docs, subject="law")
process_math = partial(process_docs, subject="math")
process_other = partial(process_docs, subject="other")
process_philosophy = partial(process_docs, subject="philosophy")
process_physics = partial(process_docs, subject="physics")
process_psychology = partial(process_docs, subject="psychology")
......@@ -11,7 +11,7 @@ import yaml
from tqdm import tqdm
eval_logger = logging.getLogger("lm-eval")
eval_logger = logging.getLogger(__name__)
SUBJECTS = {
......
......@@ -10,7 +10,7 @@ import yaml
from tqdm import tqdm
eval_logger = logging.getLogger("lm-eval")
eval_logger = logging.getLogger(__name__)
SUBJECTS = {
......
tag:
- moral_stories
task: moral_stories
dataset_path: demelin/moral_stories
dataset_name: full
......
......@@ -14,7 +14,40 @@ The datasets included in PortugueseBench are:
### Citation
Paper for PortugueseBench coming soon.
```
@inproceedings{baucells-etal-2025-iberobench,
title = "{I}bero{B}ench: A Benchmark for {LLM} Evaluation in {I}berian Languages",
author = "Baucells, Irene and
Aula-Blasco, Javier and
de-Dios-Flores, Iria and
Paniagua Su{\'a}rez, Silvia and
Perez, Naiara and
Salles, Anna and
Sotelo Docio, Susana and
Falc{\~a}o, J{\'u}lia and
Saiz, Jose Javier and
Sepulveda Torres, Robiert and
Barnes, Jeremy and
Gamallo, Pablo and
Gonzalez-Agirre, Aitor and
Rigau, German and
Villegas, Marta",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.699/",
pages = "10491--10519",
}
```
### Groups and Tasks
......
......@@ -15,6 +15,7 @@ The datasets included in SpanishBench that have been made public in previous pub
| Task | Category | Paper title | Homepage |
|:-------------:|:-----:|:-------------:|:-----:|
| Belebele_es | Reading Comprehension | [The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants](https://arxiv.org/abs/2308.16884) | https://huggingface.co/datasets/facebook/belebele |
| Cocoteros_es | Commonsense Reasoning | [COCOTEROS: A Spanish Corpus with Contextual Knowledge for Natural Language Generation](https://besaya.infor.uva.es/sepln24/paper04.pdf) | https://huggingface.co/datasets/gplsi/cocoteros |
| EsCoLA | Linguistic Acceptability | [EsCoLA: Spanish Corpus of Linguistic Acceptability](https://aclanthology.org/2024.lrec-main.554/) | https://huggingface.co/datasets/nbel/EsCoLA |
| FLORES_es | Translation | [The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation](https://arxiv.org/abs/2106.03193) | https://huggingface.co/datasets/facebook/flores |
| MGSM_es | Math | [Language Models are Multilingual Chain-of-Thought Reasoners](https://arxiv.org/abs/2210.03057) | https://huggingface.co/datasets/juletxara/mgsm |
......@@ -28,7 +29,40 @@ The datasets included in SpanishBench that have been made public in previous pub
### Citation
Paper for SpanishBench coming soon.
```
@inproceedings{baucells-etal-2025-iberobench,
title = "{I}bero{B}ench: A Benchmark for {LLM} Evaluation in {I}berian Languages",
author = "Baucells, Irene and
Aula-Blasco, Javier and
de-Dios-Flores, Iria and
Paniagua Su{\'a}rez, Silvia and
Perez, Naiara and
Salles, Anna and
Sotelo Docio, Susana and
Falc{\~a}o, J{\'u}lia and
Saiz, Jose Javier and
Sepulveda Torres, Robiert and
Barnes, Jeremy and
Gamallo, Pablo and
Gonzalez-Agirre, Aitor and
Rigau, German and
Villegas, Marta",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.699/",
pages = "10491--10519",
}
```
### Groups and Tasks
......@@ -44,6 +78,7 @@ Paper for SpanishBench coming soon.
The following tasks evaluate tasks on SpanishBench dataset using various scoring methods.
- `belebele_spa_Latn`
- `cocoteros_es`
- `copa_es`
- `escola`
- `flores_es`
......
task: cocoteros_es
dataset_path: gplsi/cocoteros
dataset_name: null
output_type: generate_until
doc_to_text: "Genera una frase corta con estas palabras: {{keywords}}. El contexto es: {{context}} \n\nRespuesta:"
doc_to_target: "{{text}}"
training_split: train
test_split: test
target_delimiter: ' '
generation_kwargs:
max_gen_toks: 40
until:
- "\n"
metric_list:
- metric: bleu
aggregation: bleu
higher_is_better: true
- metric: !function utils.rouge1
aggregation: !function utils.rouge1_agg
higher_is_better: true
metadata:
version: 1.0
......@@ -13,5 +13,6 @@ task:
- mgsm_direct_es_spanish_bench
- flores_es
- phrases_es
- cocoteros_es
metadata:
version: 1.0
......@@ -33,7 +33,9 @@ class SQUADCompletion(ConfigurableTask):
def doc_to_target(self, doc):
return doc["value"]
def construct_requests(self, doc, ctx, **kwargs):
def construct_requests(
self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
......
......@@ -105,7 +105,9 @@ class SQuAD2(ConfigurableTask):
answer = "unanswerable"
return " " + answer
def construct_requests(self, doc, ctx, **kwargs):
def construct_requests(
self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
......
......@@ -33,7 +33,9 @@ class SWDE(ConfigurableTask):
def doc_to_target(self, doc):
return doc["value"]
def construct_requests(self, doc, ctx, **kwargs):
def construct_requests(
self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
):
"""Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
......
......@@ -9,7 +9,7 @@ fewshot_config:
output_type: multiple_choice
doc_to_text: "Soru: {{ question.strip() }}\nA. {{ choices[0] }}\nB. {{ choices[1] }}\nC. {{ choices[2] }}\nD. {{ choices[3] }}\nE. {{ choices[4] }}\nCevap:"
doc_to_choice: ["A", "B", "C", "D", "E"]
doc_to_target: "{{['A', 'B', 'C', 'D', 'E'].index(answer)}}"
doc_to_target: "{{ answer.strip() }}"
metric_list:
- metric: acc
aggregation: mean
......
......@@ -109,6 +109,7 @@ class Unitxt(ConfigurableTask):
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
chat_template: Optional[Callable] = None,
gen_prefix: Optional[str] = None,
) -> str:
source = self.doc_to_text(doc)
if isinstance(source, list):
......@@ -134,6 +135,7 @@ class Unitxt(ConfigurableTask):
part of the document for `doc`.
"""
kwargs.pop("apply_chat_template", False) # Not used by unitxt
kwargs.pop("chat_template", False) # Not used by unitxt
return [
Instance(
request_type="generate_until",
......
......@@ -17,13 +17,6 @@ import yaml
from jinja2 import BaseLoader, Environment, StrictUndefined
logging.basicConfig(
format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
datefmt="%Y-%m-%d:%H:%M:%S",
level=logging.INFO,
)
eval_logger = logging.getLogger("lm-eval")
SPACING = " " * 47
HIGHER_IS_BETTER_SYMBOLS = {
......@@ -32,6 +25,33 @@ HIGHER_IS_BETTER_SYMBOLS = {
}
def setup_logging(verbosity=logging.INFO):
# Configure the root logger
log_level = os.environ.get("LOGLEVEL", verbosity) or verbosity
level_map = {
"DEBUG": logging.DEBUG,
"INFO": logging.INFO,
"WARNING": logging.WARNING,
"ERROR": logging.ERROR,
"CRITICAL": logging.CRITICAL,
}
log_level = level_map.get(str(log_level).upper(), logging.INFO)
if not logging.root.handlers:
logging.basicConfig(
format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(name)s:%(lineno)d] %(message)s",
datefmt="%Y-%m-%d:%H:%M:%S",
level=log_level,
)
if log_level == logging.DEBUG:
third_party_loggers = ["urllib3", "filelock", "fsspec"]
for logger_name in third_party_loggers:
logging.getLogger(logger_name).setLevel(logging.INFO)
else:
logging.getLogger().setLevel(log_level)
def hash_string(string: str) -> str:
return hashlib.sha256(string.encode("utf-8")).hexdigest()
......
......@@ -66,7 +66,7 @@ ibm_watsonx_ai = ["ibm_watsonx_ai>=1.1.22"]
ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
neuronx = ["optimum[neuronx]"]
mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
math = ["sympy>=1.12", "antlr4-python3-runtime==4.11", "math_verify[antlr4_11_0]"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
optimum = ["optimum[openvino]"]
promptsource = ["promptsource>=0.2.3"]
......
import argparse
import logging
import os
import yaml
from promptsource.templates import DatasetTemplates
from tqdm import tqdm
# from lm_eval.api.registry import ALL_TASKS
from lm_eval.logger import eval_logger
eval_logger = logging.getLogger(__name__)
# from lm_eval.tasks import include_task_folder
......
......@@ -10,7 +10,6 @@ import os
from pytablewriter import LatexTableWriter, MarkdownTableWriter
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
......
......@@ -11,7 +11,6 @@ from pytablewriter import MarkdownTableWriter
from lm_eval import tasks
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
......
import argparse
import logging
import os
from typing import Dict, List, Tuple
......@@ -8,11 +9,11 @@ import torch
import lm_eval.evaluator
import lm_eval.models.utils
from lm_eval import tasks, utils
from lm_eval import tasks
os.environ["TOKENIZERS_PARALLELISM"] = "false"
eval_logger = utils.eval_logger
eval_logger = logging.getLogger(__name__)
def memory_stats():
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment