"tests/squeezebert/test_modeling_squeezebert.py" did not exist on "505f2d749eb52f4b8b803d8c9a5f04442446e6c2"
Commit bf11ac93 authored by Baber

Merge branch 'main' into llama

parents 83b1c564 ade01428
description: "The following are multiple choice questions (with answers) about psychology. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice.\n"
include: "_default_template_yaml"
task: "mmlu_pro_plus_psychology"
task_alias: "psychology"
process_docs: !function utils.process_psychology
from functools import partial
choices = [
    "A",
    "B",
    "C",
    "D",
    "E",
    "F",
    "G",
    "H",
    "I",
    "J",
    "K",
    "L",
    "M",
    "N",
    "O",
    "P",
]
def format_cot_example(example, including_answer=True):
    prompt = "Question:\n"
    question = example["question"]
    options = example["options"]
    prompt += question + "\n"
    prompt += "Options:\n"
    for i, opt in enumerate(options):
        prompt += "{}. {}\n".format(choices[i], opt)
    if including_answer:
        cot_content = example["cot_content"].replace(
            "A: Let's think step by step.", "Answer: Let's think step by step."
        )
        prompt += cot_content + "\n\n"
    else:
        prompt += "Answer: Let's think step by step."
    return prompt
doc_to_text = partial(format_cot_example, including_answer=False)
fewshot_to_text = partial(format_cot_example, including_answer=True)
def process_docs(dataset, subject):
    return dataset.filter(lambda x: x["category"] == subject)
process_biology = partial(process_docs, subject="biology")
process_business = partial(process_docs, subject="business")
process_chemistry = partial(process_docs, subject="chemistry")
process_computer_science = partial(process_docs, subject="computer science")
process_economics = partial(process_docs, subject="economics")
process_engineering = partial(process_docs, subject="engineering")
process_health = partial(process_docs, subject="health")
process_history = partial(process_docs, subject="history")
process_law = partial(process_docs, subject="law")
process_math = partial(process_docs, subject="math")
process_other = partial(process_docs, subject="other")
process_philosophy = partial(process_docs, subject="philosophy")
process_physics = partial(process_docs, subject="physics")
process_psychology = partial(process_docs, subject="psychology")
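As a sanity check, here is how the helpers above behave on a made-up record with the expected fields (the record is illustrative, not taken from the dataset):

```python
# Illustrative record only; the field names match what format_cot_example reads.
doc = {
    "category": "psychology",
    "question": "Which lobe of the brain primarily processes visual input?",
    "options": ["Frontal", "Temporal", "Occipital", "Parietal"],
    "cot_content": "A: Let's think step by step. ... the answer is (C).",
}

print(doc_to_text(doc))
# Question:
# Which lobe of the brain primarily processes visual input?
# Options:
# A. Frontal
# B. Temporal
# C. Occipital
# D. Parietal
# Answer: Let's think step by step.
```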
@@ -11,7 +11,7 @@ import yaml
 from tqdm import tqdm
-eval_logger = logging.getLogger("lm-eval")
+eval_logger = logging.getLogger(__name__)
 SUBJECTS = {
......
@@ -10,7 +10,7 @@ import yaml
 from tqdm import tqdm
-eval_logger = logging.getLogger("lm-eval")
+eval_logger = logging.getLogger(__name__)
 SUBJECTS = {
......
+tag:
+  - moral_stories
 task: moral_stories
 dataset_path: demelin/moral_stories
 dataset_name: full
......
@@ -14,7 +14,40 @@ The datasets included in PortugueseBench are:
 ### Citation
-Paper for PortugueseBench coming soon.
+```
+@inproceedings{baucells-etal-2025-iberobench,
+    title = "{I}bero{B}ench: A Benchmark for {LLM} Evaluation in {I}berian Languages",
+    author = "Baucells, Irene and
+      Aula-Blasco, Javier and
+      de-Dios-Flores, Iria and
+      Paniagua Su{\'a}rez, Silvia and
+      Perez, Naiara and
+      Salles, Anna and
+      Sotelo Docio, Susana and
+      Falc{\~a}o, J{\'u}lia and
+      Saiz, Jose Javier and
+      Sepulveda Torres, Robiert and
+      Barnes, Jeremy and
+      Gamallo, Pablo and
+      Gonzalez-Agirre, Aitor and
+      Rigau, German and
+      Villegas, Marta",
+    editor = "Rambow, Owen and
+      Wanner, Leo and
+      Apidianaki, Marianna and
+      Al-Khalifa, Hend and
+      Eugenio, Barbara Di and
+      Schockaert, Steven",
+    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
+    month = jan,
+    year = "2025",
+    address = "Abu Dhabi, UAE",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2025.coling-main.699/",
+    pages = "10491--10519",
+}
+```
 ### Groups and Tasks
......
@@ -15,6 +15,7 @@ The datasets included in SpanishBench that have been made public in previous pub
 | Task | Category | Paper title | Homepage |
 |:-------------:|:-----:|:-------------:|:-----:|
 | Belebele_es | Reading Comprehension | [The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants](https://arxiv.org/abs/2308.16884) | https://huggingface.co/datasets/facebook/belebele |
+| Cocoteros_es | Commonsense Reasoning | [COCOTEROS: A Spanish Corpus with Contextual Knowledge for Natural Language Generation](https://besaya.infor.uva.es/sepln24/paper04.pdf) | https://huggingface.co/datasets/gplsi/cocoteros |
 | EsCoLA | Linguistic Acceptability | [EsCoLA: Spanish Corpus of Linguistic Acceptability](https://aclanthology.org/2024.lrec-main.554/) | https://huggingface.co/datasets/nbel/EsCoLA |
 | FLORES_es | Translation | [The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation](https://arxiv.org/abs/2106.03193) | https://huggingface.co/datasets/facebook/flores |
 | MGSM_es | Math | [Language Models are Multilingual Chain-of-Thought Reasoners](https://arxiv.org/abs/2210.03057) | https://huggingface.co/datasets/juletxara/mgsm |
@@ -28,7 +29,40 @@ The datasets included in SpanishBench that have been made public in previous pub
 ### Citation
-Paper for SpanishBench coming soon.
+```
+@inproceedings{baucells-etal-2025-iberobench,
+    title = "{I}bero{B}ench: A Benchmark for {LLM} Evaluation in {I}berian Languages",
+    author = "Baucells, Irene and
+      Aula-Blasco, Javier and
+      de-Dios-Flores, Iria and
+      Paniagua Su{\'a}rez, Silvia and
+      Perez, Naiara and
+      Salles, Anna and
+      Sotelo Docio, Susana and
+      Falc{\~a}o, J{\'u}lia and
+      Saiz, Jose Javier and
+      Sepulveda Torres, Robiert and
+      Barnes, Jeremy and
+      Gamallo, Pablo and
+      Gonzalez-Agirre, Aitor and
+      Rigau, German and
+      Villegas, Marta",
+    editor = "Rambow, Owen and
+      Wanner, Leo and
+      Apidianaki, Marianna and
+      Al-Khalifa, Hend and
+      Eugenio, Barbara Di and
+      Schockaert, Steven",
+    booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
+    month = jan,
+    year = "2025",
+    address = "Abu Dhabi, UAE",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2025.coling-main.699/",
+    pages = "10491--10519",
+}
+```
 ### Groups and Tasks
@@ -44,6 +78,7 @@ Paper for SpanishBench coming soon.
 The following tasks evaluate tasks on SpanishBench dataset using various scoring methods.
 - `belebele_spa_Latn`
+- `cocoteros_es`
 - `copa_es`
 - `escola`
 - `flores_es`
......
task: cocoteros_es
dataset_path: gplsi/cocoteros
dataset_name: null
output_type: generate_until
doc_to_text: "Genera una frase corta con estas palabras: {{keywords}}. El contexto es: {{context}} \n\nRespuesta:"
doc_to_target: "{{text}}"
training_split: train
test_split: test
target_delimiter: ' '
generation_kwargs:
  max_gen_toks: 40
  until:
    - "\n"
metric_list:
  - metric: bleu
    aggregation: bleu
    higher_is_better: true
  - metric: !function utils.rouge1
    aggregation: !function utils.rouge1_agg
    higher_is_better: true
metadata:
  version: 1.0
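The `!function utils.rouge1` hooks point at helpers in the task's own utils.py, which this diff does not show. Below is a minimal sketch of what they could look like, under two assumptions: that the harness calls generation metrics with `references`/`predictions` keyword lists and aggregates a list of per-example floats, and that the `rouge_score` package is available.

```python
# Sketch only: the real utils.py for cocoteros_es may differ.
from rouge_score import rouge_scorer

_scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=False)


def rouge1(references, predictions, **kwargs):
    # Per-example ROUGE-1 F1 between the gold text and the generation.
    return _scorer.score(references[0], predictions[0])["rouge1"].fmeasure


def rouge1_agg(items):
    # Aggregate the per-example scores by averaging.
    return sum(items) / len(items)
```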
@@ -13,5 +13,6 @@ task:
 - mgsm_direct_es_spanish_bench
 - flores_es
 - phrases_es
+- cocoteros_es
 metadata:
   version: 1.0
@@ -33,7 +33,9 @@ class SQUADCompletion(ConfigurableTask):
     def doc_to_target(self, doc):
         return doc["value"]
-    def construct_requests(self, doc, ctx, **kwargs):
+    def construct_requests(
+        self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
+    ):
         """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.
......
@@ -105,7 +105,9 @@ class SQuAD2(ConfigurableTask):
             answer = "unanswerable"
         return " " + answer
-    def construct_requests(self, doc, ctx, **kwargs):
+    def construct_requests(
+        self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
+    ):
         """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.
......
@@ -33,7 +33,9 @@ class SWDE(ConfigurableTask):
     def doc_to_target(self, doc):
         return doc["value"]
-    def construct_requests(self, doc, ctx, **kwargs):
+    def construct_requests(
+        self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
+    ):
         """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.
......
@@ -9,7 +9,7 @@ fewshot_config:
 output_type: multiple_choice
 doc_to_text: "Soru: {{ question.strip() }}\nA. {{ choices[0] }}\nB. {{ choices[1] }}\nC. {{ choices[2] }}\nD. {{ choices[3] }}\nE. {{ choices[4] }}\nCevap:"
 doc_to_choice: ["A", "B", "C", "D", "E"]
-doc_to_target: "{{['A', 'B', 'C', 'D', 'E'].index(answer)}}"
+doc_to_target: "{{ answer.strip() }}"
 metric_list:
   - metric: acc
     aggregation: mean
......
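For context, the retired template converted the answer letter into its index in the choice list, while the new one emits the stripped letter itself, which already appears in `doc_to_choice`. A quick jinja2 illustration (presumably the motivation is robustness to stray whitespace in the dataset's `answer` field):

```python
from jinja2 import Template

old = Template("{{['A', 'B', 'C', 'D', 'E'].index(answer)}}")
new = Template("{{ answer.strip() }}")

print(old.render(answer="B"))   # -> "1" (index into doc_to_choice)
print(new.render(answer="B "))  # -> "B"; .index("B ") would raise instead
```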
@@ -109,6 +109,7 @@ class Unitxt(ConfigurableTask):
         apply_chat_template: bool = False,
         fewshot_as_multiturn: bool = False,
         chat_template: Optional[Callable] = None,
+        gen_prefix: Optional[str] = None,
     ) -> str:
         source = self.doc_to_text(doc)
         if isinstance(source, list):
@@ -134,6 +135,7 @@ class Unitxt(ConfigurableTask):
         part of the document for `doc`.
         """
         kwargs.pop("apply_chat_template", False)  # Not used by unitxt
+        kwargs.pop("chat_template", False)  # Not used by unitxt
         return [
             Instance(
                 request_type="generate_until",
......
@@ -17,13 +17,6 @@ import yaml
 from jinja2 import BaseLoader, Environment, StrictUndefined
-logging.basicConfig(
-    format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
-    datefmt="%Y-%m-%d:%H:%M:%S",
-    level=logging.INFO,
-)
-eval_logger = logging.getLogger("lm-eval")
 SPACING = " " * 47
 HIGHER_IS_BETTER_SYMBOLS = {
@@ -32,6 +25,33 @@ HIGHER_IS_BETTER_SYMBOLS = {
 }
+def setup_logging(verbosity=logging.INFO):
+    # Configure the root logger
+    log_level = os.environ.get("LOGLEVEL", verbosity) or verbosity
+    level_map = {
+        "DEBUG": logging.DEBUG,
+        "INFO": logging.INFO,
+        "WARNING": logging.WARNING,
+        "ERROR": logging.ERROR,
+        "CRITICAL": logging.CRITICAL,
+    }
+    log_level = level_map.get(str(log_level).upper(), logging.INFO)
+    if not logging.root.handlers:
+        logging.basicConfig(
+            format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(name)s:%(lineno)d] %(message)s",
+            datefmt="%Y-%m-%d:%H:%M:%S",
+            level=log_level,
+        )
+        if log_level == logging.DEBUG:
+            third_party_loggers = ["urllib3", "filelock", "fsspec"]
+            for logger_name in third_party_loggers:
+                logging.getLogger(logger_name).setLevel(logging.INFO)
+    else:
+        logging.getLogger().setLevel(log_level)
 def hash_string(string: str) -> str:
     return hashlib.sha256(string.encode("utf-8")).hexdigest()
......
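With the module-level `basicConfig` removed, logging is now configured explicitly at startup. A hypothetical entry-point usage of the new helper (the `lm_eval.utils` import path is assumed from the surrounding context):

```python
import logging

from lm_eval.utils import setup_logging  # assumed location of the new helper

setup_logging(verbosity=logging.DEBUG)  # a LOGLEVEL env var, if set, takes precedence
logger = logging.getLogger(__name__)
logger.debug("debug logging on; urllib3/filelock/fsspec are kept at INFO")
```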
@@ -66,7 +66,7 @@ ibm_watsonx_ai = ["ibm_watsonx_ai>=1.1.22"]
 ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
 neuronx = ["optimum[neuronx]"]
 mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
-math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
+math = ["sympy>=1.12", "antlr4-python3-runtime==4.11", "math_verify[antlr4_11_0]"]
 multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
 optimum = ["optimum[openvino]"]
 promptsource = ["promptsource>=0.2.3"]
......
 import argparse
+import logging
 import os
 import yaml
 from promptsource.templates import DatasetTemplates
 from tqdm import tqdm
 # from lm_eval.api.registry import ALL_TASKS
-from lm_eval.logger import eval_logger
+eval_logger = logging.getLogger(__name__)
 # from lm_eval.tasks import include_task_folder
......
@@ -10,7 +10,6 @@ import os
 from pytablewriter import LatexTableWriter, MarkdownTableWriter
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
......
@@ -11,7 +11,6 @@ from pytablewriter import MarkdownTableWriter
 from lm_eval import tasks
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
......
 import argparse
+import logging
 import os
 from typing import Dict, List, Tuple
@@ -8,11 +9,11 @@ import torch
 import lm_eval.evaluator
 import lm_eval.models.utils
-from lm_eval import tasks, utils
+from lm_eval import tasks
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-eval_logger = utils.eval_logger
+eval_logger = logging.getLogger(__name__)
 def memory_stats():
......
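A recurring change across these files swaps the shared `"lm-eval"` logger (and the removed per-script `basicConfig` calls) for per-module loggers. A minimal illustration of why that pattern helps:

```python
import logging

# getLogger(__name__) yields a dotted name such as "lm_eval.models.utils",
# so loggers nest under the package and can be tuned as one hierarchy:
logger = logging.getLogger(__name__)

# One call adjusts every lm_eval.* logger without touching other libraries.
logging.getLogger("lm_eval").setLevel(logging.WARNING)
```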