Commit bf11ac93 authored by Baber

Merge branch 'main' into llama

parents 83b1c564 ade01428
dataset_path: evalitahf/word_in_context
dataset_name: default
output_type: multiple_choice
test_split: test
fewshot_split: dev
validation_split: dev
doc_to_target: label # 0: No, 1: Si
doc_to_choice: ["No", "Sì"]
metric_list:
  - metric: f1
    aggregation: f1
    higher_is_better: true
metadata:
  version: 1.0
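For readers unfamiliar with the harness config format, here is a minimal sketch (not harness code) of how `doc_to_target` and `doc_to_choice` above turn a document into a scored multiple-choice instance; the sentence fields are hypothetical placeholders for the actual dataset columns, only `label` comes from the YAML.

```python
# Minimal sketch of the mapping defined by the YAML above.
doc = {"sentence1": "...", "sentence2": "...", "label": 1}

choices = ["No", "Sì"]        # doc_to_choice
target_index = doc["label"]   # doc_to_target (0: No, 1: Sì)
print(choices[target_index])  # -> "Sì" is the gold continuation to be scored
```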
import torch
from sklearn.metrics import f1_score, precision_score, recall_score
inference_decorator = (
torch.inference_mode if torch.__version__ >= "2.0.0" else torch.no_grad
)
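A quick illustration of how `inference_decorator` can be used: it resolves to `torch.inference_mode` on torch >= 2.0 and to `torch.no_grad` otherwise, and either one can decorate a function. The decorated function below is hypothetical.

```python
t = torch.ones(3, requires_grad=True)

@inference_decorator()
def double(x):
    # no autograd graph is recorded inside this call
    return x * 2

print(double(t).requires_grad)  # -> False under either decorator
```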
def _aggreg_ls(predictions):
"""
Custom aggregation to compute corpus-level metrics for the lexical substitution task.
predictions is a list of tuples (prec, has_answ, has_annotation):
prec is the precision numerator, before dividing by |A|
has_answ is 0 if the model did not produce any answer
has_annotation is 0 if the gold answer is empty: no synonyms from annotators
"""
# get |A| and |T| to compute the final precision and recall
A = sum([p[1] for p in predictions])
T = sum([p[2] for p in predictions])
# compute the final precision and recall (divide by 1 when |A| or |T| is 0 to avoid division by zero)
if A == 0:
prec = sum([p[0] for p in predictions]) / 1
else:
prec = sum([p[0] for p in predictions]) / A
if T == 0:
rec = sum([p[0] for p in predictions]) / 1
else:
rec = sum([p[0] for p in predictions]) / T
# compute the final F1 score
f1 = 0
if prec + rec != 0:
f1 = (2 * prec * rec) / (prec + rec)
return f1
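A small worked example with hypothetical per-item tuples (assuming `_aggreg_ls` is importable from this module), showing how the corpus-level precision, recall, and F1 fall out of the per-item results:

```python
# (prec_i, has_answer, has_annotation) per item: two answered items, one unanswered
items = [(1.0, 1, 1), (0.5, 1, 1), (0.0, 0, 1)]
print(_aggreg_ls(items))
# |A| = 2, |T| = 3 -> precision = 1.5 / 2 = 0.75, recall = 1.5 / 3 = 0.5, F1 = 0.6
```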
def _aggreg_sa_v2(predictions):
"""
This aggregation treats the sentiment analysis task as a multiple-choice one with four classes.
The F1 score is computed as the average of the per-class F1 scores, weighted by the number of samples.
See sklearn.metrics.f1_score for more details.
"""
predictions, references = zip(*predictions)
f1 = f1_score(references, predictions, average="weighted")
return f1
def _aggreg_sa(predictions):
"""
Custom aggregation function for the sentiment analysis task.
The original task computes the F1 score for each class and then averages them.
Since the prompt casts the task as a multiple-choice one, we need to aggregate the results in a different way.
"""
# split the predictions and references in two lists (pred is a tuple)
predictions, references = zip(*predictions)
"""
Class 0: positivo -> 'opos': 1, 'oneg': 0
Class 1: negativo -> 'opos': 0, 'oneg': 1
etc.
"""
def _map_to_original_labels(x):
"""
Return two separate list of labels for opos and oneg
x is a list of integers
"""
opos = []
oneg = []
for i in x:
if i == 0:
# positive
opos.append(1)
oneg.append(0)
elif i == 1:
# negative
opos.append(0)
oneg.append(1)
elif i == 2:
# neutral
opos.append(0)
oneg.append(0)
elif i == 3:
# mixed
opos.append(1)
oneg.append(1)
else:
pass
return opos, oneg
pred_opos, pred_oneg = _map_to_original_labels(predictions)
ref_opos, ref_oneg = _map_to_original_labels(references)
opos_f1 = f1_score(ref_opos, pred_opos, average=None)
opos_f1_c0 = opos_f1[0]
if len(opos_f1) > 1:
opos_f1_c1 = opos_f1[1]
else:
opos_f1_c1 = 0
# oneg class (precision and recall are computed here but only the F1 values are used below)
oneg_prec_c0, oneg_prec_c1 = precision_score(
ref_oneg, pred_oneg, labels=[0, 1], average=None
)
oneg_rec_c0, oneg_rec_c1 = recall_score(
ref_oneg, pred_oneg, labels=[0, 1], average=None
)
oneg_f1 = f1_score(ref_oneg, pred_oneg, average=None)
oneg_f1_c0 = oneg_f1[0]
if len(oneg_f1) > 1:
oneg_f1_c1 = oneg_f1[1]
else:
oneg_f1_c1 = 0
# average f1 score for each class (opos and oneg)
f1_score_opos = (opos_f1_c0 + opos_f1_c1) / 2
f1_score_oneg = (oneg_f1_c0 + oneg_f1_c1) / 2
# average f1 score for the two classes
f1_final = (f1_score_opos + f1_score_oneg) / 2
return f1_final
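A toy example contrasting the two aggregations, using hypothetical (prediction, reference) pairs with the class indices 0: positivo, 1: negativo, 2: neutrale, 3: misto, and assuming both functions are importable from this module:

```python
pairs = [(0, 0), (1, 1), (2, 3), (3, 3)]  # (prediction, reference)
print(_aggreg_sa_v2(pairs))  # weighted F1 over the four classes, ≈ 0.833
print(_aggreg_sa(pairs))     # average of per-class F1 over the opos/oneg label sets, ≈ 0.733
```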
def _aggreg_ner(predictions):
pred, ref = zip(*predictions)
# concat all the predictions and references
all_pred = []
for p in pred:
all_pred.extend(p)
all_ref = []
for r in ref:
all_ref.extend(r)
# compute the F1 score
f1 = f1_score(all_ref, all_pred, average=None)
if len(f1) > 1:
f1_sum = sum(f1[:-1]) / (len(f1) - 1)
else:
f1_sum = f1[0]
return f1_sum
def _aggreg_rel(predictions):
pred, ref = zip(*predictions)
# concat all the predictions and references
all_pred = []
for p in pred:
all_pred.extend(p)
all_ref = []
for r in ref:
all_ref.extend(r)
# compute the F1 score
f1 = f1_score(all_ref, all_pred, average="macro")
return f1
# ------------------------ DOCUMENT DATING ---------------------------
def _aggreg_dd(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="macro")
return fscore
import logging
from evaluate import load
from sklearn.metrics import f1_score
eval_logger = logging.getLogger("lm-eval")
# ---------------------- SENTIMENT ANALYSIS ----------------------
def sa_doc_to_target(x):
"""
Function to extract the target from the dataset for sentiment analysis
"""
opos = x["opos"]
oneg = x["oneg"]
# the returned index matches the choices in sa_doc_to_choice
if opos == "1" and oneg == "0":
return 0
elif opos == "0" and oneg == "1":
return 1
elif opos == "0" and oneg == "0":
return 2
elif opos == "1" and oneg == "1":
return 3
else:
pass
def sa_doc_to_target_v2(x):
"""
Function to extract the target from the dataset for sentiment analysis
"""
opos = x["opos"]
oneg = x["oneg"]
# the returned index matches the choices in sa_doc_to_choice
if opos == "1" and oneg == "0":
return 0
elif opos == "0" and oneg == "1":
return 1
elif opos == "0" and oneg == "0":
return 2
elif opos == "1" and oneg == "1":
return 3
else:
pass
def sa_doc_to_choice(x):
"""
Function to return the choices from the dataset for sentiment analysis
"""
return ["Positivo", "Negativo", "Neutrale", "Misto"]
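A hypothetical document showing how the string-valued `opos`/`oneg` fields map to a choice index:

```python
doc = {"opos": "1", "oneg": "0"}
idx = sa_doc_to_target(doc)        # -> 0
print(sa_doc_to_choice(doc)[idx])  # -> "Positivo"
```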
# ---------------------- LEXICAL SUBSTITUTION ----------------------
NO_SYN_STRING = "&&NOSYN&&"
def _ls_gold_to_target(x):
"""
Generate the gold target string for the lexical substitution task
"""
# all_answers = [(i["word"], i["count"]) for i in x["answers"]]
if len(x["answers"]) == 0:
return NO_SYN_STRING
ans_str = ""
for i in x["answers"]:
ans_str += i["word"] + "$$" + str(i["count"]) + "::"
if len(ans_str) != 0 and ans_str[-2] == ":":
ans_str = ans_str[:-2]
# print(ans_str)
return ans_str
def ls_doc_to_target(x):
"""
Generate the target for the lexical substitution task
"""
if len(x["answers"]) == 0:
return NO_SYN_STRING
ans_str = ""
for i in x["answers"]:
ans_str += i["word"] + ", "
if len(ans_str) != 0 and ans_str[-2] == ",":
ans_str = ans_str[:-2]
return ans_str
def _ls_split_gold(x):
"""
Split the gold string into a list of tuples
"""
if x == NO_SYN_STRING:
return [], []
answers = x.split("::")
words = []
freqs = []
if len(answers) != 0:
for a in answers:
if "$$" in a:
word, count = a.split("$$")
words.append(word)
try:
freqs.append(int(count))
except ValueError:
freqs.append(0)
return words, freqs
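A round-trip example on a hypothetical document (two annotated synonyms with their counts), assuming the helpers above are importable:

```python
doc = {"answers": [{"word": "felice", "count": 3}, {"word": "lieto", "count": 1}]}
gold = _ls_gold_to_target(doc)   # -> "felice$$3::lieto$$1"
print(_ls_split_gold(gold))      # -> (["felice", "lieto"], [3, 1])
print(ls_doc_to_target(doc))     # -> "felice, lieto"
```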
def ls_process_results(doc, results):
"""
Process the results of the evaluation for the lexical substitution task
look at coqa for another example
"""
gold_to_target = _ls_gold_to_target(doc)
words, freqs = _ls_split_gold(gold_to_target)
prec = 0
# Considering a maximum of the first 10 synonyms
results = split_text_with_regex(results[0], LS_SPLIT_REGEX)
results = results[: min(10, len(results))]
# Remove non-alphabetic characters from the word at the end of the list
if results: # Check if results is not empty
results[-1] = "".join(char for char in results[-1] if char.isalpha())
has_answ = 0 if len(results) == 0 else 1 # so we can compute |A|
has_annotation = 0 if len(words) == 0 else 1 # so we can compute |T|
matching_res = [] # for debugging
for r in results:
if r in words:
# get frequency of the synonyms from annotators
idx = words.index(r.strip())
prec += freqs[idx]
matching_res.append(r)
# In the case of the OOT (out of ten) subtask, this normalization should not be applied
# ai = len(results) if len(results) != 0 else 1
# prec = prec / ai
Hi = sum(freqs)
if Hi != 0:
prec = prec / Hi
else:
eval_logger.debug("H_i is 0")
return {"f1": (prec, has_answ, has_annotation)}
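Continuing the hypothetical example above, this is how a generated comma-separated answer is scored against that gold:

```python
doc = {"answers": [{"word": "felice", "count": 3}, {"word": "lieto", "count": 1}]}
print(ls_process_results(doc, ["felice, contento"]))
# "felice" matches with count 3, H_i = 3 + 1 = 4 -> {"f1": (0.75, 1, 1)}
```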
# ---------------------- NER ----------------------
NO_ENT_STRING = "&&NOENT&&"
NER_ENTITY_SEPARATOR = ","
NER_TYPE_SEPARATOR = "$"
NER_MAPPING_V2 = {"PER": 0, "LOC": 1, "ORG": 2, NO_ENT_STRING: 3, "O": 4}
NER_MAPPING = {"PER": 0, "LOC": 1, "ORG": 2, "O": 3}
def _ner_gold_to_target(x: list) -> list:
"""
Convert the gold entities to the target format according to the NER_MAPPING
"""
res = [NER_MAPPING[e["type"]] for e in x]
return res
def _ner_gold_to_target_v2(x: list) -> list:
"""
Convert the gold entities to the target format according to the NER_MAPPING
"""
res = [NER_MAPPING[e["type"]] for e in x]
return res
def ner_doc_to_target(doc):
ents = doc["entities"]
targ_str = ""
# target format: EntityText$Type,EntityText$Type
if ents == []:
return NO_ENT_STRING
else:
for e in ents:
targ_str += (
e["entity_text"] + NER_TYPE_SEPARATOR + e["type"] + NER_ENTITY_SEPARATOR
)
return targ_str[:-1]
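A hypothetical document illustrating the target string format ("$" between entity text and type, "," between entities):

```python
doc = {"entities": [{"entity_text": "Mario Rossi", "type": "PER"},
                    {"entity_text": "Roma", "type": "LOC"}]}
print(ner_doc_to_target(doc))  # -> "Mario Rossi$PER,Roma$LOC"
```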
def ner_process_results(doc, results):
"""
Process the results of the Named Entity Recognition task
"""
# each document has a list of entities with the following format:
# [{"entity_text": "string", "type": "string"}]
gold = doc["entities"]
raw_results = results[0]
results = _ner_process_raw_output(raw_results)
gold_labels = _ner_gold_to_target(gold)
res_labels = [0] * len(gold_labels)
matched_gold_idx = []
if len(results) > len(gold):
for r in results:
r_text = r[0]
r_type = r[1]
for i in range(len(gold)):
if r_text == gold[i]["entity_text"] and r_type == gold[i]["type"]:
res_labels[i] = NER_MAPPING[r_type]
matched_gold_idx.append(i)
# Since we have more results than gold, we artificially set to false positive the remaining labels
# extend gold label list
for i in range(len(results) - len(gold)):
gold_labels.append(3)
res_labels.append(2)
elif len(results) == 0 and len(gold) == 0:
res_labels = [3]
gold_labels = res_labels
else: # len(results) <= len(gold)
for r in results:
r_text = r[0]
r_type = r[1]
for i in range(len(gold)):
if r_text == gold[i]["entity_text"] and r_type == gold[i]["type"]:
res_labels[i] = NER_MAPPING[r_type]
matched_gold_idx.append(i)
# gold entities that were not matched by any prediction are scored as the "O" class (3)
for i in range(len(gold_labels)):
if i in matched_gold_idx:
continue
res_labels[i] = 3
assert len(gold_labels) == len(res_labels)
return {"f1": (res_labels, gold_labels)}
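A minimal worked case (hypothetical document and model output) where the single prediction matches the single gold entity:

```python
doc = {"entities": [{"entity_text": "Roma", "type": "LOC"}]}
print(ner_process_results(doc, ["Roma$LOC"]))
# -> {"f1": ([1], [1])}: LOC maps to class 1 and the prediction matches the gold entity
```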
def ner_process_results_v2(doc, results):
"""
Process the results of the Named Entity Recognition task
This version considers and score explicitly when the model responds that there are no entities
"""
# each document has a list of entities with the following format:
# [{"entity_text": "string", "type": "string"}]
gold = doc["entities"]
raw_results = results[0]
results = _ner_process_raw_output_v2(raw_results)
# eval_logger.debug(f"results {results}")
# eval_logger.debug(f"gold {gold}")
gold_labels = _ner_gold_to_target_v2(gold)
res_labels = [0] * len(gold_labels)
matched_gold_idx = []
if len(results) > len(gold):
for r in results:
# print(r)
r_text = r[0]
r_type = r[1]
for i in range(len(gold)):
if r_text == gold[i]["entity_text"] and r_type == gold[i]["type"]:
res_labels[i] = NER_MAPPING[r_type]
matched_gold_idx.append(i)
# Since we have more results than gold, we artificially set to false positive the remaining labels
# extend gold label list
for i in range(len(results) - len(gold)):
# gold_labels.append(3)
# res_labels.append(2)
gold_labels.append(4)
res_labels.append(3)
elif len(results) == 0 and len(gold) == 0:
# res_labels = [random.choice([0, 1, 2, 3])]
res_labels = [3]
gold_labels = res_labels
elif len(results) == 1 and results[0] == NO_ENT_STRING:
# res_labels = [3]
res_labels = [4]
gold_labels = res_labels
else: # len(results) <= len(gold)
for r in results:
r_text = r[0]
r_type = r[1]
for i in range(len(gold)):
if r_text == gold[i]["entity_text"] and r_type == gold[i]["type"]:
res_labels[i] = NER_MAPPING[r_type]
matched_gold_idx.append(i)
# gold entities that were not matched by any prediction are scored as the "O" class (4 in NER_MAPPING_V2)
for i in range(len(gold_labels)):
if i in matched_gold_idx:
continue
res_labels[i] = 4
assert len(gold_labels) == len(res_labels)
return {"f1": (res_labels, gold_labels)}
def _ner_process_raw_output(llm_result: str) -> list[tuple]:
if NO_ENT_STRING in llm_result:
return []
if llm_result == "":
return ["WRONG"]
tmp_results = llm_result.split(NER_ENTITY_SEPARATOR)
results = []
for res in tmp_results:
r = res.strip()
# split on type separator
r_text = ""
r_type = ""
r_splitted = r.split(NER_TYPE_SEPARATOR)
if len(r_splitted) < 2:
r_text = r_splitted[0]
r_type = ""
else:
r_text = r_splitted[0]
r_type = r_splitted[1]
if r_text != "":
results.append((r_text, r_type.upper()))
return results
def _ner_process_raw_output_v2(llm_result: str) -> list[tuple]:
if NO_ENT_STRING in llm_result:
return [NO_ENT_STRING]
if llm_result == "":
return ["WRONG"]
tmp_results = llm_result.split(NER_ENTITY_SEPARATOR)
results = []
for res in tmp_results:
r = res.strip()
# split on type separator
r_text = ""
r_type = ""
r_splitted = r.split(NER_TYPE_SEPARATOR)
if len(r_splitted) < 2:
r_text = r_splitted[0]
r_type = ""
else:
r_text = r_splitted[0]
r_type = r_splitted[1]
if r_text != "":
results.append((r_text, r_type.upper()))
return results
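Two hypothetical raw outputs showing how the parser splits entities and how the v2 parser preserves an explicit no-entity answer:

```python
print(_ner_process_raw_output("Mario Rossi$per, Roma$loc"))
# -> [("Mario Rossi", "PER"), ("Roma", "LOC")]
print(_ner_process_raw_output_v2("&&NOENT&&"))
# -> ["&&NOENT&&"], i.e. the model explicitly answered that there are no entities
```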
# ---------------------- RELATION EXTRACTION ----------------------
def _rel_process_raw_output(llm_result: str) -> list[str]:
if NO_REL_STRING in llm_result:
return []
if llm_result == "":
return ["WRONG"]
tmp_results = llm_result.split(INTER_REL_SEPARATOR)
relations = []
for res in tmp_results:
r_text1 = ""
r_text2 = ""
r_splitted = res.split(INTRA_REL_SEPARATOR)
if len(r_splitted) < 2:
r_text1 = r_splitted[0].strip()
r_text2 = ""
else:
r_text1 = r_splitted[0].strip()
r_text2 = r_splitted[1].strip()
relations.append((r_text1, r_text2))
assert len(relations) == len(tmp_results)
return relations
INTER_REL_SEPARATOR = "%"
INTRA_REL_SEPARATOR = "$"
NO_REL_STRING = "&&NOREL&&"
def re_doc_to_target(doc):
ents = doc["relations"]
targ_str = ""
# target format: Entity$Type%Entity$Type
if ents == []:
return NO_ENT_STRING
else:
for e in ents:
targ_str += e[0] + INTRA_REL_SEPARATOR + e[1] + INTER_REL_SEPARATOR
return targ_str[:-1]
def _rel_gold_to_target(x: list) -> list:
if x == []:
return [0]
else:
return [1] * len(x)
def rel_doc_to_target(doc):
rel = doc["relations"]
targ_str = ""
# target format: measure1$result1%measure2$result2
if rel == []:
return NO_REL_STRING
else:
for r in rel:
targ_str += r[0] + "$" + r[1] + "%"
return targ_str[:-1]
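A hypothetical document showing the relation target format ("$" inside a pair, "%" between pairs):

```python
doc = {"relations": [["pressione arteriosa", "120/80"], ["febbre", "38.5"]]}
print(rel_doc_to_target(doc))  # -> "pressione arteriosa$120/80%febbre$38.5"
```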
def _extract_relations(results):
relations = []
for r in results:
r_text1 = ""
r_text2 = ""
r_splitted = r.split(INTRA_REL_SEPARATOR)
if len(r_splitted) < 2:
r_text1 = r_splitted[0]
r_text2 = ""
else:
r_text1 = r_splitted[0]
r_text2 = r_splitted[1]
relations.append((r_text1, r_text2))
assert len(relations) == len(results)
return relations
def rel_process_results_v3(doc, results):
"""
Process the results of the Relation extraction task not considering the order of the relation extracted
"""
# each document has a list of relation with the following format:
# [[text1, text2], [text3, text4]]
gold = doc["relations"]
raw_results = results[0]
has_results = 0 if NO_REL_STRING in raw_results else 1
has_gold = 1 if gold != [] else 0
res_labels = []
gold_labels = []
if has_results == 0 and has_gold:
# False negative
gold_labels = _rel_gold_to_target(gold)
res_labels = [0] * len(gold_labels)
elif has_results == 0 and has_gold == 0:
# True negative
gold_labels = _rel_gold_to_target(gold)
res_labels = gold_labels
elif has_results and has_gold == 0:
# False positive
gold_labels = _rel_gold_to_target(gold)
res_labels = [1] * len(gold_labels)
else:
results = _rel_process_raw_output(raw_results)
# results = raw_results.split(INTER_REL_SEPARATOR)
gold_labels = _rel_gold_to_target(gold)
res_labels = [0] * len(gold_labels)
assert len(gold) > 0
for i in range(len(gold)):
for j in range(len(results)):
r_text1 = results[j][0]
r_text2 = results[j][1]
if r_text1 == gold[i][0] and r_text2 == gold[i][1]: # list of lists
res_labels[i] = 1
results[j] = ("DELETED", "DELETED")
elif r_text1 == "DELETED" and r_text2 == "DELETED":
continue
else:
pass
# if there are more predictions than gold, we set the remaining predictions to false positive
if len(results) - len(gold) > 0:
for i in range(len(results) - len(gold)):
if results[i] == ("DELETED", "DELETED"):
continue
res_labels.append(1)
gold_labels.append(0)
assert len(gold_labels) == len(res_labels)
return {"f1": (res_labels, gold_labels)}
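A minimal worked case (hypothetical document and model output) in which the single predicted relation matches the single gold relation:

```python
doc = {"relations": [["pressione arteriosa", "120/80"]]}
print(rel_process_results_v3(doc, ["pressione arteriosa$120/80"]))
# -> {"f1": ([1], [1])}
```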
LS_SPLIT_REGEX = r"[^,]+"
def split_text_with_regex(text, pattern):
"""
pattern: str - a regex pattern to match the text
text: str - the text to split
"""
import re
# Get text with model-generated words for comparison with the gold standard
text = text.split("\n")[0]
# Find all matches for the pattern
matches = re.findall(pattern, text)
# Split each matched segment further if it contains a comma and is quoted
result = []
for match in matches:
if match.startswith('"') and match.endswith('"'):
# Remove the quotes and split inside the quoted string
inner_matches = re.findall(r"[^,]+", match[1:-1])
result.extend(inner_matches)
else:
result.append(match)
# Strip leading and trailing whitespaces from each element
result = [element.strip().replace('"', "") for element in result]
return result
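An example of the splitting behaviour on a hypothetical generation: only the first line is kept, commas delimit candidates, and stray quotes are stripped:

```python
print(split_text_with_regex('felice, "contento, lieto", sereno\nextra text', LS_SPLIT_REGEX))
# -> ["felice", "contento", "lieto", "sereno"]
```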
# ---------------------- SUMMARIZATION ----------------------
def rouge1_score(references, predictions, **kwargs):
"""
Suboptimal way of computing ROUGE because of the following issue:
https://github.com/EleutherAI/lm-evaluation-harness/issues/1302
"""
rouge = load("rouge")
return rouge.compute(predictions=predictions, references=references, **kwargs)[
"rouge1"
]
def process_results_sum(doc, results):
"""
Process the results of the Evalita summarization task
"""
ref = doc["summary"] if "summary" in doc.keys() else doc["target"]
rouge_scorer = load("rouge", keep_in_memory=True)
r1score = rouge_scorer.compute(predictions=results, references=[ref])["rouge1"]
return {
"rouge1": r1score,
}
def faq_doc_to_target(x):
if x["correct_answer"] == "A":
return 0
elif x["correct_answer"] == "B":
return 1
elif x["correct_answer"] == "C":
return 2
elif x["correct_answer"] == "D":
return 3
else:
eval_logger.warning(
'WARNING: correct answer not found or not in ["A", "B", "C", "D"]'
)
def ht_doc_to_target(x):
if x["source"] == "ilgiornale":
return 0
elif x["source"] == "repubblica":
return 1
else:
eval_logger.warning(
'WARNING: source not found or not in ["ilgiornale", "repubblica"]'
)
@@ -33,7 +33,9 @@ class FDA(ConfigurableTask):
     def doc_to_target(self, doc):
         return doc["value"]

-    def construct_requests(self, doc, ctx, **kwargs):
+    def construct_requests(
+        self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
+    ):
         """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.
......
@@ -26,7 +26,40 @@ The datasets included in GalicianBench that have been made public in previous pu
### Citation
Paper for GalicianBench coming soon.
```
@inproceedings{baucells-etal-2025-iberobench,
title = "{I}bero{B}ench: A Benchmark for {LLM} Evaluation in {I}berian Languages",
author = "Baucells, Irene and
Aula-Blasco, Javier and
de-Dios-Flores, Iria and
Paniagua Su{\'a}rez, Silvia and
Perez, Naiara and
Salles, Anna and
Sotelo Docio, Susana and
Falc{\~a}o, J{\'u}lia and
Saiz, Jose Javier and
Sepulveda Torres, Robiert and
Barnes, Jeremy and
Gamallo, Pablo and
Gonzalez-Agirre, Aitor and
Rigau, German and
Villegas, Marta",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.699/",
pages = "10491--10519",
}
```
### Groups and Tasks
......
# GroundCocoa
### Paper
Title: `GroundCocoa: A Benchmark for Evaluating Compositional & Conditional Reasoning in Language Models`
Abstract: https://arxiv.org/abs/2404.04237
The rapid progress of large language models (LLMs) has seen them excel and frequently surpass human performance on standard benchmarks. This has enabled many downstream applications, such as LLM agents, to rely on their reasoning to address complex task requirements. However, LLMs are known to unexpectedly falter in simple tasks and under seemingly straightforward circumstances - underscoring the need for better and more diverse evaluation setups to measure their true capabilities. To this end, we choose to study compositional and conditional reasoning, two aspects that are central to human cognition, and introduce GroundCocoa - a lexically diverse benchmark connecting these reasoning skills to the real-world problem of flight booking. Our task involves aligning detailed user preferences with available flight options presented in a multiple-choice format. Results indicate a significant disparity in performance among current state-of-the-art LLMs with even the best performing model, GPT-4 Turbo, not exceeding 67% accuracy despite advanced prompting techniques.
Homepage: `https://osu-nlp-group.github.io/GroundCocoa/`
### Citation
```
@misc{kohli2025groundcocoabenchmarkevaluatingcompositional,
title={GroundCocoa: A Benchmark for Evaluating Compositional & Conditional Reasoning in Language Models},
author={Harsh Kohli and Sachin Kumar and Huan Sun},
year={2025},
eprint={2404.04237},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2404.04237},
}
```
### Groups and Tasks
#### Groups
- Not part of a group yet
#### Tasks
- `groundcocoa`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: groundcocoa
dataset_path: harsh147/GroundCocoa
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{criteria}}"
doc_to_target: gold
doc_to_choice: "choices"
target_delimiter: ""
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
dataset_kwargs:
  trust_remote_code: true
  streaming: true
import datasets
import pandas as pd
from datasets import Dataset
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
cocoa_dataset = [sample for sample in dataset]
processed = []
for doc in cocoa_dataset:
question = "A user has specified certain criteria for booking a flight. Below are five different flight options labeled 'A', 'B', 'C', 'D', and 'E'. Review these options and select the one that best matches the user requirements. Respond with a single option and the phrase 'The answer is Option ' followed by the correct letter - 'A', 'B', 'C', 'D', or 'E'\n\n"
question = question + "User Criteria: " + doc["query"]
question = question + "\n\n Option A: " + str(doc["Option A"]) + "\n"
question = question + "\n Option B: " + str(doc["Option B"]) + "\n"
question = question + "\n Option C: " + str(doc["Option C"]) + "\n"
question = question + "\n Option D: " + str(doc["Option D"]) + "\n"
question = question + "\n Option E: " + str(doc["Option E"]) + "\n"
out_doc = {
"criteria": question,
"choices": [
"The answer is Option A",
"The answer is Option B",
"The answer is Option C",
"The answer is Option D",
"The answer is Option E",
],
"gold": "The answer is Option " + doc["Answer"],
}
processed.append(out_doc)
df = pd.DataFrame(processed)
dataset = Dataset.from_pandas(df)
return dataset
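A usage sketch with a single hypothetical row shaped like the dataset columns referenced above (the Option values are shown as plain strings for brevity), assuming `process_docs` is importable:

```python
import datasets

raw = datasets.Dataset.from_list([{
    "query": "a non-stop flight under 300 USD",
    "Option A": "Price: 250 USD, non-stop",
    "Option B": "Price: 450 USD, 1 stop",
    "Option C": "Price: 320 USD, non-stop",
    "Option D": "Price: 280 USD, 2 stops",
    "Option E": "Price: 500 USD, non-stop",
    "Answer": "A",
}])
doc = process_docs(raw)[0]
print(doc["gold"])          # -> "The answer is Option A"
print(len(doc["choices"]))  # -> 5
```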
# Histoires Morales
### Paper
Title: `Histoires Morales: A French Dataset for Assessing Moral Alignment`
Abstract: `https://arxiv.org/pdf/2501.17117`
⚖ Histoires Morales is the first dataset for evaluating the moral alignment of language models in French. It consists of narratives describing normative and norm-divergent actions taken by individuals to achieve certain intentions in concrete situations, along with their respective consequences.
Each of the 12,000 stories (histoires) follows the same seven-sentence structure as the Moral Stories dataset:
Context:
1. Norm: A guideline for social conduct generally observed by most people in everyday situations.
2. Situation: The setting of the story, introducing participants and describing their environment.
3. Intention: A reasonable goal that one of the story participants (the actor) wants to achieve.
Normative path:
4. Normative action: An action by the actor that fulfills the intention while observing the norm.
5. Normative consequence: A possible effect of the normative action on the actor’s environment.
Norm-divergent path:
6. Divergent action: An action by the actor that fulfills the intention but diverges from the norm.
7. Divergent consequence: A possible effect of the divergent action on the actor’s environment.
Histoires Morales is adapted to French from the widely used Moral Stories dataset.
We translated the Moral Stories dataset and refined these translations through manual annotations.
See paper for more details.
Homepage: `https://huggingface.co/datasets/LabHC/histoires_morales`
### Citation
Coming soon (accepted to NAACL 2025)
### Groups, Tags, and Tasks
#### Groups
* Not part of a group yet
#### Tags
No tags, since there is a single task.
#### Tasks
* `histoires_morales.yaml`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: histoires_morales
dataset_path: LabHC/histoires_morales
output_type: multiple_choice
test_split: train
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{label}}"
doc_to_choice: "choices"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
import datasets
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
def _process_doc(doc):
ctx = (
doc["norm"].capitalize()
+ " "
+ doc["situation"].capitalize()
+ " "
+ doc["intention"].capitalize()
)
choices = [doc["moral_action"], doc["immoral_action"]]
out_doc = {
"query": ctx,
"choices": choices,
"label": 0,
}
return out_doc
return dataset.map(_process_doc)
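A usage sketch with a single hypothetical row following the seven-sentence schema described in the README above, assuming `process_docs` is importable:

```python
import datasets

raw = datasets.Dataset.from_list([{
    "norm": "il est mal de mentir.",
    "situation": "paul a cassé le vase de sa mère.",
    "intention": "paul veut éviter une dispute.",
    "moral_action": "Paul avoue avoir cassé le vase.",
    "immoral_action": "Paul accuse son frère du dégât.",
}])
doc = process_docs(raw)[0]
print(doc["query"])  # capitalized norm + situation + intention
print(doc["label"])  # -> 0: the moral action is always the first choice
```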
include: humaneval.yaml
task: humaneval_plus
dataset_path: evalplus/humanevalplus
@@ -2,7 +2,6 @@ import dataclasses
 from typing import Dict, Optional, Union

 from lm_eval.tasks.ifeval import instructions_registry
-from lm_eval.utils import eval_logger

 @dataclasses.dataclass
......
tag:
- kobest
task: kobest_boolq
dataset_path: skt/kobest_v1
dataset_name: boolq
......
tag:
- kobest
task: kobest_copa
dataset_path: skt/kobest_v1
dataset_name: copa
......
tag:
- kobest
task: kobest_hellaswag
dataset_path: skt/kobest_v1
dataset_name: hellaswag
......
tag:
- kobest
task: kobest_sentineg
dataset_path: skt/kobest_v1
dataset_name: sentineg
......
tag:
- kobest
task: kobest_wic
dataset_path: skt/kobest_v1
dataset_name: wic
......
-dataset_path: lighteval/MATH-Hard
+dataset_path: DigitalLearningGmbH/MATH-lighteval
 process_docs: !function utils.process_docs
 output_type: generate_until
 training_split: train
......
+import logging
 import re
 import signal
 from typing import Dict, List, Optional

 import datasets

-from lm_eval.utils import eval_logger
+
+eval_logger = logging.getLogger(__name__)

 try:
     import sympy
......