Commit bf11ac93 authored by Baber

Merge branch 'main' into llama

parents 83b1c564 ade01428
dataset_path: evalitahf/word_in_context
dataset_name: default
output_type: multiple_choice
test_split: test
fewshot_split: dev
validation_split: dev
doc_to_target: label # 0: No, 1: Si
doc_to_choice: ["No", "Sì"]
metric_list:
  - metric: f1
    aggregation: f1
    higher_is_better: true
metadata:
  version: 1.0
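For readers unfamiliar with the harness config format, here is a minimal sketch (not harness code) of how `doc_to_target` and `doc_to_choice` above turn a document into a scored multiple-choice instance; the sentence fields are hypothetical placeholders for the actual dataset columns, only `label` comes from the YAML.

```python
# Minimal sketch of the mapping defined by the YAML above.
doc = {"sentence1": "...", "sentence2": "...", "label": 1}

choices = ["No", "Sì"]        # doc_to_choice
target_index = doc["label"]   # doc_to_target (0: No, 1: Sì)
print(choices[target_index])  # -> "Sì" is the gold continuation to be scored
```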
import torch
from sklearn.metrics import f1_score, precision_score, recall_score
inference_decorator = (
torch.inference_mode if torch.__version__ >= "2.0.0" else torch.no_grad
)
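A quick illustration of how `inference_decorator` can be used: it resolves to `torch.inference_mode` on torch >= 2.0 and to `torch.no_grad` otherwise, and either one can decorate a function. The decorated function below is hypothetical.

```python
t = torch.ones(3, requires_grad=True)

@inference_decorator()
def double(x):
    # no autograd graph is recorded inside this call
    return x * 2

print(double(t).requires_grad)  # -> False under either decorator
```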
def _aggreg_ls(predictions):
"""
Custom aggregation to compute corpus-level metrics for the lexical substitution task.
predictions is a list of tuples (prec, has_answ, has_annotation):
prec is the precision numerator, before dividing by |A|
has_answ is 0 if the model did not produce any answer
has_annotation is 0 if the gold answer is empty: no synonyms from annotators
"""
# get |A| and |T| to compute the final precision and recall
A = sum([p[1] for p in predictions])
T = sum([p[2] for p in predictions])
# compute the final precision and recall (divide by 1 when |A| or |T| is 0 to avoid division by zero)
if A == 0:
prec = sum([p[0] for p in predictions]) / 1
else:
prec = sum([p[0] for p in predictions]) / A
if T == 0:
rec = sum([p[0] for p in predictions]) / 1
else:
rec = sum([p[0] for p in predictions]) / T
# compute the final F1 score
f1 = 0
if prec + rec != 0:
f1 = (2 * prec * rec) / (prec + rec)
return f1
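A small worked example with hypothetical per-item tuples (assuming `_aggreg_ls` is importable from this module), showing how the corpus-level precision, recall, and F1 fall out of the per-item results:

```python
# (prec_i, has_answer, has_annotation) per item: two answered items, one unanswered
items = [(1.0, 1, 1), (0.5, 1, 1), (0.0, 0, 1)]
print(_aggreg_ls(items))
# |A| = 2, |T| = 3 -> precision = 1.5 / 2 = 0.75, recall = 1.5 / 3 = 0.5, F1 = 0.6
```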
def _aggreg_sa_v2(predictions):
"""
This aggregation treats the sentiment analysis task as a multiple-choice one with four classes.
The F1 score is computed as the average of the per-class F1 scores, weighted by the number of samples.
See sklearn.metrics.f1_score for more details.
"""
predictions, references = zip(*predictions)
f1 = f1_score(references, predictions, average="weighted")
return f1
def _aggreg_sa(predictions):
"""
Custom aggregation function for the sentiment analysis task.
The original task computes the F1 score for each class and then averages them.
Since the prompt casts the task as a multiple-choice one, we need to aggregate the results in a different way.
"""
# split the predictions and references in two lists (pred is a tuple)
predictions, references = zip(*predictions)
"""
Class 0: positivo -> 'opos': 1, 'oneg': 0
Class 1: negativo -> 'opos': 0, 'oneg': 1
etc.
"""
def _map_to_original_labels(x):
"""
Return two separate list of labels for opos and oneg
x is a list of integers
"""
opos = []
oneg = []
for i in x:
if i == 0:
# positive
opos.append(1)
oneg.append(0)
elif i == 1:
# negative
opos.append(0)
oneg.append(1)
elif i == 2:
# neutral
opos.append(0)
oneg.append(0)
elif i == 3:
# mixed
opos.append(1)
oneg.append(1)
else:
pass
return opos, oneg
pred_opos, pred_oneg = _map_to_original_labels(predictions)
ref_opos, ref_oneg = _map_to_original_labels(references)
opos_f1 = f1_score(ref_opos, pred_opos, average=None)
opos_f1_c0 = opos_f1[0]
if len(opos_f1) > 1:
opos_f1_c1 = opos_f1[1]
else:
opos_f1_c1 = 0
# oneg class (precision and recall are computed here but only the F1 values are used below)
oneg_prec_c0, oneg_prec_c1 = precision_score(
ref_oneg, pred_oneg, labels=[0, 1], average=None
)
oneg_rec_c0, oneg_rec_c1 = recall_score(
ref_oneg, pred_oneg, labels=[0, 1], average=None
)
oneg_f1 = f1_score(ref_oneg, pred_oneg, average=None)
oneg_f1_c0 = oneg_f1[0]
if len(oneg_f1) > 1:
oneg_f1_c1 = oneg_f1[1]
else:
oneg_f1_c1 = 0
# average f1 score for each class (opos and oneg)
f1_score_opos = (opos_f1_c0 + opos_f1_c1) / 2
f1_score_oneg = (oneg_f1_c0 + oneg_f1_c1) / 2
# average f1 score for the two classes
f1_final = (f1_score_opos + f1_score_oneg) / 2
return f1_final
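A toy example contrasting the two aggregations, using hypothetical (prediction, reference) pairs with the class indices 0: positivo, 1: negativo, 2: neutrale, 3: misto, and assuming both functions are importable from this module:

```python
pairs = [(0, 0), (1, 1), (2, 3), (3, 3)]  # (prediction, reference)
print(_aggreg_sa_v2(pairs))  # weighted F1 over the four classes, ≈ 0.833
print(_aggreg_sa(pairs))     # average of per-class F1 over the opos/oneg label sets, ≈ 0.733
```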
def _aggreg_ner(predictions):
pred, ref = zip(*predictions)
# concat all the predictions and references
all_pred = []
for p in pred:
all_pred.extend(p)
all_ref = []
for r in ref:
all_ref.extend(r)
# compute the F1 score
f1 = f1_score(all_ref, all_pred, average=None)
if len(f1) > 1:
f1_sum = sum(f1[:-1]) / (len(f1) - 1)
else:
f1_sum = f1[0]
return f1_sum
def _aggreg_rel(predictions):
pred, ref = zip(*predictions)
# concat all the predictions and references
all_pred = []
for p in pred:
all_pred.extend(p)
all_ref = []
for r in ref:
all_ref.extend(r)
# compute the F1 score
f1 = f1_score(all_ref, all_pred, average="macro")
return f1
# ------------------------ DOCUMENT DATING ---------------------------
def _aggreg_dd(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="macro")
return fscore
import logging
from evaluate import load
from sklearn.metrics import f1_score
eval_logger = logging.getLogger("lm-eval")
# ---------------------- SENTIMENT ANALYSIS ----------------------
def sa_doc_to_target(x):
"""
Function to extract the target from the dataset for sentiment analysis
"""
opos = x["opos"]
oneg = x["oneg"]
# the returned index matches the choices in sa_doc_to_choice
if opos == "1" and oneg == "0":
return 0
elif opos == "0" and oneg == "1":
return 1
elif opos == "0" and oneg == "0":
return 2
elif opos == "1" and oneg == "1":
return 3
else:
pass
def sa_doc_to_target_v2(x):
"""
Function to extract the target from the dataset for sentiment analysis
"""
opos = x["opos"]
oneg = x["oneg"]
# the returned index matches the choices in sa_doc_to_choice
if opos == "1" and oneg == "0":
return 0
elif opos == "0" and oneg == "1":
return 1
elif opos == "0" and oneg == "0":
return 2
elif opos == "1" and oneg == "1":
return 3
else:
pass
def sa_doc_to_choice(x):
"""
Function to return the choices from the dataset for sentiment analysis
"""
return ["Positivo", "Negativo", "Neutrale", "Misto"]
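A hypothetical document showing how the string-valued `opos`/`oneg` fields map to a choice index:

```python
doc = {"opos": "1", "oneg": "0"}
idx = sa_doc_to_target(doc)        # -> 0
print(sa_doc_to_choice(doc)[idx])  # -> "Positivo"
```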
# ---------------------- LEXICAL SUBSTITUTION ----------------------
NO_SYN_STRING = "&&NOSYN&&"
def _ls_gold_to_target(x):
"""
Generate the gold target string for the lexical substitution task
"""
# all_answers = [(i["word"], i["count"]) for i in x["answers"]]
if len(x["answers"]) == 0:
return NO_SYN_STRING
ans_str = ""
for i in x["answers"]:
ans_str += i["word"] + "$$" + str(i["count"]) + "::"
if len(ans_str) != 0 and ans_str[-2] == ":":
ans_str = ans_str[:-2]
# print(ans_str)
return ans_str
def ls_doc_to_target(x):
"""
Generate the target for the lexical substitution task
"""
if len(x["answers"]) == 0:
return NO_SYN_STRING
ans_str = ""
for i in x["answers"]:
ans_str += i["word"] + ", "
if len(ans_str) != 0 and ans_str[-2] == ",":
ans_str = ans_str[:-2]
return ans_str
def _ls_split_gold(x):
"""
Split the gold string into a list of tuples
"""
if x == NO_SYN_STRING:
return [], []
answers = x.split("::")
words = []
freqs = []
if len(answers) != 0:
for a in answers:
if "$$" in a:
word, count = a.split("$$")
words.append(word)
try:
freqs.append(int(count))
except ValueError:
freqs.append(0)
return words, freqs
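A round-trip example on a hypothetical document (two annotated synonyms with their counts), assuming the helpers above are importable:

```python
doc = {"answers": [{"word": "felice", "count": 3}, {"word": "lieto", "count": 1}]}
gold = _ls_gold_to_target(doc)   # -> "felice$$3::lieto$$1"
print(_ls_split_gold(gold))      # -> (["felice", "lieto"], [3, 1])
print(ls_doc_to_target(doc))     # -> "felice, lieto"
```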
def ls_process_results(doc, results):
"""
Process the results of the evaluation for the lexical substitution task
look at coqa for another example
"""
gold_to_target = _ls_gold_to_target(doc)
words, freqs = _ls_split_gold(gold_to_target)
prec = 0
# Considering a maximum of the first 10 synonyms
results = split_text_with_regex(results[0], LS_SPLIT_REGEX)
results = results[: min(10, len(results))]
# Remove non-alphabetic characters from the word at the end of the list
if results: # Check if results is not empty
results[-1] = "".join(char for char in results[-1] if char.isalpha())
has_answ = 0 if len(results) == 0 else 1 # so we can compute |A|
has_annotation = 0 if len(words) == 0 else 1 # so we can compute |T|
matching_res = [] # for debugging
for r in results:
if r in words:
# get frequency of the synonyms from annotators
idx = words.index(r.strip())
prec += freqs[idx]
matching_res.append(r)
# In the case of the OOT (out of ten) subtask, this normalization should not be applied
# ai = len(results) if len(results) != 0 else 1
# prec = prec / ai
Hi = sum(freqs)
if Hi != 0:
prec = prec / Hi
else:
eval_logger.debug("H_i is 0")
return {"f1": (prec, has_answ, has_annotation)}
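Continuing the hypothetical example above, this is how a generated comma-separated answer is scored against that gold:

```python
doc = {"answers": [{"word": "felice", "count": 3}, {"word": "lieto", "count": 1}]}
print(ls_process_results(doc, ["felice, contento"]))
# "felice" matches with count 3, H_i = 3 + 1 = 4 -> {"f1": (0.75, 1, 1)}
```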
# ---------------------- NER ----------------------
NO_ENT_STRING = "&&NOENT&&"
NER_ENTITY_SEPARATOR = ","
NER_TYPE_SEPARATOR = "$"
NER_MAPPING_V2 = {"PER": 0, "LOC": 1, "ORG": 2, NO_ENT_STRING: 3, "O": 4}
NER_MAPPING = {"PER": 0, "LOC": 1, "ORG": 2, "O": 3}
def _ner_gold_to_target(x: list) -> list:
"""
Convert the gold entities to the target format according to the NER_MAPPING
"""
res = [NER_MAPPING[e["type"]] for e in x]
return res
def _ner_gold_to_target_v2(x: list) -> list:
"""
Convert the gold entities to the target format according to the NER_MAPPING
"""
res = [NER_MAPPING[e["type"]] for e in x]
return res
def ner_doc_to_target(doc):
ents = doc["entities"]
targ_str = ""
# target format: EntityText$Type,EntityText$Type
if ents == []:
return NO_ENT_STRING
else:
for e in ents:
targ_str += (
e["entity_text"] + NER_TYPE_SEPARATOR + e["type"] + NER_ENTITY_SEPARATOR
)
return targ_str[:-1]
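A hypothetical document illustrating the target string format ("$" between entity text and type, "," between entities):

```python
doc = {"entities": [{"entity_text": "Mario Rossi", "type": "PER"},
                    {"entity_text": "Roma", "type": "LOC"}]}
print(ner_doc_to_target(doc))  # -> "Mario Rossi$PER,Roma$LOC"
```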
def ner_process_results(doc, results):
"""
Process the results of the Named Entity Recognition task
"""
# each document has a list of entities with the following format:
# [{"entity_text": "string", "type": "string"}]
gold = doc["entities"]
raw_results = results[0]
results = _ner_process_raw_output(raw_results)
gold_labels = _ner_gold_to_target(gold)
res_labels = [0] * len(gold_labels)
matched_gold_idx = []
if len(results) > len(gold):
for r in results:
r_text = r[0]
r_type = r[1]
for i in range(len(gold)):
if r_text == gold[i]["entity_text"] and r_type == gold[i]["type"]:
res_labels[i] = NER_MAPPING[r_type]
matched_gold_idx.append(i)
# Since we have more results than gold, we artificially set to false positive the remaining labels
# extend gold label list
for i in range(len(results) - len(gold)):
gold_labels.append(3)
res_labels.append(2)
elif len(results) == 0 and len(gold) == 0:
res_labels = [3]
gold_labels = res_labels
else: # len(results) <= len(gold)
for r in results:
r_text = r[0]
r_type = r[1]
for i in range(len(gold)):
if r_text == gold[i]["entity_text"] and r_type == gold[i]["type"]:
res_labels[i] = NER_MAPPING[r_type]
matched_gold_idx.append(i)
# gold entities that were not matched by any prediction are scored as the "O" class (3)
for i in range(len(gold_labels)):
if i in matched_gold_idx:
continue
res_labels[i] = 3
assert len(gold_labels) == len(res_labels)
return {"f1": (res_labels, gold_labels)}
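A minimal worked case (hypothetical document and model output) where the single prediction matches the single gold entity:

```python
doc = {"entities": [{"entity_text": "Roma", "type": "LOC"}]}
print(ner_process_results(doc, ["Roma$LOC"]))
# -> {"f1": ([1], [1])}: LOC maps to class 1 and the prediction matches the gold entity
```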
def ner_process_results_v2(doc, results):
"""
Process the results of the Named Entity Recognition task
This version considers and score explicitly when the model responds that there are no entities
"""
# each document has a list of entities with the following format:
# [{"entity_text": "string", "type": "string"}]
gold = doc["entities"]
raw_results = results[0]
results = _ner_process_raw_output_v2(raw_results)
# eval_logger.debug(f"results {results}")
# eval_logger.debug(f"gold {gold}")
gold_labels = _ner_gold_to_target_v2(gold)
res_labels = [0] * len(gold_labels)
matched_gold_idx = []
if len(results) > len(gold):
for r in results:
# print(r)
r_text = r[0]
r_type = r[1]
for i in range(len(gold)):
if r_text == gold[i]["entity_text"] and r_type == gold[i]["type"]:
res_labels[i] = NER_MAPPING[r_type]
matched_gold_idx.append(i)
# Since we have more results than gold, we artificially set to false positive the remaining labels
# extend gold label list
for i in range(len(results) - len(gold)):
# gold_labels.append(3)
# res_labels.append(2)
gold_labels.append(4)
res_labels.append(3)
elif len(results) == 0 and len(gold) == 0:
# res_labels = [random.choice([0, 1, 2, 3])]
res_labels = [3]
gold_labels = res_labels
elif len(results) == 1 and results[0] == NO_ENT_STRING:
# res_labels = [3]
res_labels = [4]
gold_labels = res_labels
else: # len(results) <= len(gold)
for r in results:
r_text = r[0]
r_type = r[1]
for i in range(len(gold)):
if r_text == gold[i]["entity_text"] and r_type == gold[i]["type"]:
res_labels[i] = NER_MAPPING[r_type]
matched_gold_idx.append(i)
# gold entities that were not matched by any prediction are scored as the "O" class (4 in NER_MAPPING_V2)
for i in range(len(gold_labels)):
if i in matched_gold_idx:
continue
res_labels[i] = 4
assert len(gold_labels) == len(res_labels)
return {"f1": (res_labels, gold_labels)}
def _ner_process_raw_output(llm_result: str) -> list[tuple]:
if NO_ENT_STRING in llm_result:
return []
if llm_result == "":
return ["WRONG"]
tmp_results = llm_result.split(NER_ENTITY_SEPARATOR)
results = []
for res in tmp_results:
r = res.strip()
# split on type separator
r_text = ""
r_type = ""
r_splitted = r.split(NER_TYPE_SEPARATOR)
if len(r_splitted) < 2:
r_text = r_splitted[0]
r_type = ""
else:
r_text = r_splitted[0]
r_type = r_splitted[1]
if r_text != "":
results.append((r_text, r_type.upper()))
return results
def _ner_process_raw_output_v2(llm_result: str) -> list[tuple]:
if NO_ENT_STRING in llm_result:
return [NO_ENT_STRING]
if llm_result == "":
return ["WRONG"]
tmp_results = llm_result.split(NER_ENTITY_SEPARATOR)
results = []
for res in tmp_results:
r = res.strip()
# split on type separator
r_text = ""
r_type = ""
r_splitted = r.split(NER_TYPE_SEPARATOR)
if len(r_splitted) < 2:
r_text = r_splitted[0]
r_type = ""
else:
r_text = r_splitted[0]
r_type = r_splitted[1]
if r_text != "":
results.append((r_text, r_type.upper()))
return results
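Two hypothetical raw outputs showing how the parser splits entities and how the v2 parser preserves an explicit no-entity answer:

```python
print(_ner_process_raw_output("Mario Rossi$per, Roma$loc"))
# -> [("Mario Rossi", "PER"), ("Roma", "LOC")]
print(_ner_process_raw_output_v2("&&NOENT&&"))
# -> ["&&NOENT&&"], i.e. the model explicitly answered that there are no entities
```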
# ---------------------- RELATION EXTRACTION ----------------------
def _rel_process_raw_output(llm_result: str) -> list[str]:
if NO_REL_STRING in llm_result:
return []
if llm_result == "":
return ["WRONG"]
tmp_results = llm_result.split(INTER_REL_SEPARATOR)
relations = []
for res in tmp_results:
r_text1 = ""
r_text2 = ""
r_splitted = res.split(INTRA_REL_SEPARATOR)
if len(r_splitted) < 2:
r_text1 = r_splitted[0].strip()
r_text2 = ""
else:
r_text1 = r_splitted[0].strip()
r_text2 = r_splitted[1].strip()
relations.append((r_text1, r_text2))
assert len(relations) == len(tmp_results)
return relations
INTER_REL_SEPARATOR = "%"
INTRA_REL_SEPARATOR = "$"
NO_REL_STRING = "&&NOREL&&"
def re_doc_to_target(doc):
ents = doc["relations"]
targ_str = ""
# target format: Entity$Type%Entity$Type
if ents == []:
return NO_ENT_STRING
else:
for e in ents:
targ_str += e[0] + INTRA_REL_SEPARATOR + e[1] + INTER_REL_SEPARATOR
return targ_str[:-1]
def _rel_gold_to_target(x: list) -> list:
if x == []:
return [0]
else:
return [1] * len(x)
def rel_doc_to_target(doc):
rel = doc["relations"]
targ_str = ""
# target format: measure1$result1%measure2$result2
if rel == []:
return NO_REL_STRING
else:
for r in rel:
targ_str += r[0] + "$" + r[1] + "%"
return targ_str[:-1]
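A hypothetical document showing the relation target format ("$" inside a pair, "%" between pairs):

```python
doc = {"relations": [["pressione arteriosa", "120/80"], ["febbre", "38.5"]]}
print(rel_doc_to_target(doc))  # -> "pressione arteriosa$120/80%febbre$38.5"
```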
def _extract_relations(results):
relations = []
for r in results:
r_text1 = ""
r_text2 = ""
r_splitted = r.split(INTRA_REL_SEPARATOR)
if len(r_splitted) < 2:
r_text1 = r_splitted[0]
r_text2 = ""
else:
r_text1 = r_splitted[0]
r_text2 = r_splitted[1]
relations.append((r_text1, r_text2))
assert len(relations) == len(results)
return relations
def rel_process_results_v3(doc, results):
"""
Process the results of the Relation extraction task not considering the order of the relation extracted
"""
# each document has a list of relation with the following format:
# [[text1, text2], [text3, text4]]
gold = doc["relations"]
raw_results = results[0]
has_results = 0 if NO_REL_STRING in raw_results else 1
has_gold = 1 if gold != [] else 0
res_labels = []
gold_labels = []
if has_results == 0 and has_gold:
# False negative
gold_labels = _rel_gold_to_target(gold)
res_labels = [0] * len(gold_labels)
elif has_results == 0 and has_gold == 0:
# True negative
gold_labels = _rel_gold_to_target(gold)
res_labels = gold_labels
elif has_results and has_gold == 0:
# False positive
gold_labels = _rel_gold_to_target(gold)
res_labels = [1] * len(gold_labels)
else:
results = _rel_process_raw_output(raw_results)
# results = raw_results.split(INTER_REL_SEPARATOR)
gold_labels = _rel_gold_to_target(gold)
res_labels = [0] * len(gold_labels)
assert len(gold) > 0
for i in range(len(gold)):
for j in range(len(results)):
r_text1 = results[j][0]
r_text2 = results[j][1]
if r_text1 == gold[i][0] and r_text2 == gold[i][1]: # list of lists
res_labels[i] = 1
results[j] = ("DELETED", "DELETED")
elif r_text1 == "DELETED" and r_text2 == "DELETED":
continue
else:
pass
# if there are more predictions than gold, we set the remaining predictions to false positive
if len(results) - len(gold) > 0:
for i in range(len(results) - len(gold)):
if results[i] == ("DELETED", "DELETED"):
continue
res_labels.append(1)
gold_labels.append(0)
assert len(gold_labels) == len(res_labels)
return {"f1": (res_labels, gold_labels)}
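A minimal worked case (hypothetical document and model output) in which the single predicted relation matches the single gold relation:

```python
doc = {"relations": [["pressione arteriosa", "120/80"]]}
print(rel_process_results_v3(doc, ["pressione arteriosa$120/80"]))
# -> {"f1": ([1], [1])}
```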
LS_SPLIT_REGEX = r"[^,]+"
def split_text_with_regex(text, pattern):
"""
pattern: str - a regex pattern to match the text
text: str - the text to split
"""
import re
# Get text with model-generated words for comparison with the gold standard
text = text.split("\n")[0]
# Find all matches for the pattern
matches = re.findall(pattern, text)
# Split each matched segment further if it contains a comma and is quoted
result = []
for match in matches:
if match.startswith('"') and match.endswith('"'):
# Remove the quotes and split inside the quoted string
inner_matches = re.findall(r"[^,]+", match[1:-1])
result.extend(inner_matches)
else:
result.append(match)
# Strip leading and trailing whitespaces from each element
result = [element.strip().replace('"', "") for element in result]
return result
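An example of the splitting behaviour on a hypothetical generation: only the first line is kept, commas delimit candidates, and stray quotes are stripped:

```python
print(split_text_with_regex('felice, "contento, lieto", sereno\nextra text', LS_SPLIT_REGEX))
# -> ["felice", "contento", "lieto", "sereno"]
```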
# ---------------------- SUMMARIZATION ----------------------
def rouge1_score(references, predictions, **kwargs):
"""
Suboptimal way of computing ROUGE because of the following issue:
https://github.com/EleutherAI/lm-evaluation-harness/issues/1302
"""
rouge = load("rouge")
return rouge.compute(predictions=predictions, references=references, **kwargs)[
"rouge1"
]
def process_results_sum(doc, results):
"""
Process the results of the Evalita summarization task
"""
ref = doc["summary"] if "summary" in doc.keys() else doc["target"]
rouge_scorer = load("rouge", keep_in_memory=True)
r1score = rouge_scorer.compute(predictions=results, references=[ref])["rouge1"]
return {
"rouge1": r1score,
}
def faq_doc_to_target(x):
if x["correct_answer"] == "A":
return 0
elif x["correct_answer"] == "B":
return 1
elif x["correct_answer"] == "C":
return 2
elif x["correct_answer"] == "D":
return 3
else:
eval_logger.warning(
'WARNING: correct answer not found or not in ["A", "B", "C", "D"]'
)
def ht_doc_to_target(x):
if x["source"] == "ilgiornale":
return 0
elif x["source"] == "repubblica":
return 1
else:
eval_logger.warning(
'WARNING: source not found or not in ["ilgiornale", "repubblica"]'
)
@@ -33,7 +33,9 @@ class FDA(ConfigurableTask):
     def doc_to_target(self, doc):
         return doc["value"]

-    def construct_requests(self, doc, ctx, **kwargs):
+    def construct_requests(
+        self, doc, ctx, chat_template=None, apply_chat_template=False, **kwargs
+    ):
         """Uses RequestFactory to construct Requests and returns an iterable of
         Requests which will be sent to the LM.
......
@@ -26,7 +26,40 @@ The datasets included in GalicianBench that have been made public in previous pu
### Citation
Paper for GalicianBench coming soon.
```
@inproceedings{baucells-etal-2025-iberobench,
title = "{I}bero{B}ench: A Benchmark for {LLM} Evaluation in {I}berian Languages",
author = "Baucells, Irene and
Aula-Blasco, Javier and
de-Dios-Flores, Iria and
Paniagua Su{\'a}rez, Silvia and
Perez, Naiara and
Salles, Anna and
Sotelo Docio, Susana and
Falc{\~a}o, J{\'u}lia and
Saiz, Jose Javier and
Sepulveda Torres, Robiert and
Barnes, Jeremy and
Gamallo, Pablo and
Gonzalez-Agirre, Aitor and
Rigau, German and
Villegas, Marta",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Eugenio, Barbara Di and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.coling-main.699/",
pages = "10491--10519",
}
```
### Groups and Tasks
......
# GroundCocoa
### Paper
Title: `GroundCocoa: A Benchmark for Evaluating Compositional & Conditional Reasoning in Language Models`
Abstract: https://arxiv.org/abs/2404.04237
The rapid progress of large language models (LLMs) has seen them excel and frequently surpass human performance on standard benchmarks. This has enabled many downstream applications, such as LLM agents, to rely on their reasoning to address complex task requirements. However, LLMs are known to unexpectedly falter in simple tasks and under seemingly straightforward circumstances - underscoring the need for better and more diverse evaluation setups to measure their true capabilities. To this end, we choose to study compositional and conditional reasoning, two aspects that are central to human cognition, and introduce GroundCocoa - a lexically diverse benchmark connecting these reasoning skills to the real-world problem of flight booking. Our task involves aligning detailed user preferences with available flight options presented in a multiple-choice format. Results indicate a significant disparity in performance among current state-of-the-art LLMs with even the best performing model, GPT-4 Turbo, not exceeding 67% accuracy despite advanced prompting techniques.
Homepage: `https://osu-nlp-group.github.io/GroundCocoa/`
### Citation
```
@misc{kohli2025groundcocoabenchmarkevaluatingcompositional,
title={GroundCocoa: A Benchmark for Evaluating Compositional & Conditional Reasoning in Language Models},
author={Harsh Kohli and Sachin Kumar and Huan Sun},
year={2025},
eprint={2404.04237},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2404.04237},
}
```
### Groups and Tasks
#### Groups
- Not part of a group yet
#### Tasks
- `groundcocoa`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: groundcocoa
dataset_path: harsh147/GroundCocoa
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
process_docs: !function utils.process_docs
doc_to_text: "{{criteria}}"
doc_to_target: gold
doc_to_choice: "choices"
target_delimiter: ""
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
dataset_kwargs:
  trust_remote_code: true
  streaming: true
import datasets
import pandas as pd
from datasets import Dataset
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
cocoa_dataset = [sample for sample in dataset]
processed = []
for doc in cocoa_dataset:
question = "A user has specified certain criteria for booking a flight. Below are five different flight options labeled 'A', 'B', 'C', 'D', and 'E'. Review these options and select the one that best matches the user requirements. Respond with a single option and the phrase 'The answer is Option ' followed by the correct letter - 'A', 'B', 'C', 'D', or 'E'\n\n"
question = question + "User Criteria: " + doc["query"]
question = question + "\n\n Option A: " + str(doc["Option A"]) + "\n"
question = question + "\n Option B: " + str(doc["Option B"]) + "\n"
question = question + "\n Option C: " + str(doc["Option C"]) + "\n"
question = question + "\n Option D: " + str(doc["Option D"]) + "\n"
question = question + "\n Option E: " + str(doc["Option E"]) + "\n"
out_doc = {
"criteria": question,
"choices": [
"The answer is Option A",
"The answer is Option B",
"The answer is Option C",
"The answer is Option D",
"The answer is Option E",
],
"gold": "The answer is Option " + doc["Answer"],
}
processed.append(out_doc)
df = pd.DataFrame(processed)
dataset = Dataset.from_pandas(df)
return dataset
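A usage sketch with a single hypothetical row shaped like the dataset columns referenced above (the Option values are shown as plain strings for brevity), assuming `process_docs` is importable:

```python
import datasets

raw = datasets.Dataset.from_list([{
    "query": "a non-stop flight under 300 USD",
    "Option A": "Price: 250 USD, non-stop",
    "Option B": "Price: 450 USD, 1 stop",
    "Option C": "Price: 320 USD, non-stop",
    "Option D": "Price: 280 USD, 2 stops",
    "Option E": "Price: 500 USD, non-stop",
    "Answer": "A",
}])
doc = process_docs(raw)[0]
print(doc["gold"])          # -> "The answer is Option A"
print(len(doc["choices"]))  # -> 5
```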
# Histoires Morales
### Paper
Title: `Histoires Morales: A French Dataset for Assessing Moral Alignment`
Abstract: `https://arxiv.org/pdf/2501.17117`
⚖ Histoires Morales is the first dataset for evaluating the moral alignment of language models in French. It consists of narratives describing normative and norm-divergent actions taken by individuals to achieve certain intentions in concrete situations, along with their respective consequences.
Each of the 12,000 stories (histoires) follows the same seven-sentence structure as the Moral Stories dataset:
Context:
1. Norm: A guideline for social conduct generally observed by most people in everyday situations.
2. Situation: The setting of the story, introducing participants and describing their environment.
3. Intention: A reasonable goal that one of the story participants (the actor) wants to achieve.
Normative path:
4. Normative action: An action by the actor that fulfills the intention while observing the norm.
5. Normative consequence: A possible effect of the normative action on the actor’s environment.
Norm-divergent path:
6. Divergent action: An action by the actor that fulfills the intention but diverges from the norm.
7. Divergent consequence: A possible effect of the divergent action on the actor’s environment.
Histoires Morales is adapted to French from the widely used Moral Stories dataset.
We translated the Moral Stories dataset and refined these translations through manual annotations.
See paper for more details.
Homepage: `https://huggingface.co/datasets/LabHC/histoires_morales`
### Citation
Coming soon (accepted to NAACL 2025)
### Groups, Tags, and Tasks
#### Groups
* Not part of a group yet
#### Tags
No tags, since there is a single task.
#### Tasks
* `histoires_morales.yaml`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: histoires_morales
dataset_path: LabHC/histoires_morales
output_type: multiple_choice
test_split: train
process_docs: !function utils.process_docs
doc_to_text: "{{query}}"
doc_to_target: "{{label}}"
doc_to_choice: "choices"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
import datasets
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
def _process_doc(doc):
ctx = (
doc["norm"].capitalize()
+ " "
+ doc["situation"].capitalize()
+ " "
+ doc["intention"].capitalize()
)
choices = [doc["moral_action"], doc["immoral_action"]]
out_doc = {
"query": ctx,
"choices": choices,
"label": 0,
}
return out_doc
return dataset.map(_process_doc)
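A usage sketch with a single hypothetical row following the seven-sentence schema described in the README above, assuming `process_docs` is importable:

```python
import datasets

raw = datasets.Dataset.from_list([{
    "norm": "il est mal de mentir.",
    "situation": "paul a cassé le vase de sa mère.",
    "intention": "paul veut éviter une dispute.",
    "moral_action": "Paul avoue avoir cassé le vase.",
    "immoral_action": "Paul accuse son frère du dégât.",
}])
doc = process_docs(raw)[0]
print(doc["query"])  # capitalized norm + situation + intention
print(doc["label"])  # -> 0: the moral action is always the first choice
```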
include: humaneval.yaml
task: humaneval_plus
dataset_path: evalplus/humanevalplus
@@ -2,7 +2,6 @@ import dataclasses
 from typing import Dict, Optional, Union

 from lm_eval.tasks.ifeval import instructions_registry
-from lm_eval.utils import eval_logger

 @dataclasses.dataclass
......
tag:
- kobest
task: kobest_boolq
dataset_path: skt/kobest_v1
dataset_name: boolq
......
tag:
- kobest
task: kobest_copa
dataset_path: skt/kobest_v1
dataset_name: copa
......
tag:
- kobest
task: kobest_hellaswag
dataset_path: skt/kobest_v1
dataset_name: hellaswag
......
tag:
- kobest
task: kobest_sentineg
dataset_path: skt/kobest_v1
dataset_name: sentineg
......
tag:
- kobest
task: kobest_wic
dataset_path: skt/kobest_v1
dataset_name: wic
......
-dataset_path: lighteval/MATH-Hard
+dataset_path: DigitalLearningGmbH/MATH-lighteval
 process_docs: !function utils.process_docs
 output_type: generate_until
 training_split: train
......
+import logging
 import re
 import signal
 from typing import Dict, List, Optional

 import datasets

-from lm_eval.utils import eval_logger
+
+eval_logger = logging.getLogger(__name__)

 try:
     import sympy
......