gaoqiong / lm-evaluation-harness · Commits

Commit 2b56339e, authored Jan 17, 2025 by Baber
Merge branch 'main' into longcxt
Parents: 0b533339, 703fbffd
Changes: 316
Showing 16 changed files with 252 additions and 22 deletions (+252 −22)
lm_eval/tasks/mlqa/mlqa_vi_zh.yaml                              +5   −0
lm_eval/tasks/mlqa/mlqa_zh_ar.yaml                              +5   −0
lm_eval/tasks/mlqa/mlqa_zh_de.yaml                              +5   −0
lm_eval/tasks/mlqa/mlqa_zh_en.yaml                              +5   −0
lm_eval/tasks/mlqa/mlqa_zh_es.yaml                              +5   −0
lm_eval/tasks/mlqa/mlqa_zh_hi.yaml                              +5   −0
lm_eval/tasks/mlqa/mlqa_zh_vi.yaml                              +5   −0
lm_eval/tasks/mlqa/mlqa_zh_zh.yaml                              +5   −0
lm_eval/tasks/mlqa/utils.py                                     +165 −0
lm_eval/tasks/model_written_evals/winogenerated/winogenerated   +0   −0
lm_eval/tasks/scrolls/task.py                                   +18  −6
lm_eval/tasks/spanish_bench/phrases_es/_phrases_es_common       +0   −0
lm_eval/tasks/spanish_bench/phrases_es/phrases_es-va.yaml       +1   −1
lm_eval/tasks/spanish_bench/phrases_es/phrases_va-es.yaml       +1   −1
lm_eval/utils.py                                                +2   −1
scripts/zeno_visualize.py                                       +25  −13
lm_eval/tasks/mlqa/mlqa_vi_zh.yaml  (new file, mode 100644)

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_vi_zh
dataset_name: mlqa.vi.zh
process_results: !function utils.process_results_vi

lm_eval/tasks/mlqa/mlqa_zh_ar.yaml  (new file, mode 100644)

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_ar
dataset_name: mlqa.zh.ar
process_results: !function utils.process_results_zh

lm_eval/tasks/mlqa/mlqa_zh_de.yaml  (new file, mode 100644)

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_de
dataset_name: mlqa.zh.de
process_results: !function utils.process_results_zh

lm_eval/tasks/mlqa/mlqa_zh_en.yaml  (new file, mode 100644)

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_en
dataset_name: mlqa.zh.en
process_results: !function utils.process_results_zh

lm_eval/tasks/mlqa/mlqa_zh_es.yaml  (new file, mode 100644)

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_es
dataset_name: mlqa.zh.es
process_results: !function utils.process_results_zh

lm_eval/tasks/mlqa/mlqa_zh_hi.yaml  (new file, mode 100644)

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_hi
dataset_name: mlqa.zh.hi
process_results: !function utils.process_results_zh

lm_eval/tasks/mlqa/mlqa_zh_vi.yaml  (new file, mode 100644)

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_vi
dataset_name: mlqa.zh.vi
process_results: !function utils.process_results_zh

lm_eval/tasks/mlqa/mlqa_zh_zh.yaml  (new file, mode 100644)

# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_zh_zh
dataset_name: mlqa.zh.zh
process_results: !function utils.process_results_zh
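Each of these new configs only swaps the task name, the MLQA language-pair dataset configuration, and the language-specific `process_results` hook on top of the shared `mlqa_common_yaml` include. As a quick sanity check that the pairs are registered, something along these lines should work (illustrative sketch, not part of this commit; `TaskManager.all_tasks` is the listing API lm-eval exposes):

# Illustrative sketch (not part of this commit): list the newly added zh-* pairs.
from lm_eval.tasks import TaskManager

task_manager = TaskManager()
mlqa_zh_tasks = [name for name in task_manager.all_tasks if name.startswith("mlqa_zh")]
print(mlqa_zh_tasks)
# Expected to include: mlqa_zh_ar, mlqa_zh_de, mlqa_zh_en, mlqa_zh_es,
# mlqa_zh_hi, mlqa_zh_vi, mlqa_zh_zh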
lm_eval/tasks/mlqa/utils.py  (new file, mode 100644)

"""
Code based on Official evaluation script for the MLQA dataset.
Repo: https://github.com/facebookresearch/MLQA/blob/main/mlqa_evaluation_v1.py
"""

import re
import string
import sys
import unicodedata
from collections import Counter

import datasets


PUNCT = {
    chr(i)
    for i in range(sys.maxunicode)
    if unicodedata.category(chr(i)).startswith("P")
}.union(string.punctuation)
WHITESPACE_LANGS = ["en", "es", "hi", "vi", "de", "ar"]
MIXED_SEGMENTATION_LANGS = ["zh"]


def whitespace_tokenize(text):
    return text.split()


def mixed_segmentation(text):
    segs_out = []
    temp_str = ""
    for char in text:
        if re.search(r"[\u4e00-\u9fa5]", char) or char in PUNCT:
            if temp_str != "":
                ss = whitespace_tokenize(temp_str)
                segs_out.extend(ss)
                temp_str = ""
            segs_out.append(char)
        else:
            temp_str += char
    if temp_str != "":
        ss = whitespace_tokenize(temp_str)
        segs_out.extend(ss)
    return segs_out


def normalize_answer(s, lang):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text, lang):
        if lang == "en":
            return re.sub(r"\b(a|an|the)\b", " ", text)
        elif lang == "es":
            return re.sub(r"\b(un|una|unos|unas|el|la|los|las)\b", " ", text)
        elif lang == "hi":
            return text  # Hindi does not have formal articles
        elif lang == "vi":
            return re.sub(r"\b(của|là|cái|chiếc|những)\b", " ", text)
        elif lang == "de":
            return re.sub(
                r"\b(ein|eine|einen|einem|eines|einer|der|die|das|den|dem|des)\b",
                " ",
                text,
            )
        elif lang == "ar":
            return re.sub(r"\sال^|ال", " ", text)
        elif lang == "zh":
            return text  # Chinese does not have formal articles
        else:
            raise Exception("Unknown Language {}".format(lang))

    def white_space_fix(text, lang):
        if lang in WHITESPACE_LANGS:
            tokens = whitespace_tokenize(text)
        elif lang in MIXED_SEGMENTATION_LANGS:
            tokens = mixed_segmentation(text)
        else:
            raise Exception("Unknown Language {}".format(lang))
        return " ".join([t for t in tokens if t.strip() != ""])

    def remove_punc(text):
        return "".join(ch for ch in text if ch not in PUNCT)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s)), lang), lang)


def f1_score(prediction, ground_truth, lang):
    prediction_tokens = normalize_answer(prediction, lang).split()
    ground_truth_tokens = normalize_answer(ground_truth, lang).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth, lang):
    return normalize_answer(prediction, lang) == normalize_answer(ground_truth, lang)


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths, lang):
    scores_for_ground_truths = []
    for ground_truth in ground_truths:
        score = metric_fn(prediction, ground_truth, lang)
        scores_for_ground_truths.append(score)
    return max(scores_for_ground_truths)


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _process_doc(doc):
        out_doc = {
            "context": doc["context"],
            "question": doc["question"],
            "answers": doc["answers"]["text"],
        }
        return out_doc

    return dataset.map(_process_doc)


# Base function
def process_results_lang(doc, results, lang):
    ground_truths = doc["answers"]
    prediction = results[0].strip()
    exact_match = metric_max_over_ground_truths(
        exact_match_score, prediction, ground_truths, lang
    )
    f1 = metric_max_over_ground_truths(f1_score, prediction, ground_truths, lang)
    return {"exact_match": exact_match, "f1": f1}


# Language Wrapper functions
def process_results_en(doc, results):
    return process_results_lang(doc, results, "en")


def process_results_es(doc, results):
    return process_results_lang(doc, results, "es")


def process_results_hi(doc, results):
    return process_results_lang(doc, results, "hi")


def process_results_vi(doc, results):
    return process_results_lang(doc, results, "vi")


def process_results_de(doc, results):
    return process_results_lang(doc, results, "de")


def process_results_ar(doc, results):
    return process_results_lang(doc, results, "ar")


def process_results_zh(doc, results):
    return process_results_lang(doc, results, "zh")
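As a quick illustration of how these helpers behave (hypothetical usage, not part of the commit): English normalization lowercases, strips punctuation and articles, and re-joins on whitespace, while the Chinese path segments answers character by character before the token-level F1 is computed.

# Hypothetical usage of the helpers above (not part of this commit).
print(normalize_answer("The Great Wall!", "en"))    # -> "great wall"
print(normalize_answer("长城很长。", "zh"))           # -> "长 城 很 长" (character-level segmentation)
print(exact_match_score("The wall", "wall", "en"))  # -> True (article and case are stripped)
print(f1_score("长城", "长城很长", "zh"))             # -> 0.666... (precision 1.0, recall 0.5)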
lm_eval/tasks/model_written_evals/winogenerated/winogenerated.yaml → lm_eval/tasks/model_written_evals/winogenerated/winogenerated
File moved
lm_eval/tasks/scrolls/task.py

@@ -4,7 +4,8 @@ from functools import reduce
 import numpy as np
 import transformers.data.metrics.squad_metrics as squad_metrics
-from datasets import Dataset, load_metric
+from datasets import Dataset
+from evaluate import load
 from transformers import AutoTokenizer

 from lm_eval.api.instance import Instance

@@ -48,7 +49,10 @@ def _download_metric():
     from huggingface_hub import hf_hub_download

     scrolls_metric_path = hf_hub_download(
-        repo_id="tau/scrolls", repo_type="dataset", filename="metrics/scrolls.py"
+        repo_id="tau/scrolls",
+        repo_type="dataset",
+        filename="metrics/scrolls.py",
+        revision="refs/pr/5",
     )
     updated_scrolls_metric_path = (
         os.path.dirname(scrolls_metric_path)

@@ -119,7 +123,7 @@ class _SCROLLSTask(ConfigurableTask):
     def __init__(self, config=None):
         super().__init__(config={"metadata": {"version": self.VERSION}})
         if self.DATASET_NAME is not None:
-            self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
+            self.metric = load(_download_metric(), config_name=self.DATASET_NAME)

     def has_training_docs(self):
         return True

@@ -253,11 +257,14 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
         }

     def construct_requests(self, doc, ctx, **kwargs):
+        apply_chat_template = kwargs.pop("apply_chat_template", False)
         request_list = [
             Instance(
                 request_type="loglikelihood",
                 doc=doc,
-                arguments=(ctx, " {}".format(choice)),
+                arguments=(ctx, " {}".format(choice))
+                if not apply_chat_template
+                else (ctx, "{}".format(choice)),
                 idx=i,
                 **kwargs,
             )

@@ -285,6 +292,7 @@ class _SCROLLSSummaryTask(_SCROLLSTask):
         }

     def construct_requests(self, doc, ctx, **kwargs):
+        kwargs.pop("apply_chat_template", False)
         return Instance(
             request_type="generate_until",
             doc=doc,

@@ -327,19 +335,22 @@ class Qasper(_SCROLLSTask):
         return {"f1": (prediction, doc["outputs"])}

     def construct_requests(self, doc, ctx, **kwargs):
+        apply_chat_template = kwargs.pop("apply_chat_template", False)
         if doc["is_yes_no"]:
             return [
                 Instance(
                     request_type="loglikelihood",
                     doc=doc,
-                    arguments=(ctx, " yes"),
+                    arguments=(ctx, " yes")
+                    if not apply_chat_template
+                    else (ctx, "yes"),
                     idx=0,
                     **kwargs,
                 ),
                 Instance(
                     request_type="loglikelihood",
                     doc=doc,
-                    arguments=(ctx, " no"),
+                    arguments=(ctx, " no")
+                    if not apply_chat_template
+                    else (ctx, "no"),
                     idx=1,
                     **kwargs,
                 ),

@@ -406,6 +417,7 @@ class NarrativeQA(_SCROLLSTask):
         return {"f1": (results[0], doc["outputs"])}

     def construct_requests(self, doc, ctx, **kwargs):
+        kwargs.pop("apply_chat_template", False)
         return Instance(
             request_type="generate_until",
             doc=doc,
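The recurring change in these hunks is that construct_requests now pops an apply_chat_template flag from kwargs and, for loglikelihood requests, drops the leading space on the continuation when a chat template is in use. A minimal standalone sketch of that spacing rule (the helper below is illustrative and does not exist in the repository):

# Illustrative helper mirroring the argument-building rule in the diff above;
# this function itself is not part of the repository.
def build_arguments(ctx: str, choice: str, apply_chat_template: bool) -> tuple:
    # Raw LM prompts need the leading space so "Answer:" + " yes" tokenizes as
    # separate words; a chat template already delimits the assistant turn.
    if not apply_chat_template:
        return (ctx, " {}".format(choice))
    return (ctx, "{}".format(choice))


print(build_arguments("Answer:", "yes", apply_chat_template=False))  # ('Answer:', ' yes')
print(build_arguments("Answer:", "yes", apply_chat_template=True))   # ('Answer:', 'yes')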
lm_eval/tasks/spanish_bench/phrases_es/_phrases_es_common.yaml → lm_eval/tasks/spanish_bench/phrases_es/_phrases_es_common
File moved
lm_eval/tasks/spanish_bench/phrases_es/phrases_es-va.yaml

 # File generated by `create-yamls.py`
-include: _phrases_es_common.yaml
+include: _phrases_es_common
 task: phrases_es-va
 doc_to_text: 'Oració en espanyol: {{es}}
lm_eval/tasks/spanish_bench/phrases_es/phrases_va-es.yaml

 # File generated by `create-yamls.py`
-include: _phrases_es_common.yaml
+include: _phrases_es_common
 task: phrases_va-es
 doc_to_text: 'Oració en valencià: {{va}}
lm_eval/utils.py

@@ -104,7 +104,8 @@ def simple_parse_args_string(args_string):
         return {}
     arg_list = [arg for arg in args_string.split(",") if arg]
     args_dict = {
-        k: handle_arg_string(v) for k, v in [arg.split("=") for arg in arg_list]
+        kv[0]: handle_arg_string("=".join(kv[1:]))
+        for kv in [arg.split("=") for arg in arg_list]
     }
     return args_dict
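The rewritten comprehension splits each argument only on the first `=`, so a value may itself contain `=` without breaking the `(k, v)` unpacking the old line relied on. A standalone illustration (the example argument values are hypothetical, and `handle_arg_string`'s type coercion is omitted for brevity):

# Standalone sketch of the parsing change; handle_arg_string (type coercion) is
# omitted and the example values are hypothetical.
arg_list = ["pretrained=EleutherAI/pythia-160m", "gen_kwargs=temperature=0"]

# Old form: {k: handle_arg_string(v) for k, v in [arg.split("=") ...]} raises
# "ValueError: too many values to unpack" on the second entry.
args_dict = {
    kv[0]: "=".join(kv[1:])
    for kv in [arg.split("=") for arg in arg_list]
}
print(args_dict)  # {'pretrained': 'EleutherAI/pythia-160m', 'gen_kwargs': 'temperature=0'}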
scripts/zeno_visualize.py

@@ -109,13 +109,14 @@ def main():
             if model_index == 0:  # Only need to assemble data for the first model
                 metrics = []
                 for metric in config["metric_list"]:
-                    metrics.append(
-                        ZenoMetric(
-                            name=metric["metric"],
-                            type="mean",
-                            columns=[metric["metric"]],
-                        )
-                    )
+                    if metric.get("aggregation") == "mean":
+                        metrics.append(
+                            ZenoMetric(
+                                name=metric["metric"],
+                                type="mean",
+                                columns=[metric["metric"]],
+                            )
+                        )
                 project = client.create_project(
                     name=args.project_name + (f"_{task}" if len(tasks) > 1 else ""),
                     view="text-classification",

@@ -168,7 +169,11 @@ def generate_dataset(
     Returns:
         pd.Dataframe: A dataframe that is ready to be uploaded to Zeno.
     """
-    ids = [x["doc_id"] for x in data]
+    ids = (
+        [x["doc_id"] for x in data]
+        if not config.get("filter_list")
+        else [f"{x['doc_id']}.{x['filter']}" for x in data]
+    )
     labels = [x["target"] for x in data]
     instance = [""] * len(ids)

@@ -190,6 +195,7 @@ def generate_dataset(
     return pd.DataFrame(
         {
             "id": ids,
+            "doc_id": [x["doc_id"] for x in data],
             "data": instance,
             "input_len": [len(x) for x in instance],
             "labels": labels,

@@ -208,8 +214,15 @@ def generate_system_df(data, config):
     Returns:
         pd.Dataframe: A dataframe that is ready to be uploaded to Zeno as a system.
     """
-    ids = [x["doc_id"] for x in data]
+    ids = (
+        [x["doc_id"] for x in data]
+        if not config.get("filter_list")
+        else [f"{x['doc_id']}.{x['filter']}" for x in data]
+    )
     system_dict = {"id": ids}
+    system_dict["doc_id"] = [x["doc_id"] for x in data]
+    if config.get("filter_list"):
+        system_dict["filter"] = [x["filter"] for x in data]
     system_dict["output"] = [""] * len(ids)

     if config["output_type"] == "loglikelihood":

@@ -228,11 +241,10 @@ def generate_system_df(data, config):
         system_dict["output"] = [str(x["filtered_resps"][0]) for x in data]
         system_dict["output_length"] = [len(str(x["filtered_resps"][0])) for x in data]

-    metrics = {}
-    for metric in config["metric_list"]:
-        if "aggregation" in metric and metric["aggregation"] == "mean":
-            metrics[metric["metric"]] = [x[metric["metric"]] for x in data]
+    metrics = {
+        metric["metric"]: [x[metric["metric"]] for x in data]
+        for metric in config["metric_list"]
+    }
     system_dict.update(metrics)
     system_df = pd.DataFrame(system_dict)
     return system_df
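A small worked example of the new id construction in generate_dataset and generate_system_df (the data below is made up): when a task defines a filter_list, each document appears once per filter, so the doc id is suffixed with the filter name to keep the Zeno ids unique.

# Made-up data illustrating the id-suffixing logic introduced above.
data = [
    {"doc_id": 0, "filter": "strict-match"},
    {"doc_id": 0, "filter": "flexible-extract"},
    {"doc_id": 1, "filter": "strict-match"},
]
config = {"filter_list": [{"name": "strict-match"}, {"name": "flexible-extract"}]}

ids = (
    [x["doc_id"] for x in data]
    if not config.get("filter_list")
    else [f"{x['doc_id']}.{x['filter']}" for x in data]
)
print(ids)  # ['0.strict-match', '0.flexible-extract', '1.strict-match']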