Commit 2b56339e authored by Baber

Merge branch 'main' into longcxt

parents 0b533339 703fbffd
include: "_default_template_yaml"
task: "mmlu_pro_llama_psychology"
task_alias: "psychology"
process_docs: !function utils.process_psychology
import re
from functools import partial


def process_docs(dataset, subject):
    # Keep only the documents belonging to the requested MMLU-Pro category.
    return dataset.filter(lambda x: x["category"] == subject)


def fewshot_to_text(example):
    # Strip the CoT prefix and rephrase the answer line as "The best answer is X."
    text = example["cot_content"].removeprefix("A: Let's think step by step.").strip()
    return re.sub(r"The answer is \(([A-Z])\)\.", r"The best answer is \1.", text)
process_biology = partial(process_docs, subject="biology")
process_business = partial(process_docs, subject="business")
process_chemistry = partial(process_docs, subject="chemistry")
process_computer_science = partial(process_docs, subject="computer science")
process_economics = partial(process_docs, subject="economics")
process_engineering = partial(process_docs, subject="engineering")
process_health = partial(process_docs, subject="health")
process_history = partial(process_docs, subject="history")
process_law = partial(process_docs, subject="law")
process_math = partial(process_docs, subject="math")
process_other = partial(process_docs, subject="other")
process_philosophy = partial(process_docs, subject="philosophy")
process_physics = partial(process_docs, subject="physics")
process_psychology = partial(process_docs, subject="psychology")
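For reference, a minimal sketch (not part of the task files) of how these partials behave on a toy `datasets.Dataset`; the rows below are made up, and only the `category` column name comes from MMLU-Pro:

```python
# Hypothetical illustration of process_psychology; the toy rows are invented.
from datasets import Dataset

toy = Dataset.from_dict(
    {
        "question": ["Q1", "Q2"],
        "category": ["psychology", "law"],
    }
)
filtered = process_psychology(toy)  # keeps only rows with category == "psychology"
print(filtered["question"])  # ['Q1']
```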
# MBPP
## Paper
Program Synthesis with Large Language Models
https://arxiv.org/abs/2108.07732
This paper explores the limits of the current generation of large language models for program synthesis in general purpose programming languages. We evaluate a collection of such models (with between 244M and 137B parameters) on two new benchmarks, MBPP and MathQA-Python, in both the few-shot and fine-tuning regimes. Our benchmarks are designed to measure the ability of these models to synthesize short Python programs from natural language descriptions. The Mostly Basic Programming Problems (MBPP) dataset contains 974 programming tasks, designed to be solvable by entry-level programmers. The MathQA-Python dataset, a Python version of the MathQA benchmark, contains 23914 problems that evaluate the ability of the models to synthesize code from more complex text. On both datasets, we find that synthesis performance scales log-linearly with model size. Our largest models, even without finetuning on a code dataset, can synthesize solutions to 59.6 percent of the problems from MBPP using few-shot learning with a well-designed prompt. Fine-tuning on a held-out portion of the dataset improves performance by about 10 percentage points across most model sizes. On the MathQA-Python dataset, the largest fine-tuned model achieves 83.8 percent accuracy. Going further, we study the model's ability to engage in dialog about code, incorporating human feedback to improve its solutions. We find that natural language feedback from a human halves the error rate compared to the model's initial prediction. Additionally, we conduct an error analysis to shed light on where these models fall short and what types of programs are most difficult to generate. Finally, we explore the semantic grounding of these models by fine-tuning them to predict the results of program execution. We find that even our best models are generally unable to predict the output of a program given a specific input.
Homepage: https://github.com/google-research/google-research/tree/master/mbpp
## Citation
```
@article{austin2021program,
title={Program synthesis with large language models},
author={Austin, Jacob and Odena, Augustus and Nye, Maxwell and Bosma, Maarten and Michalewski, Henryk and Dohan, David and Jiang, Ellen and Cai, Carrie and Terry, Michael and Le, Quoc and others},
journal={arXiv preprint arXiv:2108.07732},
year={2021}
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
- `mbpp`
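A hedged invocation sketch using the harness's Python API; the model choice is an arbitrary placeholder, and depending on the installed version an explicit unsafe-code confirmation may also be required, since this task executes generated code:

```python
# Illustrative only: pass@1 on MBPP executes model-generated code, so the
# underlying code_eval metric must be explicitly enabled.
import os

os.environ["HF_ALLOW_CODE_EVAL"] = "1"

from lm_eval import simple_evaluate

results = simple_evaluate(
    model="hf",
    model_args="pretrained=gpt2",  # placeholder model
    tasks=["mbpp"],
    # Assumption: versions that enforce the task's unsafe_code flag may also require
    # an explicit confirmation switch (e.g. --confirm_run_unsafe_code on the CLI).
)
print(results["results"]["mbpp"])
```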
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: mbpp
dataset_path: google-research-datasets/mbpp
dataset_name: full
unsafe_code: true
output_type: generate_until
test_split: test
doc_to_text: "You are an expert Python programmer, and here is your task: {{text}} Your code should pass these tests:\n\n{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}\n[BEGIN]"
doc_to_target: "{% if is_fewshot is defined %}{{code}}\n[DONE]{% else %}{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}{% endif %}"
target_delimiter: "\n"
metric_list:
  - metric: !function utils.pass_at_1
    aggregation: mean
    higher_is_better: true
generation_kwargs:
  until:
    - "[DONE]"
  do_sample: false
num_fewshot: 3
fewshot_config:
  sampler: first_n
  samples: !function utils.list_fewshot_samples
metadata:
  version: 1.0
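As an illustration (not part of the task files), the `doc_to_text` template above produces prompts of the following shape; a minimal sketch rendering it with `jinja2` directly, using the fields of the first few-shot sample defined in `utils.py` below:

```python
# Hypothetical rendering sketch: the harness applies doc_to_text itself; this only
# shows what the prompt for a single document looks like.
from jinja2 import Template

doc = {
    "text": "Write a function to find the similar elements from the given two tuple lists.",
    "test_list": [
        "assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)",
        "assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)",
        "assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)",
    ],
}
doc_to_text = (
    "You are an expert Python programmer, and here is your task: {{text}} "
    "Your code should pass these tests:\n\n"
    "{{test_list[0]}}\n{{test_list[1]}}\n{{test_list[2]}}\n[BEGIN]"
)
print(Template(doc_to_text).render(**doc))
```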
import evaluate as hf_evaluate

try:
    pass_at_k = hf_evaluate.load("code_eval")

    # Run a simple sanity check so that a missing HF_ALLOW_CODE_EVAL=1 (required by
    # the code_eval metric to execute untrusted code) fails before model generation.
    test_cases = ["assert add(2, 3)==5"]
    candidates = [["def add(a,b): return a*b"]]
    results = pass_at_k.compute(references=test_cases, predictions=candidates, k=[1])
except Exception as e:
    raise e


def pass_at_1(references, predictions):
    return pass_at_k.compute(
        references=references,
        predictions=[predictions],
        k=[1],
    )[0]["pass@1"]
def list_fewshot_samples():
return [
{
"task_id": 2,
"text": "Write a function to find the similar elements from the given two tuple lists.",
"code": "def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res) ",
"test_list": [
"assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)",
"assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)",
"assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)",
],
"is_fewshot": True,
},
{
"task_id": 3,
"text": "Write a python function to identify non-prime numbers.",
"code": "import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result",
"test_list": [
"assert is_not_prime(2) == False",
"assert is_not_prime(10) == True",
"assert is_not_prime(35) == True",
],
"is_fewshot": True,
},
{
"task_id": 4,
"text": "Write a function to find the largest integers from a given list of numbers using heap queue algorithm.",
"code": "import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums",
"test_list": [
"assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] ",
"assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] ",
"assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]",
],
"is_fewshot": True,
},
]
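A short, hedged usage sketch of the underlying `code_eval` metric outside the harness (the environment variable is required by `evaluate`'s code_eval implementation; the toy `add` problem is illustrative):

```python
# Illustrative only: code_eval executes generated code and refuses to run
# unless HF_ALLOW_CODE_EVAL=1 is set.
import os

os.environ["HF_ALLOW_CODE_EVAL"] = "1"

import evaluate

code_eval = evaluate.load("code_eval")
scores, _details = code_eval.compute(
    references=["assert add(2, 3) == 5"],                # one test string per problem
    predictions=[["def add(a, b):\n    return a + b"]],  # candidate completions per problem
    k=[1],
)
print(scores["pass@1"])  # 1.0: the single candidate passes its test
```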
@@ -92,3 +92,7 @@ If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
# changelog
- (en_cot, direct) ver 3; (native_cot) ver 4: issue #2578; PR #2587
- fix fewshot format: changed the inconsistent mix of ASCII ':' and fullwidth '：' colons to a single consistent colon character.
@@ -32,4 +32,4 @@ metric_list:
ignore_case: true
ignore_punctuation: true
metadata:
version: 2.0
version: 3.0
# Generated by utils.py
dataset_name: ja
doc_to_target: '{% if answer is not none %}{{answer[11:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"問題: "+question+"\nAnswer:"}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"問題 "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
do_sample: false
until:
- '問題:'
- 問題
- </s>
- <|im_end|>
include: direct_yaml
# Generated by utils.py
dataset_name: zh
doc_to_target: '{% if answer is not none %}{{answer[6:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"问题: "+question+"\nAnswer:"}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"问题 "+question+"\nAnswer:"}}{% endif %}'
generation_kwargs:
do_sample: false
until:
- '问题:'
- 问题
- </s>
- <|im_end|>
include: direct_yaml
@@ -33,4 +33,4 @@ filter_list:
- function: take_first
name: flexible-extract
metadata:
version: 2.0
version: 3.0
# Generated by utils.py
dataset_name: ja
doc_to_target: '{% if answer is not none %}{{answer[11:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"問題: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"問題 "+question+"\nStep-by-Step Answer:"}}{% endif %}'
generation_kwargs:
do_sample: false
until:
- '問題:'
- 問題
- </s>
- <|im_end|>
include: cot_yaml
# Generated by utils.py
dataset_name: zh
doc_to_target: '{% if answer is not none %}{{answer[6:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"问题: "+question+"\nStep-by-Step Answer:"}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nStep-by-Step Answer:"}}{% else %}{{"问题 "+question+"\nStep-by-Step Answer:"}}{% endif %}'
generation_kwargs:
do_sample: false
until:
- '问题:'
- 问题
- </s>
- <|im_end|>
include: cot_yaml
@@ -28,4 +28,4 @@ filter_list:
regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
- function: "take_first"
metadata:
version: 3.0
version: 4.0
# Generated by utils.py
dataset_name: ja
doc_to_target: '{% if answer is not none %}{{answer[11:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nステップごとの答え:"}}{% else %}{{"問題: "+question+"\nステップごとの答え:"}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\nステップごとの答え:"}}{% else %}{{"問題 "+question+"\nステップごとの答え:"}}{% endif %}'
filter_list:
- filter:
- function: regex
@@ -17,7 +17,7 @@ filter_list:
generation_kwargs:
do_sample: false
until:
- '問題:'
- 問題
- </s>
- <|im_end|>
include: cot_yaml
# Generated by utils.py
dataset_name: zh
doc_to_target: '{% if answer is not none %}{{answer[6:]}}{% else %}{{answer_number|string}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\n逐步解答:"}}{% else %}{{"问题: "+question+"\n逐步解答:"}}{% endif %}'
doc_to_text: '{% if answer is not none %}{{question+"\n逐步解答:"}}{% else %}{{"问题 "+question+"\n逐步解答:"}}{% endif %}'
filter_list:
- filter:
- function: regex
@@ -17,7 +17,7 @@ filter_list:
generation_kwargs:
do_sample: false
until:
- '问题:'
- 问题
- </s>
- <|im_end|>
include: cot_yaml
@@ -75,7 +75,7 @@ LANGUAGES = {
},
"ja": { # Japanese
# "QUESTION": "問題:",
"QUESTION": "\u554f\u984c:",
"QUESTION": "\u554f\u984c",
# "ANSWER": "ステップごとの答え:",
"ANSWER": "\u30b9\u30c6\u30c3\u30d7\u3054\u3068\u306e\u7b54\u3048:",
"DIRECT": "Answer:",
@@ -84,7 +84,7 @@ LANGUAGES = {
},
"zh": { # Chinese
# "QUESTION": "问题:",
"QUESTION": "\u95ee\u9898:",
"QUESTION": "\u95ee\u9898",
# "ANSWER": "逐步解答:",
"ANSWER": "\u9010\u6b65\u89e3\u7b54:",
"DIRECT": "Answer:",
# MLQA
### Paper
Title: `MLQA: Evaluating Cross-lingual Extractive Question Answering`
Paper: `https://arxiv.org/abs/1910.07475`
MLQA (MultiLingual Question Answering) is a benchmark dataset for evaluating cross-lingual question answering performance.
MLQA consists of over 5K extractive QA instances (12K in English) in SQuAD format in seven languages: English, Arabic,
German, Spanish, Hindi, Vietnamese, and Simplified Chinese. MLQA is highly parallel, with QA instances parallel between
four different languages on average.
Homepage: `https://github.com/facebookresearch/MLQA`
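For orientation, the Hugging Face configs follow the `mlqa.<context-lang>.<question-lang>` naming used by `generate_tasks.py` further below; a minimal loading sketch (the split and field names are assumptions based on the SQuAD-style format):

```python
# Hypothetical loading sketch; this config pairs an Arabic context with an English question.
from datasets import load_dataset

ds = load_dataset("facebook/mlqa", "mlqa.ar.en", split="test", trust_remote_code=True)
sample = ds[0]
print(sample["question"])       # English question
print(sample["context"][:200])  # Arabic context passage
```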
### Citation
```
@misc{lewis2020mlqaevaluatingcrosslingualextractive,
title={MLQA: Evaluating Cross-lingual Extractive Question Answering},
author={Patrick Lewis and Barlas Oğuz and Ruty Rinott and Sebastian Riedel and Holger Schwenk},
year={2020},
eprint={1910.07475},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/1910.07475},
}
```
### Groups, Tags, and Tasks
#### Groups
* Not part of a group yet
#### Tasks
Tasks are of the form `mlqa_<context-lang>_<question-lang>.yaml`:
* `mlqa_ar_ar.yaml`
* `mlqa_ar_de.yaml`
* `mlqa_ar_vi.yaml`
* `mlqa_ar_zh.yaml`
* `mlqa_ar_en.yaml`
* `mlqa_ar_es.yaml`
* `mlqa_ar_hi.yaml`
* `mlqa_de_ar.yaml`
* `mlqa_de_de.yaml`
* `mlqa_de_vi.yaml`
* `mlqa_de_zh.yaml`
* `mlqa_de_en.yaml`
* `mlqa_de_es.yaml`
* `mlqa_de_hi.yaml`
* `mlqa_vi_ar.yaml`
* `mlqa_vi_de.yaml`
* `mlqa_vi_vi.yaml`
* `mlqa_vi_zh.yaml`
* `mlqa_vi_en.yaml`
* `mlqa_vi_es.yaml`
* `mlqa_vi_hi.yaml`
* `mlqa_zh_ar.yaml`
* `mlqa_zh_de.yaml`
* `mlqa_zh_vi.yaml`
* `mlqa_zh_zh.yaml`
* `mlqa_zh_en.yaml`
* `mlqa_zh_es.yaml`
* `mlqa_zh_hi.yaml`
* `mlqa_en_ar.yaml`
* `mlqa_en_de.yaml`
* `mlqa_en_vi.yaml`
* `mlqa_en_zh.yaml`
* `mlqa_en_en.yaml`
* `mlqa_en_es.yaml`
* `mlqa_en_hi.yaml`
* `mlqa_es_ar.yaml`
* `mlqa_es_de.yaml`
* `mlqa_es_vi.yaml`
* `mlqa_es_zh.yaml`
* `mlqa_es_en.yaml`
* `mlqa_es_es.yaml`
* `mlqa_es_hi.yaml`
* `mlqa_hi_ar.yaml`
* `mlqa_hi_de.yaml`
* `mlqa_hi_vi.yaml`
* `mlqa_hi_zh.yaml`
* `mlqa_hi_en.yaml`
* `mlqa_hi_es.yaml`
* `mlqa_hi_hi.yaml`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
# ruff: noqa: E731, E741
"""
Script to generate task YAMLs for the mlqa dataset.
Based on `tasks/bigbench/generate_tasks.py`.
"""
from datasets import get_dataset_config_names
chosen_subtasks = []
language_dict = {
"en": "english",
"es": "spanish",
"hi": "hindi",
"vi": "vietnamese",
"de": "german",
"ar": "arabic",
"zh": "chinese",
}
def main() -> None:
    configs = get_dataset_config_names("facebook/mlqa", trust_remote_code=True)
    for config in configs:
        # Keep only the three-part `mlqa.<context>.<question>` configs and skip the
        # two-part translate-train/translate-test configs.
        if len(config.split(".")) == 2:
            continue
        else:
            chosen_subtasks.append(config)
    assert len(chosen_subtasks) == 49

    for task in chosen_subtasks:
        file_name = f"{task.replace('.', '_')}.yaml"
        context_lang = file_name.split("_")[1]
        # Write the YAML-like content manually instead of using the yaml library,
        # to avoid tagging issues with !function.
        with open(file_name, "w", encoding="utf-8") as f:
            f.write("# Generated by generate_tasks.py\n")
            f.write("include: mlqa_common_yaml\n")
            f.write(f"task: {task.replace('.', '_')}\n")
            f.write(f"dataset_name: {task}\n")
            f.write(
                f"process_results: !function utils.process_results_{context_lang}\n"
            )


if __name__ == "__main__":
    main()
# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_ar_ar
dataset_name: mlqa.ar.ar
process_results: !function utils.process_results_ar
# Generated by generate_tasks.py
include: mlqa_common_yaml
task: mlqa_ar_de
dataset_name: mlqa.ar.de
process_results: !function utils.process_results_ar