Unverified Commit da211969 authored by Jess, committed by GitHub

Merge branch 'EleutherAI:main' into main

parents 1b97e487 801322e0
group: bigbench_multiple_choice
dataset_path: hails/bigbench
dataset_kwargs:
  # num_shots: 0  # TODO: the number of shots for the `bigbench` HF dataset should be controlled through this, not through the typical methods
  # subtask_name: null
output_type: multiple_choice
test_split: default
doc_to_text: inputs
doc_to_target: "{{multiple_choice_scores.index(1)}}"
doc_to_choice: "{{multiple_choice_targets}}"
metric_list:
  - metric: acc
  # TODO: brier score and other metrics
metadata:
  version: 1.0
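
The two Jinja templates above do all the work here: `doc_to_target` locates the gold answer as the index of the single `1` in `multiple_choice_scores`, and `doc_to_choice` passes the candidate strings through unchanged. A minimal sketch of what they compute, using a hypothetical record in the dataset's field layout:

```python
# Hypothetical bigbench-style record; field names mirror the YAML above,
# but the values are invented for illustration.
doc = {
    "inputs": "Q: Which planet is known as the red planet?",
    "multiple_choice_targets": ["Venus", "Mars", "Jupiter"],
    "multiple_choice_scores": [0, 1, 0],  # exactly one 1 marks the gold choice
}

prompt = doc["inputs"]                                # doc_to_text
target_idx = doc["multiple_choice_scores"].index(1)   # doc_to_target -> 1
choices = doc["multiple_choice_targets"]              # doc_to_choice

print(prompt)
print(choices[target_idx])  # "Mars"
```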
......@@ -8,6 +8,7 @@ Requires the installation of
`pip install "bigbench @ https://storage.googleapis.com/public_research_data/bigbench/bigbench-0.0.1.tar.gz"`
and is included so that the bigbench dependency can be avoided.
"""
import bigbench.api.util as bb_utils
import datasets
from tqdm import tqdm
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
......
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
......
# CommonsenseQA
### Paper
Title: `CommonsenseQA: A Question Answering Challenge Targeting Commonsense Knowledge`
Abstract: https://arxiv.org/pdf/1811.00937.pdf
CommonsenseQA is a multiple-choice question answering dataset that requires different types of commonsense knowledge to predict the correct answers.
It contains 12,102 questions with one correct answer and four distractor answers.
Homepage: https://www.tau-nlp.org/commonsenseqa
### Citation
```
@inproceedings{talmor-etal-2019-commonsenseqa,
title = "{C}ommonsense{QA}: A Question Answering Challenge Targeting Commonsense Knowledge",
author = "Talmor, Alon and
Herzig, Jonathan and
Lourie, Nicholas and
Berant, Jonathan",
booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)",
month = jun,
year = "2019",
address = "Minneapolis, Minnesota",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/N19-1421",
doi = "10.18653/v1/N19-1421",
pages = "4149--4158",
archivePrefix = "arXiv",
eprint = "1811.00937",
primaryClass = "cs",
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
* `commonsense_qa`: Represents the "random" split from the paper. Uses an MMLU-style prompt, as (presumably) used by Llama evaluations.
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: commonsense_qa
dataset_path: tau/commonsense_qa
training_split: train
validation_split: validation
output_type: multiple_choice
doc_to_text: "Question: {{ question.strip() }}\nA. {{choices['text'][0]}}\nB. {{choices['text'][1]}}\nC. {{choices['text'][2]}}\nD. {{choices['text'][3]}}\nE. {{choices['text'][4]}}\nAnswer:"
doc_to_target: answerKey
doc_to_choice: ['A', 'B', 'C', 'D', 'E']
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
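
The `doc_to_text` template renders each record into an MMLU-style prompt whose answer letters match `doc_to_choice`. A quick sketch of the rendered output for a hypothetical record, with plain Python standing in for the Jinja template:

```python
# Hypothetical CommonsenseQA-style record; values are illustrative only.
doc = {
    "question": "Where would you expect to find a seat that reclines?",
    "choices": {"text": ["theater", "bus", "airplane", "office", "kitchen"]},
    "answerKey": "C",
}

letters = ["A", "B", "C", "D", "E"]
lines = [f"Question: {doc['question'].strip()}"]
lines += [f"{l}. {t}" for l, t in zip(letters, doc["choices"]["text"])]
lines.append("Answer:")
print("\n".join(lines))  # the model scores each letter as a continuation
```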
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
......
"""
"""
import re
from typing import List
......
......@@ -38,18 +38,19 @@ Homepage: https://github.com/hitachi-nlp/FLD
### Groups and Tasks
#### Groups
* `fld`
#### Tasks
This release is a simplified version of FLD in which a model is required to predict only the answer.
This setting corresponds to the "answer accuracy" metric in the original paper.
#### Tasks in Group `fld`
* `fld_default`: the basic task, based on [FLD.v2](https://huggingface.co/datasets/hitachi-nlp/FLD.v2/viewer/star)
* `fld_star`: a more challenging version, based on [FLD.v2-star](https://huggingface.co/datasets/hitachi-nlp/FLD.v2/viewer/star)
#### Tasks in Group `fld_logical_formula`
Further, we have "logical formula" versions of the benchmarks, which evaluate LLMs' pure logical reasoning capabilities within the domain of logical formulas, rather than natural language:
* `fld_logical_formula_default`
* `fld_logical_formula_star`
### Checklist
For adding novel benchmarks/datasets to the library:
......
group:
- fld_logical_formula
task: fld_logical_formula_default
dataset_path: hitachi-nlp/FLD.v2
dataset_name: default
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Based on the provided facts ($context$), either prove or disprove the hypothesis or state that it is unknown. The facts and the hypothesis are written in logical formulas as follows: capital letters such as \"{A}\", \"{B}\", \"{AB}\" are predicates, small letters such as \"{a}\", \"{b}\", \"{ab}\" are constants, \"&\" is logical conjunction, \"v\" is logical disjunction, \"¬\" is negation, \"->\" is implication, \"(x)\" is \"for all x\", and \"(Ex)\" is \"for some x\". $hypothesis$ = {{hypothesis_formula}} ; $context$ = {{context_formula}} ; $proof$ = "
doc_to_target: world_assump_label
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
filter_list:
  - name: remove_whitespace
    filter:
      - function: remove_whitespace
      - function: take_first
metadata:
  version: 2.0
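
The filter chain above post-processes each generated continuation before `exact_match` is computed against `world_assump_label`. A minimal sketch of that pipeline; the three-way proof labels shown are an assumption about the dataset, used only for illustration, and this mirrors the YAML rather than the harness's internal implementation:

```python
# Sketch of the filter chain: strip whitespace, keep the first response,
# then exact-match against the gold label.
def remove_whitespace(responses):
    return [r.strip() for r in responses]

def take_first(responses):
    return responses[0]

generations = ["  UNKNOWN\n"]             # hypothetical model output
prediction = take_first(remove_whitespace(generations))
gold = "UNKNOWN"                          # assumed world_assump_label value
print(float(prediction == gold))          # exact_match -> 1.0
```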
include: fld_logical_formula_default.yaml
task: fld_logical_formula_star
dataset_name: star
# Glianorex
The goal of this benchmark is to isolate a model's test-taking ability from its content knowledge.
### Paper
Title: Multiple Choice Questions and Large Language Models: A Case Study with Fictional Medical Data
Abstract: https://arxiv.org/abs/2406.02394
To test whether MCQs can assess LLM performance without prior data exposure, we created a fictional medical benchmark and knowledge base about a non-existent gland, the Glianorex. Using GPT-4, we generated a comprehensive textbook on the Glianorex and multiple-choice questions about it, both in English and in French.
### Tasks
All tasks are multiple-choice questions with 4 options, exactly one of which is correct.
- `glianorex`: Evaluates all tasks listed below.
- `glianorex_en`: Evaluates the accuracy on 264 questions in English.
- `glianorex_fr`: Evaluates the accuracy on 264 questions in French.
task: glianorex
dataset_path: maximegmd/glianorex
output_type: multiple_choice
test_split: train
doc_to_text: !function preprocess_glianorex.doc_to_text
doc_to_target: !function preprocess_glianorex.doc_to_target
doc_to_choice: [ 'A', 'B', 'C', 'D' ]
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
task: glianorex_en
dataset_path: maximegmd/glianorex
output_type: multiple_choice
test_split: train
doc_to_text: !function preprocess_glianorex.doc_to_text
doc_to_target: !function preprocess_glianorex.doc_to_target
process_docs: !function preprocess_glianorex.filter_english
doc_to_choice: [ 'A', 'B', 'C', 'D' ]
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
task: glianorex_fr
dataset_path: maximegmd/glianorex
output_type: multiple_choice
test_split: train
doc_to_text: !function preprocess_glianorex.doc_to_text
doc_to_target: !function preprocess_glianorex.doc_to_target
process_docs: !function preprocess_glianorex.filter_french
doc_to_choice: [ 'A', 'B', 'C', 'D' ]
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
import datasets


def doc_to_text(doc) -> str:
    # Render the question plus lettered options, ending with "Answer:".
    option_choices = doc["options"]
    answers = "".join(f"{k}. {v}\n" for k, v in option_choices.items())
    return f"Question: {doc['question']}\n{answers}Answer:"


def doc_to_target(doc) -> int:
    return doc["answer_idx"]


def filter_dataset(dataset: datasets.Dataset, lang: str) -> datasets.Dataset:
    # Keep only examples whose language tag starts with the given prefix.
    return dataset.filter(lambda example: example["language"].startswith(lang))


def filter_french(dataset: datasets.Dataset) -> datasets.Dataset:
    return filter_dataset(dataset, "fr")


def filter_english(dataset: datasets.Dataset) -> datasets.Dataset:
    return filter_dataset(dataset, "en")
group:
- chain_of_thought
task: gsm8k_cot
dataset_path: gsm8k
dataset_name: main
output_type: generate_until
test_split: test
doc_to_text: "Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\nA: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.\n\n\
  Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\nA: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.\n\n\
  Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\nA: Originally, Leah had 32 chocolates. Her sister had 42. So in total they had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39. The answer is 39.\n\n\
  Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?\nA: Jason started with 20 lollipops. Then he had 12 after giving some to Denny. So he gave Denny 20 - 12 = 8. The answer is 8.\n\n\
  Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?\nA: Shawn started with 5 toys. If he got 2 toys each from his mom and dad, then that is 4 more toys. 5 + 4 = 9. The answer is 9.\n\n\
  Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?\nA: There were originally 9 computers. For each of 4 days, 5 more computers were added. So 5 * 4 = 20 computers were added. 9 + 20 is 29. The answer is 29.\n\n\
  Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\nA: Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The answer is 33.\n\n\
  Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\nA: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The answer is 8.\n\n\
  Q: {{question}}\nA:"
doc_to_target: "{{answer.split('####')[-1].strip()}}"
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: false
    regexes_to_ignore:
      - ","
      - "\\$"
      - "(?s).*#### "
      - "\\.$"
dataset_path: gsm8k
doc_to_target: '{{answer.split(''####'')[-1].strip() if answer is defined else target}}'
doc_to_text: 'Q: {{question}}

  A:'
fewshot_config:
  sampler: first_n
  samples:
    - question: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
      target: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.
    - question: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
      target: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.
    - question: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?
      target: Originally, Leah had 32 chocolates. Her sister had 42. So in total they had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39. The answer is 39.
    - question: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?
      target: Jason started with 20 lollipops. Then he had 12 after giving some to Denny. So he gave Denny 20 - 12 = 8. The answer is 8.
    - question: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?
      target: Shawn started with 5 toys. If he got 2 toys each from his mom and dad, then that is 4 more toys. 5 + 4 = 9. The answer is 9.
    - question: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?
      target: There were originally 9 computers. For each of 4 days, 5 more computers were added. So 5 * 4 = 20 computers were added. 9 + 20 is 29. The answer is 29.
    - question: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?
      target: Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The answer is 33.
    - question: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?
      target: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The answer is 8.
filter_list:
  - filter:
      - function: regex
        regex_pattern: The answer is (\-?[0-9\.\,]+).
      - function: take_first
    name: strict-match
  - filter:
      - function: regex
        group_select: -1
        regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
      - function: take_first
    name: flexible-extract
generation_kwargs:
  until:
    - "Q:"
    - "</s>"
    - "<|im_end|>"
  do_sample: false
repeats: 1
num_fewshot: 0
filter_list:
  - name: "strict-match"
    filter:
      - function: "regex"
        regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)."
      - function: "take_first"
  - name: "flexible-extract"
    filter:
      - function: "regex"
        group_select: -1
        regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
      - function: "take_first"
until:
- 'Q:'
- </s>
- <|im_end|>
group:
- chain_of_thought
metadata:
  version: 3.0
metric_list:
  - aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: false
    metric: exact_match
    regexes_to_ignore:
      - ','
      - \$
      - '(?s).*#### '
      - \.$
num_fewshot: 8
output_type: generate_until
repeats: 1
task: gsm8k_cot
test_split: test
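
Both filter variants above extract a final numeric answer from the chain-of-thought generation before `exact_match` scoring. A short sketch of the two regexes applied to a hypothetical model continuation:

```python
import re

# Hypothetical model continuation for the Olivia few-shot question above.
response = "She has 23 - 15 = 8 dollars left. The answer is 8."

# strict-match: accept only the canonical "The answer is N." phrasing.
strict = re.search(r"The answer is (\-?[0-9\.\,]+).", response)

# flexible-extract: take the last number-like span anywhere in the text
# (group_select: -1 in the YAML selects the last match).
matches = re.findall(r"(-?[$0-9.,]{2,})|(-?[0-9]+)", response)
flexible = next(g for g in matches[-1] if g) if matches else None

print(strict.group(1) if strict else None)  # "8"
print(flexible)  # "8." -- the "\.$" entry in regexes_to_ignore drops the dot
```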
......@@ -13,6 +13,7 @@
# limitations under the License.
"""Library of instructions."""
import collections
import json
import logging
......
......@@ -13,6 +13,7 @@
# limitations under the License.
"""Registry of all instructions."""
from lm_eval.tasks.ifeval import instructions
......
......@@ -5,7 +5,7 @@ dataset_path: HAERAE-HUB/KMMLU-HARD
output_type: generate_until
validation_split: dev # not meant to be used, only here to silence warnings
test_split: test
doc_to_target: "{{['A', 'B', 'C', 'D'][answer-1]}}"
doc_to_target: "{{['A', 'B', 'C', 'D'][answer-1] if answer is defined else target}}" # `answer` is undefined for CoT few-shot exemplars, which carry a ready-made `target` instead; normal samples define `answer`
metric_list:
  - metric: exact_match
    aggregation: mean
......@@ -18,7 +18,7 @@ generation_kwargs:
  do_sample: false
  max_gen_toks: 2048
  temperature: 0.0
num_fewshot: 0
num_fewshot: 5
filter_list:
  - name: "get-answer"
    filter:
......@@ -28,4 +28,3 @@ filter_list:
- function: "take_first"
metadata:
version: 2.0
num_fewshot: 5
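
The conditional `doc_to_target` used by these CoT tasks resolves differently for scored test samples and few-shot exemplars. A sketch of the logic in plain Python, standing in for the Jinja template; field names follow the YAML above:

```python
# Sketch of the conditional doc_to_target: test docs carry a 1-based
# `answer` index, while CoT few-shot exemplars carry a ready-made
# `target` string, so the template falls back to it.
def doc_to_target(doc):
    if "answer" in doc:                      # normal KMMLU-HARD sample
        return ["A", "B", "C", "D"][doc["answer"] - 1]
    return doc["target"]                     # few-shot CoT exemplar

print(doc_to_target({"answer": 2}))                        # "B"
print(doc_to_target({"target": "... The answer is (C)."}))
```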