Commit 741a6a69 authored by lintangsutawika

Merge branch 'main' of https://github.com/EleutherAI/lm-evaluation-harness into mela

parents 494a4515 b536f067
dataset_path: Idavidrein/gpqa
-group: gpqa
+tag: gpqa
output_type: generate_until
process_docs: !function utils.process_docs
training_split: train
@@ -36,4 +36,4 @@ metric_list:
ignore_case: true
ignore_punctuation: true
metadata:
-version: 1.0
+version: 2.0
dataset_path: Idavidrein/gpqa
-group: gpqa
+tag: gpqa
output_type: multiple_choice
process_docs: !function utils.process_docs
training_split: train
@@ -18,4 +18,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
-version: 1.0
+version: 2.0
dataset_path: Idavidrein/gpqa
-group: gpqa
+tag: gpqa
output_type: multiple_choice
process_docs: !function utils.process_docs
training_split: train
......
@@ -44,6 +44,9 @@ Homepage: https://github.com/openai/grade-school-math
- `gsm8k_yaml`
- `gsm8k_cot`: GSM8K with Chain-of-Thought
- `gsm8k_cot_self_consistency`: GSM8K with Chain-of-Thought and Self-Consistency
- `gsm8k_cot_llama`: GSM8K with prompt formatting modified to conform to the evaluation settings described by Meta here: https://huggingface.co/datasets/meta-llama/Meta-Llama-3.1-8B-Instruct-evals/viewer/Meta-Llama-3.1-8B-Instruct-evals__gsm8k__details?row=0
  - Use this task with `--fewshot_as_multiturn` and `--apply_chat_template` to replicate Meta's reported performance (see the example below).
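For example, a run that mirrors those two CLI flags through the harness's Python entry point might look like the sketch below. The model name is only illustrative, and the `apply_chat_template` / `fewshot_as_multiturn` keyword arguments are assumed to correspond to the CLI flags; exact argument names can vary between harness versions.

```python
# Illustrative sketch only: evaluate gsm8k_cot_llama with chat templating and
# multi-turn fewshot formatting, mirroring --apply_chat_template and
# --fewshot_as_multiturn. Keyword names may differ across harness versions.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=meta-llama/Meta-Llama-3.1-8B-Instruct",  # example model
    tasks=["gsm8k_cot_llama"],
    apply_chat_template=True,   # CLI: --apply_chat_template
    fewshot_as_multiturn=True,  # CLI: --fewshot_as_multiturn
)
print(results["results"]["gsm8k_cot_llama"])
```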
### Checklist
......
dataset_name: main
dataset_path: gsm8k
doc_to_target: '{{answer.split(''####'')[-1].strip() if answer is defined else target}}'
doc_to_text: "Given the following problem, reason and give a final answer to the problem.\nProblem: {{question}}\nYour response should end with \"The final answer is [answer]\" where [answer] is the response to the problem.\n"
fewshot_config:
sampler: first_n
samples:
- question: There are 15 trees in the grove. Grove workers will plant trees in the
grove today. After they are done, there will be 21 trees. How many trees did
the grove workers plant today?
target: There are 15 trees originally. Then there were 21 trees after some more
were planted. So there must have been 21 - 15 = 6. The final answer is 6
- question: If there are 3 cars in the parking lot and 2 more cars arrive, how many
cars are in the parking lot?
target: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The final answer
is 5
- question: Leah had 32 chocolates and her sister had 42. If they ate 35, how many
pieces do they have left in total?
target: Originally, Leah had 32 chocolates. Her sister had 42. So in total they
had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39. The final answer is 39
- question: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12
lollipops. How many lollipops did Jason give to Denny?
target: Jason started with 20 lollipops. Then he had 12 after giving some to Denny.
So he gave Denny 20 - 12 = 8. The final answer is 8
- question: Shawn has five toys. For Christmas, he got two toys each from his mom and
dad. How many toys does he have now?
target: Shawn started with 5 toys. If he got 2 toys each from his mom and dad,
then that is 4 more toys. 5 + 4 = 9. The final answer is 9
- question: There were nine computers in the server room. Five more computers were
installed each day, from monday to thursday. How many computers are now in the
server room?
target: There were originally 9 computers. For each of 4 days, 5 more computers
were added. So 5 * 4 = 20 computers were added. 9 + 20 is 29. The final answer is
29
- question: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday,
he lost 2 more. How many golf balls did he have at the end of wednesday?
target: Michael started with 58 golf balls. After losing 23 on tuesday, he had
58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The final answer
is 33
- question: Olivia has $23. She bought five bagels for $3 each. How much money does
she have left?
target: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15
dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The final answer is 8
filter_list:
- filter:
- function: regex
group_select: -1
regex_pattern: The final answer is ((-?[$0-9.,]{2,})|(-?[0-9]+))
- function: take_first
name: strict-match
- filter:
- function: regex
group_select: -1
regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
- function: take_first
name: flexible-extract
generation_kwargs:
do_sample: false
until:
- '<|eot_id|>'
- '<|start_header_id|>user<|end_header_id|>'
- 'Q:'
- </s>
- <|im_end|>
tag:
- chain_of_thought
metadata:
version: 3.0
metric_list:
- aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: false
metric: exact_match
regexes_to_ignore:
- ','
- \$
- '(?s).*#### '
- \.$
num_fewshot: 8
output_type: generate_until
repeats: 1
task: gsm8k_cot_llama
test_split: test
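The two filters in the config above differ only in how strictly the answer is extracted: `strict-match` requires the literal "The final answer is" phrase, while `flexible-extract` keeps the last number-like span anywhere in the completion (`group_select: -1`). The Python sketch below illustrates that post-processing with the same regexes; it is a simplified stand-in, not the harness's internal filter implementation, and the sample response is made up.

```python
import re

# Regexes copied from the filter_list above.
STRICT = re.compile(r"The final answer is ((-?[$0-9.,]{2,})|(-?[0-9]+))")
FLEXIBLE = re.compile(r"(-?[$0-9.,]{2,})|(-?[0-9]+)")

def extract_last(pattern: re.Pattern, text: str):
    """Return the last match in the text (roughly what group_select: -1 selects)."""
    matches = pattern.findall(text)
    if not matches:
        return None
    last = matches[-1]
    # findall returns a tuple per match because of the alternation groups;
    # keep the first non-empty capture.
    return next(g for g in last if g) if isinstance(last, tuple) else last

response = "Olivia had 23 dollars. 5 x 3 = 15, so 23 - 15 = 8. The final answer is 8"
print(extract_last(STRICT, response))    # 8
print(extract_last(FLEXIBLE, response))  # 8
```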
include: gsm8k-cot.yaml
-group:
+tag:
- chain_of_thought
- self_consistency
task: gsm8k_cot_self_consistency
......
-group:
+tag:
- math_word_problems
task: gsm8k_cot_zeroshot
dataset_path: gsm8k
......
@@ -61,7 +61,7 @@ generation_kwargs:
- 'Q:'
- </s>
- <|im_end|>
-group:
+tag:
- chain_of_thought
metadata:
version: 3.0
......
-group:
+tag:
- math_word_problems
task: gsm8k
dataset_path: gsm8k
......
# gsm_plus
### Paper
Title: `GSM-PLUS: A Comprehensive Benchmark for Evaluating the Robustness of LLMs as Mathematical Problem Solvers`
Abstract: `Large language models (LLMs) have achieved impressive performance across various mathematical reasoning benchmarks. However, there are increasing debates regarding whether these models truly understand and apply mathematical knowledge or merely rely on shortcuts for mathematical reasoning. One essential and frequently occurring evidence is that when the math questions are slightly changed, LLMs can behave incorrectly. This motivates us to evaluate the robustness of LLMs’ math reasoning capability by testing a wide range of question variations. We introduce the adversarial grade school math (GSM-PLUS) dataset, an extension of GSM8K augmented with various mathematical perturbations. Our experiments on 25 LLMs and 4 prompting techniques show that while LLMs exhibit different levels of math reasoning abilities, their performances are far from robust. In particular, even for problems that have been solved in GSM8K, LLMs can make mistakes when new statements are added or the question targets are altered. We also explore whether more robust performance can be achieved by composing existing prompting methods, in which we try an iterative method that generates and verifies each intermediate thought based on its reasoning goal and calculation result.`
Homepage: https://huggingface.co/datasets/qintongli/GSM-Plus
### Citation
```bibtex
@misc{li2024gsmpluscomprehensivebenchmarkevaluating,
title={GSM-Plus: A Comprehensive Benchmark for Evaluating the Robustness of LLMs as Mathematical Problem Solvers},
author={Qintong Li and Leyang Cui and Xueliang Zhao and Lingpeng Kong and Wei Bi},
year={2024},
eprint={2402.19255},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2402.19255},
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet
#### Tasks
The following tasks evaluate the gsm_plus dataset:
- `gsm_plus`: evaluates the full `test` split
- `gsm_plus_mini`: evaluates the smaller `testmini` split
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
tag:
- math_word_problems
task: gsm_plus
dataset_path: qintongli/GSM-Plus
output_type: generate_until
training_split: test
fewshot_split: test
test_split: test
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{solution}}"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: false
regexes_to_ignore:
- ","
- "\\$"
- "(?s).*#### "
- "\\.$"
generation_kwargs:
until:
- "Question:"
- "</s>"
- "<|im_end|>"
do_sample: false
temperature: 0.0
repeats: 1
num_fewshot: 5
filter_list:
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
- function: "take_first"
- name: "flexible-extract"
filter:
- function: "regex"
group_select: -1
regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
- function: "take_first"
metadata:
version: 1.0
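The `exact_match` metric above compares the filtered prediction against the gold `solution` after stripping the `regexes_to_ignore` patterns and, because `ignore_case` is true, lowercasing both sides. The sketch below is a rough illustration of that normalization, not the harness's actual scoring code; the gold/prediction strings are made up.

```python
import re

# Patterns listed under regexes_to_ignore in the config above.
IGNORE = [",", r"\$", r"(?s).*#### ", r"\.$"]

def normalize(text: str) -> str:
    # Drop commas, dollar signs, everything up to the final "#### " marker,
    # and a trailing period; lowercase because ignore_case is true.
    for pattern in IGNORE:
        text = re.sub(pattern, "", text)
    return text.strip().lower()

gold = "He pays 12 * 2 = 24 dollars.\n#### $24."  # hypothetical reference solution
pred = "24"                                       # hypothetical filtered answer
print(normalize(gold) == normalize(pred))         # True
```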
tag:
- math_word_problems
task: gsm_plus_mini
dataset_path: qintongli/GSM-Plus
output_type: generate_until
training_split: testmini
fewshot_split: testmini
test_split: testmini
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{solution}}"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: false
regexes_to_ignore:
- ","
- "\\$"
- "(?s).*#### "
- "\\.$"
generation_kwargs:
until:
- "Question:"
- "</s>"
- "<|im_end|>"
do_sample: false
temperature: 0.0
repeats: 1
num_fewshot: 5
filter_list:
- name: "strict-match"
filter:
- function: "regex"
regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
- function: "take_first"
- name: "flexible-extract"
filter:
- function: "regex"
group_select: -1
regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
- function: "take_first"
metadata:
version: 1.0
group: haerae
dataset_path: HAERAE-HUB/HAE_RAE_BENCH
test_split: test
fewshot_split: test
......
group: haerae
task:
- haerae_general_knowledge
- haerae_history
- haerae_loan_word
- haerae_rare_word
- haerae_standard_nomenclature
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: true
- metric: acc_norm
aggregation: mean
weight_by_size: true
metadata:
version: 1.0
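In the group config above, `weight_by_size: true` makes the aggregated score a size-weighted (micro) average: each subtask contributes in proportion to its number of documents rather than equally. A small illustration with hypothetical accuracies and document counts (not the real HAE-RAE split sizes):

```python
# Hypothetical per-subtask (accuracy, document count) pairs.
subtask_results = {
    "haerae_general_knowledge": (0.62, 150),
    "haerae_history": (0.71, 180),
    "haerae_loan_word": (0.55, 170),
}

total_docs = sum(n for _, n in subtask_results.values())

# weight_by_size: true -> micro average, weighted by document count.
weighted = sum(acc * n for acc, n in subtask_results.values()) / total_docs

# weight_by_size: false -> plain (macro) mean over subtasks.
uniform = sum(acc for acc, _ in subtask_results.values()) / len(subtask_results)

print(f"size-weighted acc: {weighted:.4f}")
print(f"unweighted acc:    {uniform:.4f}")
```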
"dataset_name": "general_knowledge"
"include": "_default_haerae_yaml"
"task": "haerae_general_knowledge"
dataset_name: general_knowledge
include: _default_haerae_yaml
task: haerae_general_knowledge
"dataset_name": "history"
"include": "_default_haerae_yaml"
"task": "haerae_history"
dataset_name: history
include: _default_haerae_yaml
task: haerae_history
"dataset_name": "loan_words"
"include": "_default_haerae_yaml"
"task": "haerae_loan_word"
dataset_name: loan_words
include: _default_haerae_yaml
task: haerae_loan_word
"dataset_name": "rare_words"
"include": "_default_haerae_yaml"
"task": "haerae_rare_word"
dataset_name: rare_words
include: _default_haerae_yaml
task: haerae_rare_word
"dataset_name": "standard_nomenclature"
"include": "_default_haerae_yaml"
"task": "haerae_standard_nomenclature"
dataset_name: standard_nomenclature
include: _default_haerae_yaml
task: haerae_standard_nomenclature
-group:
-  - headqa
+tag: headqa
task: headqa_en
dataset_path: EleutherAI/headqa
dataset_name: en
......