Unverified Commit 26bc3eab authored by Lintang Sutawika, committed by GitHub

Merge branch 'big-refactor' into model-written-eval

parents 0d701496 cf617ab1
@@ -2,7 +2,7 @@ group: mmlu_flan_cot_zeroshot
dataset_path: cais/mmlu
validation_split: validation
fewshot_split: dev
-output_type: greedy_until
+output_type: generate_until
doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: Let's think step by step."
doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
filter_list:
......
@@ -2,7 +2,7 @@ group: mmlu_flan_n_shot_generative
dataset_path: cais/mmlu
test_split: test
fewshot_split: dev
-output_type: greedy_until
+output_type: generate_until
doc_to_text: "Q: {{question.strip()}}\n(A) {{choices[0]}} (B) {{choices[1]}} (C) {{choices[2]}} (D) {{choices[3]}}\nA: "
doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
generation_kwargs:
......
task: nq_open
dataset_path: nq_open
-output_type: greedy_until
+output_type: generate_until
training_split: train
validation_split: validation
description: "Answer these questions:\n"
......
@@ -3,7 +3,7 @@ group:
task: polemo2_in
dataset_path: allegro/klej-polemo2-in
dataset_name: klej-polemo2-in
-output_type: greedy_until
+output_type: generate_until
training_split: train
validation_split: validation
test_split: test
......
group: qasper
task: qasper_freeform
dataset_path: qasper
-output_type: greedy_until
+output_type: generate_until
training_split: train
validation_split: validation
process_docs: !function utils.process_docs_freeform
......
@@ -2,25 +2,44 @@
### Paper
-Title: `paper title goes here`
-Abstract: `link to paper PDF or arXiv abstract goes here`
+Title: `Know What You Don’t Know: Unanswerable Questions for SQuAD`
+Abstract: https://arxiv.org/abs/1806.03822
-`Short description of paper / benchmark goes here:`
+Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
+consisting of questions posed by crowdworkers on a set of Wikipedia articles,
+where the answer to every question is a segment of text, or span, from the
+corresponding reading passage, or the question might be unanswerable.
+SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable
+questions written adversarially by crowdworkers to look similar to answerable ones.
+To do well on SQuAD2.0, systems must not only answer questions when possible, but
+also determine when no answer is supported by the paragraph and abstain from answering.
-Homepage: `homepage to the benchmark's website goes here, if applicable`
+Homepage: https://rajpurkar.github.io/SQuAD-explorer/
### Citation
```
-BibTeX-formatted citation goes here
+@misc{rajpurkar2018know,
+    title={Know What You Don't Know: Unanswerable Questions for SQuAD},
+    author={Pranav Rajpurkar and Robin Jia and Percy Liang},
+    year={2018},
+    eprint={1806.03822},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
```
-### Subtasks
+### Groups and Tasks
-List or describe tasks defined in this folder, and their names here:
-* `task_name`: `1-sentence description of what this particular task does`
-* `task_name2`: .....
+#### Groups
+* `squadv2_complete`: Runs both `squadv2` and `squadv2_noans_loglikelihood`
+#### Tasks
+* `squadv2`: `Default squadv2 task`
+* `squadv2_noans_loglikelihood`: `Additional task to acquire the probability of the model predicting that there is no answer`
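As a usage sketch (illustrative only: the model below is arbitrary, and the evaluator API on this branch may differ slightly), the tasks above can be run from Python roughly like this:

```python
import lm_eval

# Run the generative SQuAD 2.0 task; swap in "squadv2_noans_loglikelihood" or the
# "squadv2_complete" group to also score the probability of predicting "unanswerable".
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",  # example model; any HF causal LM works
    tasks=["squadv2"],
    num_fewshot=0,
)
print(results["results"]["squadv2"])
```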
### Checklist
......
dataset_path: squad_v2
training_split: train
validation_split: validation
doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
target_delimiter: ""
should_decontaminate: true
doc_to_decontamination_query: context
include: _template_yaml
task: squadv2
dataset_path: squad_v2
output_type: greedy_until
training_split: train
validation_split: validation
doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
target_delimiter: ""
should_decontaminate: true
doc_to_decontamination_query: context
output_type: generate_until
generation_kwargs:
until:
- "\n"
# filter_list:
# - name: remove_whitespace
# filter:
# - function: remove_whitespace
# - function: take_first
metric_list:
- metric: !function utils.exact
aggregation: mean
......
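For reference, `utils.exact` above is a helper defined inside this task folder; the sketch below shows the SQuAD-style exact-match computation such a metric typically performs (names, signature, and details are illustrative, not the repo's actual implementation):

```python
import re
import string


def normalize_answer(s: str) -> str:
    """SQuAD-style normalization: lowercase, drop punctuation and articles, collapse whitespace."""
    s = s.lower()
    s = "".join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r"\b(a|an|the)\b", " ", s)
    return " ".join(s.split())


def exact_match(prediction: str, references: list) -> float:
    """Return 1.0 if the normalized prediction equals any normalized reference, else 0.0."""
    # An empty reference list corresponds to an unanswerable question: the model is
    # only correct if its prediction also normalizes to the empty string.
    refs = references if references else [""]
    return float(any(normalize_answer(prediction) == normalize_answer(r) for r in refs))
```

For example, `exact_match("The Eiffel Tower", ["Eiffel Tower"])` evaluates to `1.0`.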
-include: default.yaml
+include: _template_yaml
task: squadv2_noans_loglikelihood
dataset_path: squad_v2
output_type: loglikelihood
training_split: train
validation_split: validation
doc_to_target: " unanswerable"
metric_list:
- metric: perplexity
@@ -3,7 +3,7 @@ group:
task: "boolq-seq2seq"
dataset_path: super_glue
dataset_name: boolq
-output_type: greedy_until
+output_type: generate_until
training_split: train
validation_split: validation
doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
......
@@ -5,7 +5,7 @@ dataset_path: super_glue
dataset_name: boolq
training_split: train
validation_split: validation
-output_type: greedy_until
+output_type: generate_until
doc_to_text: "boolq passage: {{passage}} question: {{question}}"
doc_to_target: label
doc_to_choice: ['False', 'True']
......
@@ -5,7 +5,7 @@ dataset_path: super_glue
dataset_name: cb
training_split: train
validation_split: validation
-output_type: greedy_until
+output_type: generate_until
doc_to_text: "cb hypothesis: {{hypothesis}} premise: {{premise}}"
doc_to_target: label
doc_to_choice: ['entailment', 'contradiction', 'neutral']
......
@@ -5,7 +5,7 @@ dataset_path: super_glue
dataset_name: copa
training_split: train
validation_split: validation
-output_type: greedy_until
+output_type: generate_until
doc_to_text: "copa choice1: {{choice1}} choice2: {{choice2}} premise: {{premise}} question: {{question}}"
doc_to_target: label
doc_to_choice: ['choice1', 'choice2']
......
@@ -5,7 +5,7 @@ dataset_path: super_glue
dataset_name: multirc
training_split: train
validation_split: validation
-output_type: greedy_until
+output_type: generate_until
doc_to_text: "multirc question: {{question}} answer: {{answer}} paragraph: {{paragraph}}"
doc_to_target: label
doc_to_choice: "{% set group_id = idx.question|string %}{{[group_id+'_False', group_id+'_True']}}"
......
@@ -4,7 +4,7 @@ task: super_glue-record-t5-prompt
dataset_path: super_glue
dataset_name: record
validation_split: validation
-output_type: greedy_until
+output_type: generate_until
process_docs: !function t5_utils.process_docs
doc_to_text: !function t5_utils.doc_to_text
doc_to_target: "{{idx.passage|string}}+{{idx.query}}_{{answers}}"
......
@@ -5,7 +5,7 @@ dataset_path: super_glue
dataset_name: rte
training_split: train
validation_split: validation
-output_type: greedy_until
+output_type: generate_until
doc_to_text: "rte hypothesis: {{hypothesis}} premise: {{premise}}"
doc_to_target: label
doc_to_choice: ['entailment', 'not_entailment']
......
@@ -5,7 +5,7 @@ dataset_path: super_glue
dataset_name: wic
training_split: train
validation_split: validation
-output_type: greedy_until
+output_type: generate_until
doc_to_text: "wic sentence1: {{sentence1}} sentence2: {{sentence2}} word: {{word}}"
doc_to_target: label
doc_to_choice: ['False', 'True']
......
@@ -5,7 +5,7 @@ dataset_path: super_glue
dataset_name: wsc.fixed
training_split: train
validation_split: validation
-output_type: greedy_until
+output_type: generate_until
doc_to_text: !function "t5_utils.doc_to_text"
doc_to_target: label
generation_kwargs:
......
@@ -6,7 +6,7 @@ doc_to_text: 'Arabic phrase: {{translation["ar"]}}
English phrase:'
group:
-  - greedy_until
+  - generate_until
  - translation
  - iwslt2017
include: wmt_common_yaml
......
@@ -6,7 +6,7 @@ doc_to_text: 'English phrase: {{translation["en"]}}
Arabic phrase:'
group:
-  - greedy_until
+  - generate_until
  - translation
  - iwslt2017
include: wmt_common_yaml
......