modifications for current evals on t5v2

f7b81bd4 · lintangsutawika · 032e879b · f7b81bd4 · f7b81bd4 · f7b81bd4
Commit f7b81bd4 authored Jan 17, 2024 by lintangsutawika
13 changed files
--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
@@ -399,7 +399,7 @@ def evaluate(
            if type(items[0]) == tuple:
                numitem = len(items[0])
-            if isinstance(items[0], (str, list)):
+            if isinstance(items[0], (str, list, tuple)):
                # handle the string case
                gathered_items = [None] * lm.accelerator.num_processes
                torch.distributed.all_gather_object(gathered_items, items)
@@ -492,6 +492,8 @@ def evaluate(
                        ]:
                            stderr = "_stderr,".join(metric.split(","))
                            stderr_score = results[task][stderr]
+                            if isinstance(stderr_score, str):
+                                stderr_score = 0
                            var_score = stderr_score**2
                            metric_score = results[task][metric]

--- a/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml
+++ b/lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml
@@ -17,6 +17,7 @@ generation_kwargs:
    - "</s>"
    - "Q"
    - "\n\n"
+    - "<0x0A>"
  do_sample: false
  temperature: 0.0
 filter_list:

--- a/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml
+++ b/lm_eval/tasks/bbh/cot_zeroshot/_cot_zeroshot_template_yaml
@@ -14,6 +14,7 @@ generation_kwargs:
    - "</s>"
    - "Q"
    - "\n\n"
+    - "<0x0A>"
  do_sample: false
  temperature: 0.0
 filter_list:

--- a/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml
+++ b/lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml
@@ -14,6 +14,7 @@ generation_kwargs:
    - "</s>"
    - "Q"
    - "\n\n"
+    - "<0x0A>"
  do_sample: false
  temperature: 0.0
 num_fewshot: 0

--- a/lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml
+++ b/lm_eval/tasks/bbh/zeroshot/_zeroshot_template_yaml
@@ -14,6 +14,7 @@ generation_kwargs:
    - "</s>"
    - "Q:"
    - "\n\n"
+    - "<0x0A>"
  do_sample: false
  temperature: 0.0
 num_fewshot: 0

--- a/lm_eval/tasks/benchmarks/flan/flan_anli.yaml
+++ b/lm_eval/tasks/benchmarks/flan/flan_anli.yaml
 group: flan_anli
 task:
  - include: yaml_templates/held_in_template_yaml
-    task: anli_r1
+    task: r1
    dataset_path: anli
    use_prompt: prompt_templates/anli.yaml:*
    validation_split: dev_r1
  - include: yaml_templates/held_in_template_yaml
-    task: anli_r2
+    task: r2
    dataset_path: anli
    use_prompt: prompt_templates/anli.yaml:*
    validation_split: dev_r2
  - include: yaml_templates/held_in_template_yaml
-    task: anli_r3
+    task: r3
    dataset_path: anli
    use_prompt: prompt_templates/anli.yaml:*
    validation_split: dev_r3
--- a/lm_eval/tasks/benchmarks/flan/flan_arc.yaml
+++ b/lm_eval/tasks/benchmarks/flan/flan_arc.yaml
 group: flan_arc
 task:
  - include: yaml_templates/held_in_template_yaml
-    task: arc_easy
    dataset_path: ai2_arc
    dataset_name: ARC-Easy
    use_prompt: prompt_templates/arc.yaml:*
    validation_split: validation
  - include: yaml_templates/held_in_template_yaml
-    task: arc_challenge
    dataset_path: ai2_arc
    dataset_name: ARC-Challenge
    use_prompt: prompt_templates/arc.yaml:*

--- a/lm_eval/tasks/benchmarks/flan/flan_boolq.yaml
+++ b/lm_eval/tasks/benchmarks/flan/flan_boolq.yaml
--- a/lm_eval/tasks/benchmarks/flan/flan_rte.yaml
+++ b/lm_eval/tasks/benchmarks/flan/flan_rte.yaml
--- a/lm_eval/tasks/benchmarks/flan/flan_held_in.yaml
+++ b/lm_eval/tasks/benchmarks/flan/flan_held_in.yaml
 group: flan_held_in
 task:
-  - flan_boolq
+  - include: yaml_templates/held_in_template_yaml
-  - flan_rte
+    task: r1
-  - flan_anli
+    dataset_path: anli
-  - flan_arc
+    use_prompt: prompt_templates/anli.yaml:*
+    validation_split: dev_r1
+  - include: yaml_templates/held_in_template_yaml
+    task: r2
+    dataset_path: anli
+    use_prompt: prompt_templates/anli.yaml:*
+    validation_split: dev_r2
+  - include: yaml_templates/held_in_template_yaml
+    task: r3
+    dataset_path: anli
+    use_prompt: prompt_templates/anli.yaml:*
+    validation_split: dev_r3
+  - include: yaml_templates/held_in_template_yaml
+    dataset_path: ai2_arc
+    dataset_name: ARC-Easy
+    use_prompt: prompt_templates/arc.yaml:*
+    validation_split: validation
+  - include: yaml_templates/held_in_template_yaml
+    dataset_path: ai2_arc
+    dataset_name: ARC-Challenge
+    use_prompt: prompt_templates/arc.yaml:*
+    validation_split: validation
+  - include: yaml_templates/held_in_template_yaml
+    dataset_path: super_glue
+    dataset_name: boolq
+    use_prompt: prompt_templates/boolq.yaml:*
+    validation_split: validation
+  - include: yaml_templates/held_in_template_yaml
+    dataset_path: super_glue
+    dataset_name: rte
+    use_prompt: prompt_templates/rte.yaml:*
+    validation_split: validation
--- a/lm_eval/tasks/benchmarks/flan/flan_held_in_yaml
+++ b/lm_eval/tasks/benchmarks/flan/flan_held_in_yaml
-group: flan_held_in
-task:
-  - include: flan/yaml_templates/held_in_template_yaml
-    dataset_path: super_glue
-    dataset_name: boolq
-    use_prompt: flan/prompt_templates/boolq.yaml:*
-    validation_split: validation
-  - include: flan/yaml_templates/held_in_template_yaml
-    dataset_path: super_glue
-    dataset_name: rte
-    use_prompt: flan/prompt_templates/rte.yaml:*
-    validation_split: validation
-  - include: flan/yaml_templates/held_in_template_yaml
-    task: anli_r1
-    dataset_path: anli
-    use_prompt: flan/prompt_templates/anli.yaml:*
-    validation_split: dev_r1
-  - include: flan/yaml_templates/held_in_template_yaml
-    task: anli_r2
-    dataset_path: anli
-    use_prompt: flan/prompt_templates/anli.yaml:*
-    validation_split: dev_r2
-  - include: flan/yaml_templates/held_in_template_yaml
-    task: anli_r3
-    dataset_path: anli
-    use_prompt: flan/prompt_templates/anli.yaml:*
-    validation_split: dev_r3
-  - include: flan/yaml_templates/held_in_template_yaml
-    task: arc_easy
-    dataset_path: ai2_arc
-    dataset_name: ARC-Easy
-    use_prompt: flan/prompt_templates/arc.yaml:*
-    validation_split: validation
-  - include: flan/yaml_templates/held_in_template_yaml
-    task: arc_challenge
-    dataset_path: ai2_arc
-    dataset_name: ARC-Challenge
-    use_prompt: flan/prompt_templates/arc.yaml:*
-    validation_split: validation
--- a/lm_eval/tasks/benchmarks/flan/flan_held_out.yaml
+++ b/lm_eval/tasks/benchmarks/flan/flan_held_out.yaml
 group: flan_held_out
 task:
  # BBH
-  - bbh_flan_zeroshot
+  - bbh_zeroshot
-  - bbh_flan_fewshot
+  - bbh_fewshot
-  - bbh_flan_cot_fewshot
+  - bbh_cot_fewshot
-  - bbh_flan_cot_zeroshot
+  - bbh_cot_zeroshot
  # MMLU
  - mmlu
  - mmlu_flan_n_shot_generative

--- a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
+++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml
@@ -8,6 +8,7 @@ doc_to_target: "{{['(A)', '(B)', '(C)', '(D)'][answer]}}"
 generation_kwargs:
  until:
    - "</s>"
+    - "<0x0A>"
 metric_list:
  - metric: exact_match
    aggregation: mean