update

c6403765 · lintangsutawika · ea8b5beb · c6403765 · c6403765
Commit c6403765 authored Sep 21, 2023 by lintangsutawika
Showing with 78 additions and 78 deletions

lm_eval/tasks/benchmarks/flan_held_out.yaml lm_eval/tasks/benchmarks/flan_held_out.yaml +4 -4

lm_eval/tasks/benchmarks/t0_eval.yaml lm_eval/tasks/benchmarks/t0_eval.yaml +74 -74

No files found.
--- a/lm_eval/tasks/benchmarks/flan_held_out.yaml
+++ b/lm_eval/tasks/benchmarks/flan_held_out.yaml
@@ -3,10 +3,10 @@ task:
  # BBH
  - bbh_flan_zeroshot
  - bbh_flan_fewshot
-  - bbh_flan_cot_fewshot
+  # - bbh_flan_cot_fewshot
-  - bbh_flan_cot_zeroshot
+  # - bbh_flan_cot_zeroshot
  # MMLU
  - mmlu_flan_n_shot_generative
  - mmlu_flan_n_shot_loglikelihood
-  - mmlu_flan_cot_zeroshot
+  # - mmlu_flan_cot_zeroshot
-  - mmlu_flan_cot_fewshot
+  # - mmlu_flan_cot_fewshot
--- a/lm_eval/tasks/benchmarks/t0_eval.yaml
+++ b/lm_eval/tasks/benchmarks/t0_eval.yaml
@@ -26,80 +26,80 @@ task:
        higher_is_better: true
        ignore_case: true
        ignore_punctuation: true
-  # # Natural Language Inference
+  # Natural Language Inference
-  # - dataset_path: super_glue
+  - dataset_path: super_glue
-  #   dataset_name: cb
+    dataset_name: cb
-  #   use_prompt: promptsource:*
+    use_prompt: promptsource:*
-  #   training_split: train
+    training_split: train
-  #   validation_split: validation
+    validation_split: validation
-  #   output_type: greedy_until
+    output_type: greedy_until
-  #   metric_list:
+    metric_list:
-  #     - metric: exact_match
+      - metric: exact_match
-  #       aggregation: mean
+        aggregation: mean
-  #       higher_is_better: true
+        higher_is_better: true
-  #       ignore_case: true
+        ignore_case: true
-  #       ignore_punctuation: true
+        ignore_punctuation: true
-  # - dataset_path: super_glue
+  - dataset_path: super_glue
-  #   dataset_name: rte
+    dataset_name: rte
-  #   use_prompt: promptsource:*
+    use_prompt: promptsource:*
-  #   training_split: train
+    training_split: train
-  #   validation_split: validation
+    validation_split: validation
-  #   output_type: greedy_until
+    output_type: greedy_until
-  #   metric_list:
+    metric_list:
-  #     - metric: exact_match
+      - metric: exact_match
-  #       aggregation: mean
+        aggregation: mean
-  #       higher_is_better: true
+        higher_is_better: true
-  #       ignore_case: true
+        ignore_case: true
-  #       ignore_punctuation: true
+        ignore_punctuation: true
-  # - task: anli_r1
+  - task: anli_r1
-  #   dataset_path: anli
+    dataset_path: anli
-  #   use_prompt: promptsource:*
+    use_prompt: promptsource:*
-  #   training_split: train_r1
+    training_split: train_r1
-  #   validation_split: dev_r1
+    validation_split: dev_r1
-  #   output_type: greedy_until
+    output_type: greedy_until
-  #   metric_list:
+    metric_list:
-  #     - metric: exact_match
+      - metric: exact_match
-  #       aggregation: mean
+        aggregation: mean
-  #       higher_is_better: true
+        higher_is_better: true
-  #       ignore_case: true
+        ignore_case: true
-  #       ignore_punctuation: true
+        ignore_punctuation: true
-  # - task: anli_r2
+  - task: anli_r2
-  #   dataset_path: anli
+    dataset_path: anli
-  #   use_prompt: promptsource:*
+    use_prompt: promptsource:*
-  #   training_split: train_r2
+    training_split: train_r2
-  #   validation_split: dev_r2
+    validation_split: dev_r2
-  #   output_type: greedy_until
+    output_type: greedy_until
-  #   metric_list:
+    metric_list:
-  #     - metric: exact_match
+      - metric: exact_match
-  #       aggregation: mean
+        aggregation: mean
-  #       higher_is_better: true
+        higher_is_better: true
-  #       ignore_case: true
+        ignore_case: true
-  #       ignore_punctuation: true
+        ignore_punctuation: true
-  # - task: anli_r3
+  - task: anli_r3
-  #   dataset_path: anli
+    dataset_path: anli
-  #   use_prompt: promptsource:*
+    use_prompt: promptsource:*
-  #   training_split: train_r3
+    training_split: train_r3
-  #   validation_split: dev_r3
+    validation_split: dev_r3
-  #   output_type: greedy_until
+    output_type: greedy_until
-  #   metric_list:
+    metric_list:
-  #     - metric: exact_match
+      - metric: exact_match
-  #       aggregation: mean
+        aggregation: mean
-  #       higher_is_better: true
+        higher_is_better: true
-  #       ignore_case: true
+        ignore_case: true
-  #       ignore_punctuation: true
+        ignore_punctuation: true
-  # # Sentence Completion
+  # Sentence Completion
-  # - dataset_path: super_glue
+  - dataset_path: super_glue
-  #   dataset_name: copa
+    dataset_name: copa
-  #   use_prompt: promptsource:*
+    use_prompt: promptsource:*
-  #   training_split: train
+    training_split: train
-  #   validation_split: validation
+    validation_split: validation
-  #   output_type: greedy_until
+    output_type: greedy_until
-  #   metric_list:
+    metric_list:
-  #     - metric: exact_match
+      - metric: exact_match
-  #       aggregation: mean
+        aggregation: mean
-  #       higher_is_better: true
+        higher_is_better: true
-  #       ignore_case: true
+        ignore_case: true
-  #       ignore_punctuation: true
+        ignore_punctuation: true
  # Sentence Completion
  - dataset_path: hellaswag
    use_prompt: promptsource:*