Merge branch 'group-agg-rework' of...

Merge branch 'group-agg-rework' of https://github.com/EleutherAI/lm-evaluation-harness into multiprompt

Merge branch 'group-agg-rework' of...
Merge branch 'group-agg-rework' of https://github.com/EleutherAI/lm-evaluation-harness into multiprompt
88486e57 · lintangsutawika · 5971f2ca · ba73d131 · 88486e57 · 88486e57
Commit 88486e57 authored Jul 05, 2024 by lintangsutawika
20 changed files
--- a/lm_eval/tasks/paws-x/_pawsx.yaml
+++ b/lm_eval/tasks/paws-x/_pawsx.yaml
+group: pawsx
+task:
+  - paws_en
+  - paws_de
+  - paws_es
+  - paws_fr
+  - paws_ja
+  - paws_ko
+  - paws_zh
+aggregate_metric_list:
+  - metric: acc
+    aggregation: mean
+    weight_by_size: true
+metadata:
+  version: 0.0
--- a/lm_eval/tasks/paws-x/pawsx_template_yaml
+++ b/lm_eval/tasks/paws-x/pawsx_template_yaml
 # This file will be included in the generated language-specific task configs.
 # It doesn't have a yaml file extension as it is not meant to be imported directly
 # by the harness.
-group: pawsx
 task: null
 dataset_path: paws-x
 dataset_name: null

--- a/lm_eval/tasks/pile/pile_arxiv.yaml
+++ b/lm_eval/tasks/pile/pile_arxiv.yaml
-group:
-  - pile
 task: pile_arxiv
 dataset_path: EleutherAI/pile
 dataset_name: pile_arxiv

--- a/lm_eval/tasks/piqa/piqa.yaml
+++ b/lm_eval/tasks/piqa/piqa.yaml
@@ -19,3 +19,5 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
+dataset_kwargs:
+  trust_remote_code: true
--- a/lm_eval/tasks/polemo2/polemo2_in.yaml
+++ b/lm_eval/tasks/polemo2/polemo2_in.yaml
-group:
+tag:
  - polemo2
 task: polemo2_in
 dataset_path: allegro/klej-polemo2-in

--- a/lm_eval/tasks/qa4mre/qa4mre_2011.yaml
+++ b/lm_eval/tasks/qa4mre/qa4mre_2011.yaml
-group:
+tag:
  - qa4mre
 task: qa4mre_2011
 dataset_path: qa4mre

--- a/lm_eval/tasks/qasper/bool.yaml
+++ b/lm_eval/tasks/qasper/bool.yaml
-group: qasper
+tag: qasper
 task: qasper_bool
 dataset_path: allenai/qasper
 output_type: multiple_choice

--- a/lm_eval/tasks/qasper/freeform.yaml
+++ b/lm_eval/tasks/qasper/freeform.yaml
-group: qasper
+tag: qasper
 task: qasper_freeform
 dataset_path: allenai/qasper
 output_type: generate_until

--- a/lm_eval/tasks/scrolls/task.py
+++ b/lm_eval/tasks/scrolls/task.py
@@ -4,12 +4,12 @@ from functools import reduce
 import numpy as np
 import transformers.data.metrics.squad_metrics as squad_metrics
-from datasets import load_metric
+from datasets import Dataset, load_metric
 from transformers import AutoTokenizer
 from lm_eval.api.instance import Instance
 from lm_eval.api.metrics import mean
-from lm_eval.api.task import Task
+from lm_eval.api.task import ConfigurableTask
 _CITATION = """
@@ -108,7 +108,7 @@ def _num_cpu_cores():
        return len(os.sched_getaffinity(0))
-class _SCROLLSTask(Task):
+class _SCROLLSTask(ConfigurableTask):
    VERSION = 2
    DATASET_PATH = "tau/scrolls"
    DATASET_NAME = None
@@ -117,7 +117,7 @@ class _SCROLLSTask(Task):
    PRUNE_NUM_PROC = None
    def __init__(self):
-        super().__init__()
+        super().__init__(config={"metadata": {"version": self.VERSION}})
        if self.DATASET_NAME is not None:
            self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
@@ -131,12 +131,26 @@ class _SCROLLSTask(Task):
        return False
    def training_docs(self):
-        for doc in self.dataset["train"]:
+        processed_docs = list(map(self._process_doc, self.dataset["train"]))
-            yield from self._process_doc(doc)
+        # Flatten the list of lists since _process_doc returns a list of one element.
+        processed_docs = [item for sublist in processed_docs for item in sublist]
+        processed_dict = {
+            key: [d[key] for d in processed_docs] for key in processed_docs[0]
+        }
+        return Dataset.from_dict(processed_dict)
    def validation_docs(self):
-        for doc in self.dataset["validation"]:
+        processed_docs = list(map(self._process_doc, self.dataset["validation"]))
-            yield from self._process_doc(doc)
+        # Flatten the list of lists since _process_doc returns a list of one element.
+        processed_docs = [item for sublist in processed_docs for item in sublist]
+        processed_dict = {
+            key: [d[key] for d in processed_docs] for key in processed_docs[0]
+        }
+        return Dataset.from_dict(processed_dict)
    def should_decontaminate(self):
        return True

--- a/lm_eval/tasks/siqa/siqa.yaml
+++ b/lm_eval/tasks/siqa/siqa.yaml
@@ -6,10 +6,7 @@ training_split: train
 validation_split: validation
 doc_to_text: "Q: {{context}} {{question}}\nA:"
 target_delimiter: " "
-doc_to_choice:
+doc_to_choice: "{{[answerA, answerB, answerC]}}"
-  - "{{answerA}}"
-  - "{{answerB}}"
-  - "{{answerC}}"
 doc_to_target: "{{ (label|int) - 1 }}"
 metric_list:
  - metric: acc

--- a/lm_eval/tasks/squad_completion/task.py
+++ b/lm_eval/tasks/squad_completion/task.py
-"""
-"""
 import re
 from typing import List
@@ -14,7 +12,7 @@ class SQUADCompletion(ConfigurableTask):
    DATASET_PATH = "hazyresearch/based-squad"
    DATASET_NAME = "default"
-    def __init__(self):
+    def __init__(self, **kwargs):
        super().__init__(config={"metadata": {"version": self.VERSION}})
    def has_training_docs(self):

--- a/lm_eval/tasks/squadv2/task.py
+++ b/lm_eval/tasks/squadv2/task.py
@@ -13,6 +13,7 @@ also determine when no answer is supported by the paragraph and abstain from ans
 Homepage: https://rajpurkar.github.io/SQuAD-explorer/
 """
 from functools import partial
 from math import exp

--- a/lm_eval/tasks/storycloze/storycloze_2016.yaml
+++ b/lm_eval/tasks/storycloze/storycloze_2016.yaml
-group: storycloze
+tag: storycloze
 task: storycloze_2016
 dataset_path: story_cloze
 dataset_name: 2016

--- a/lm_eval/tasks/storycloze/storycloze_2018.yaml
+++ b/lm_eval/tasks/storycloze/storycloze_2018.yaml
-group: storycloze
+tag: storycloze
 task: storycloze_2018
 dataset_path: story_cloze
 dataset_name: 2018

--- a/lm_eval/tasks/super_glue/README.md
+++ b/lm_eval/tasks/super_glue/README.md
@@ -26,10 +26,14 @@ Homepage: https://super.gluebenchmark.com/
 }
 ```
-### Groups and Tasks
+### Groups, Tags, and Tasks
 #### Groups
+None.
+#### Tags
 * `super-glue-lm-eval-v1`: SuperGLUE eval adapted from LM Eval V1
 * `super-glue-t5-prompt`: SuperGLUE prompt and evaluation that matches the T5 paper (if using accelerate, will error if record is included.)

--- a/lm_eval/tasks/swde/task.py
+++ b/lm_eval/tasks/swde/task.py
@@ -12,7 +12,7 @@ class SWDE(ConfigurableTask):
    DATASET_PATH = "hazyresearch/based-swde-v2"
    DATASET_NAME = "default"
-    def __init__(self):
+    def __init__(self, **kwargs):
        super().__init__(config={"metadata": {"version": self.VERSION}})
    def has_training_docs(self):

--- a/lm_eval/tasks/tinyBenchmarks/utils_winogrande.py
+++ b/lm_eval/tasks/tinyBenchmarks/utils_winogrande.py
-""" This code mirrors the utils of the original winogrande task """
+"""This code mirrors the utils of the original winogrande task"""
 def doc_to_text(doc):

--- a/lm_eval/tasks/tmmluplus/default/_generate_configs.py
+++ b/lm_eval/tasks/tmmluplus/default/_generate_configs.py
 """
 Take in a YAML, and output all "other" splits with this YAML
 """
 import argparse
 import os

--- a/lm_eval/tasks/translation/iwslt2017_ar-en.yaml
+++ b/lm_eval/tasks/translation/iwslt2017_ar-en.yaml
@@ -5,8 +5,7 @@ doc_to_target: ' {{translation["en"]}}'
 doc_to_text: 'Arabic phrase: {{translation["ar"]}}
  English phrase:'
-group:
+tag:
-- generate_until
 - translation
 - iwslt2017
 include: wmt_common_yaml

--- a/lm_eval/tasks/translation/iwslt2017_en-ar.yaml
+++ b/lm_eval/tasks/translation/iwslt2017_en-ar.yaml
@@ -5,8 +5,7 @@ doc_to_target: ' {{translation["ar"]}}'
 doc_to_text: 'English phrase: {{translation["en"]}}
  Arabic phrase:'
-group:
+tag:
-- generate_until
 - translation
 - iwslt2017
 include: wmt_common_yaml