Commit 88486e57 authored by lintangsutawika's avatar lintangsutawika
Browse files

Merge branch 'group-agg-rework' of...

Merge branch 'group-agg-rework' of https://github.com/EleutherAI/lm-evaluation-harness into multiprompt
parents 5971f2ca ba73d131
group: pawsx
task:
- paws_en
- paws_de
- paws_es
- paws_fr
- paws_ja
- paws_ko
- paws_zh
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: true
metadata:
version: 0.0
# This file will be included in the generated language-specific task configs. # This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly # It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness. # by the harness.
group: pawsx
task: null task: null
dataset_path: paws-x dataset_path: paws-x
dataset_name: null dataset_name: null
......
group:
- pile
task: pile_arxiv task: pile_arxiv
dataset_path: EleutherAI/pile dataset_path: EleutherAI/pile
dataset_name: pile_arxiv dataset_name: pile_arxiv
......
...@@ -19,3 +19,5 @@ metric_list: ...@@ -19,3 +19,5 @@ metric_list:
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
group: tag:
- polemo2 - polemo2
task: polemo2_in task: polemo2_in
dataset_path: allegro/klej-polemo2-in dataset_path: allegro/klej-polemo2-in
......
group: tag:
- qa4mre - qa4mre
task: qa4mre_2011 task: qa4mre_2011
dataset_path: qa4mre dataset_path: qa4mre
......
group: qasper tag: qasper
task: qasper_bool task: qasper_bool
dataset_path: allenai/qasper dataset_path: allenai/qasper
output_type: multiple_choice output_type: multiple_choice
......
group: qasper tag: qasper
task: qasper_freeform task: qasper_freeform
dataset_path: allenai/qasper dataset_path: allenai/qasper
output_type: generate_until output_type: generate_until
......
...@@ -4,12 +4,12 @@ from functools import reduce ...@@ -4,12 +4,12 @@ from functools import reduce
import numpy as np import numpy as np
import transformers.data.metrics.squad_metrics as squad_metrics import transformers.data.metrics.squad_metrics as squad_metrics
from datasets import load_metric from datasets import Dataset, load_metric
from transformers import AutoTokenizer from transformers import AutoTokenizer
from lm_eval.api.instance import Instance from lm_eval.api.instance import Instance
from lm_eval.api.metrics import mean from lm_eval.api.metrics import mean
from lm_eval.api.task import Task from lm_eval.api.task import ConfigurableTask
_CITATION = """ _CITATION = """
...@@ -108,7 +108,7 @@ def _num_cpu_cores(): ...@@ -108,7 +108,7 @@ def _num_cpu_cores():
return len(os.sched_getaffinity(0)) return len(os.sched_getaffinity(0))
class _SCROLLSTask(Task): class _SCROLLSTask(ConfigurableTask):
VERSION = 2 VERSION = 2
DATASET_PATH = "tau/scrolls" DATASET_PATH = "tau/scrolls"
DATASET_NAME = None DATASET_NAME = None
...@@ -117,7 +117,7 @@ class _SCROLLSTask(Task): ...@@ -117,7 +117,7 @@ class _SCROLLSTask(Task):
PRUNE_NUM_PROC = None PRUNE_NUM_PROC = None
def __init__(self): def __init__(self):
super().__init__() super().__init__(config={"metadata": {"version": self.VERSION}})
if self.DATASET_NAME is not None: if self.DATASET_NAME is not None:
self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME) self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
...@@ -131,12 +131,26 @@ class _SCROLLSTask(Task): ...@@ -131,12 +131,26 @@ class _SCROLLSTask(Task):
return False return False
def training_docs(self): def training_docs(self):
for doc in self.dataset["train"]: processed_docs = list(map(self._process_doc, self.dataset["train"]))
yield from self._process_doc(doc)
# Flatten the list of lists since _process_doc returns a list of one element.
processed_docs = [item for sublist in processed_docs for item in sublist]
processed_dict = {
key: [d[key] for d in processed_docs] for key in processed_docs[0]
}
return Dataset.from_dict(processed_dict)
def validation_docs(self): def validation_docs(self):
for doc in self.dataset["validation"]: processed_docs = list(map(self._process_doc, self.dataset["validation"]))
yield from self._process_doc(doc)
# Flatten the list of lists since _process_doc returns a list of one element.
processed_docs = [item for sublist in processed_docs for item in sublist]
processed_dict = {
key: [d[key] for d in processed_docs] for key in processed_docs[0]
}
return Dataset.from_dict(processed_dict)
def should_decontaminate(self): def should_decontaminate(self):
return True return True
......
...@@ -6,10 +6,7 @@ training_split: train ...@@ -6,10 +6,7 @@ training_split: train
validation_split: validation validation_split: validation
doc_to_text: "Q: {{context}} {{question}}\nA:" doc_to_text: "Q: {{context}} {{question}}\nA:"
target_delimiter: " " target_delimiter: " "
doc_to_choice: doc_to_choice: "{{[answerA, answerB, answerC]}}"
- "{{answerA}}"
- "{{answerB}}"
- "{{answerC}}"
doc_to_target: "{{ (label|int) - 1 }}" doc_to_target: "{{ (label|int) - 1 }}"
metric_list: metric_list:
- metric: acc - metric: acc
......
"""
"""
import re import re
from typing import List from typing import List
...@@ -14,7 +12,7 @@ class SQUADCompletion(ConfigurableTask): ...@@ -14,7 +12,7 @@ class SQUADCompletion(ConfigurableTask):
DATASET_PATH = "hazyresearch/based-squad" DATASET_PATH = "hazyresearch/based-squad"
DATASET_NAME = "default" DATASET_NAME = "default"
def __init__(self): def __init__(self, **kwargs):
super().__init__(config={"metadata": {"version": self.VERSION}}) super().__init__(config={"metadata": {"version": self.VERSION}})
def has_training_docs(self): def has_training_docs(self):
......
...@@ -13,6 +13,7 @@ also determine when no answer is supported by the paragraph and abstain from ans ...@@ -13,6 +13,7 @@ also determine when no answer is supported by the paragraph and abstain from ans
Homepage: https://rajpurkar.github.io/SQuAD-explorer/ Homepage: https://rajpurkar.github.io/SQuAD-explorer/
""" """
from functools import partial from functools import partial
from math import exp from math import exp
......
group: storycloze tag: storycloze
task: storycloze_2016 task: storycloze_2016
dataset_path: story_cloze dataset_path: story_cloze
dataset_name: 2016 dataset_name: 2016
......
group: storycloze tag: storycloze
task: storycloze_2018 task: storycloze_2018
dataset_path: story_cloze dataset_path: story_cloze
dataset_name: 2018 dataset_name: 2018
......
...@@ -26,10 +26,14 @@ Homepage: https://super.gluebenchmark.com/ ...@@ -26,10 +26,14 @@ Homepage: https://super.gluebenchmark.com/
} }
``` ```
### Groups and Tasks ### Groups, Tags, and Tasks
#### Groups #### Groups
None.
#### Tags
* `super-glue-lm-eval-v1`: SuperGLUE eval adapted from LM Eval V1 * `super-glue-lm-eval-v1`: SuperGLUE eval adapted from LM Eval V1
* `super-glue-t5-prompt`: SuperGLUE prompt and evaluation that matches the T5 paper (if using accelerate, will error if record is included.) * `super-glue-t5-prompt`: SuperGLUE prompt and evaluation that matches the T5 paper (if using accelerate, will error if record is included.)
......
...@@ -12,7 +12,7 @@ class SWDE(ConfigurableTask): ...@@ -12,7 +12,7 @@ class SWDE(ConfigurableTask):
DATASET_PATH = "hazyresearch/based-swde-v2" DATASET_PATH = "hazyresearch/based-swde-v2"
DATASET_NAME = "default" DATASET_NAME = "default"
def __init__(self): def __init__(self, **kwargs):
super().__init__(config={"metadata": {"version": self.VERSION}}) super().__init__(config={"metadata": {"version": self.VERSION}})
def has_training_docs(self): def has_training_docs(self):
......
""" This code mirrors the utils of the original winogrande task """ """This code mirrors the utils of the original winogrande task"""
def doc_to_text(doc): def doc_to_text(doc):
......
""" """
Take in a YAML, and output all "other" splits with this YAML Take in a YAML, and output all "other" splits with this YAML
""" """
import argparse import argparse
import os import os
......
...@@ -5,8 +5,7 @@ doc_to_target: ' {{translation["en"]}}' ...@@ -5,8 +5,7 @@ doc_to_target: ' {{translation["en"]}}'
doc_to_text: 'Arabic phrase: {{translation["ar"]}} doc_to_text: 'Arabic phrase: {{translation["ar"]}}
English phrase:' English phrase:'
group: tag:
- generate_until
- translation - translation
- iwslt2017 - iwslt2017
include: wmt_common_yaml include: wmt_common_yaml
......
...@@ -5,8 +5,7 @@ doc_to_target: ' {{translation["ar"]}}' ...@@ -5,8 +5,7 @@ doc_to_target: ' {{translation["ar"]}}'
doc_to_text: 'English phrase: {{translation["en"]}} doc_to_text: 'English phrase: {{translation["en"]}}
Arabic phrase:' Arabic phrase:'
group: tag:
- generate_until
- translation - translation
- iwslt2017 - iwslt2017
include: wmt_common_yaml include: wmt_common_yaml
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment