Unverified commit 269d3683 authored by Leo Gao, committed by GitHub

Merge branch 'master' into webqs

parents 34eb121f a1a4a32e
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Python application

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v2
    - name: Cache
      uses: actions/cache@v2.1.3
      with:
        # A list of files, directories, and wildcard patterns to cache and restore
        path: |
          data
          ~/.cache
        # An explicit key for restoring and saving the cache
        key: evaldata-cache
    - name: Set up Python 3.9
      uses: actions/setup-python@v2
      with:
        python-version: 3.9
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install flake8 pytest pytest-cov
        pip install -e .
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
        pytest --cov=lm_eval/ tests/
    - name: Upload to codecov
      run: |
        bash <(curl -s https://codecov.io/bash)
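For debugging failures before pushing, the test step can be reproduced locally. A minimal sketch, assuming pytest and pytest-cov are installed as in the install step above:

```python
# Run the same command as the "Test with pytest" step, from Python.
# Equivalent to: pytest --cov=lm_eval/ tests/
import sys

import pytest

exit_code = pytest.main(["--cov=lm_eval/", "tests/"])
sys.exit(exit_code)
```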
# Evaluation Harness for Large Language Models
![](https://github.com/EleutherAI/lm-evaluation-harness/workflows/Python%20application/badge.svg)
[![codecov](https://codecov.io/gh/EleutherAI/lm-evaluation-harness/branch/master/graph/badge.svg?token=JSG3O2427J)](https://codecov.io/gh/EleutherAI/lm-evaluation-harness)
## Overview
The goal of this project is to build a set of tools for evaluating LMs on typical NLU tasks, based on the evaluation of GPT-3 described in https://arxiv.org/pdf/2005.14165.pdf. Following that description, this repo should support 3 functions:
......
@@ -58,10 +58,10 @@ class LM(abc.ABC):
         return cls()
 
 
-class Dataset(abc.ABC):
+class Task(abc.ABC):
     def __init__(self):
         self.download()
-        self._traindocs = None
+        self._training_docs = None
 
     def download(self):
         """Downloads the task dataset if necessary"""
@@ -71,7 +71,7 @@ class Dataset(abc.ABC):
     def has_training_docs(self):
         """Whether the task has a training set"""
         pass
-
+
     @abc.abstractmethod
     def has_validation_docs(self):
         """Whether the task has a validation set"""
@@ -84,23 +84,29 @@ class Dataset(abc.ABC):
     def training_docs(self):
         """
         :return: Iterable[obj]
             An iterable of any object that doc_to_text can handle
         """
         return []
 
     def validation_docs(self):
         """
         :return: Iterable[obj]
             An iterable of any object that doc_to_text can handle
         """
         return []
 
     def test_docs(self):
         """
         :return: Iterable[obj]
             An iterable of any object that doc_to_text can handle
         """
         return []
 
-    def fewshot_examples(self, k):
-        if self._traindocs is None:
-            self._traindocs = list(self.training_docs())
-        return random.sample(self._traindocs, k)
+    def fewshot_examples(self, k):
+        if self._training_docs is None:
+            self._training_docs = list(self.training_docs())
+
+        return random.sample(self._training_docs, k)
 
     @abc.abstractmethod
     def doc_to_text(self, doc):
@@ -123,7 +129,7 @@ class Dataset(abc.ABC):
         part of the document for `doc`.
         """
         pass
-
+
     @abc.abstractmethod
     def process_results(self, doc, results):
         """Take a single document and the LM results and evaluate, returning a
@@ -161,7 +167,7 @@ class Dataset(abc.ABC):
     def fewshot_context(self, doc, num_fewshot, provide_description):
         raw_description = self.fewshot_description()
         description = (raw_description + "\n===\n\n") if provide_description and raw_description else ""
-
+
         if num_fewshot == 0:
             labeled_examples = ""
         else:
......
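The hunks above rename `Dataset` to `Task` and `_traindocs` to `_training_docs`. The heart of `fewshot_examples` is to materialize the training docs once, cache them, and sample `k` of them. A self-contained sketch mirroring just that part of the interface (simplified; the real base class has more abstract methods, such as `doc_to_text` and `process_results`):

```python
import abc
import random


class Task(abc.ABC):
    """Simplified mirror of the Task base class shown in the diff above."""

    def __init__(self):
        self.download()
        self._training_docs = None

    def download(self):
        """Downloads the task dataset if necessary"""
        pass

    @abc.abstractmethod
    def training_docs(self):
        """An iterable of any object that doc_to_text can handle."""

    def fewshot_examples(self, k):
        # Materialize and cache the training docs once, then sample k of them.
        if self._training_docs is None:
            self._training_docs = list(self.training_docs())

        return random.sample(self._training_docs, k)


class ToyTask(Task):
    """Hypothetical task, for illustration only."""

    def training_docs(self):
        return [{"question": f"item {i}"} for i in range(10)]


print(ToyTask().fewshot_examples(k=3))
```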
@@ -20,4 +20,4 @@ class DummyLM(LM):
 
     def greedy_until(self, requests):
         # TODO: implement
-        pass
\ No newline at end of file
+        pass
@@ -43,4 +43,4 @@ class GPT2LM(LM):
 
     def greedy_until(self, requests):
         # TODO: implement
-        pass
\ No newline at end of file
+        pass
@@ -46,7 +46,7 @@ TASK_REGISTRY = {
     "lambada": lambada.LAMBADA,
     "piqa": piqa.PiQA,
-    "triviaqa": triviaqa.TriviaQA,
+    #"triviaqa": triviaqa.TriviaQA,
     # "arc_easy": arc.ARCEasy, # not implemented yet
     # "arc_challenge": arc.ARCChallenge, # not implemented yet
     # "quac": quac.QuAC, # not implemented yet
......
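The registry above maps task names to task classes, so an evaluator can resolve a name like `"lambada"` to a class and instantiate it; commenting out the `triviaqa` entry removes it from lookup. A sketch of such a lookup (the `get_task` helper is illustrative, not necessarily the repo's own API):

```python
from lm_eval import tasks  # package holding TASK_REGISTRY, per the hunk above


def get_task(task_name):
    # Illustrative helper: raises KeyError for names not in the registry,
    # e.g. "triviaqa" after this merge comments it out.
    return tasks.TASK_REGISTRY[task_name]()


lambada = get_task("lambada")
```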
@@ -70,4 +70,4 @@ class ARCEasy(HFTask):
 
 class ARCChallenge(ARCEasy):
     DATASET_PATH = "ai2_arc"
-    DATASET_NAME = "ARC-Challenge"
\ No newline at end of file
+    DATASET_NAME = "ARC-Challenge"
@@ -2,12 +2,12 @@ import abc
 import json
 import os
 from collections import namedtuple
-from lm_eval.base import Dataset, mean, rf
+from lm_eval.base import Task, mean, rf
 from best_download import download_file
 
 ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])
 
-class Arithmetic(Dataset):
+class Arithmetic(Task):
     directory = 'data/arithmetic/'
 
     def __init__(self):
......
 import datasets
 import numpy as np
 import random
 
-from ..base import Dataset
+from ..base import Task
 
 
-class HFTask(Dataset):
+class HFTask(Task):
     DATASET_PATH = None
     DATASET_NAME = None
 
     def __init__(self):
         self.data = None
         super().__init__()
+        self._training_docs = None
 
     def download(self):
         self.data = datasets.load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME)
......
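Since `HFTask.download` delegates to `datasets.load_dataset`, a concrete Hugging Face task often reduces to two class attributes, as the `ARCChallenge` hunk earlier shows. A hedged sketch using a dataset that exists on the Hub but is not part of this merge:

```python
from lm_eval.tasks.common import HFTask  # module path assumed from the relative imports above


class BoolQ(HFTask):
    # Instantiating this would call datasets.load_dataset(path="super_glue",
    # name="boolq") via HFTask.download. Illustrative only; not a task this
    # merge adds.
    DATASET_PATH = "super_glue"
    DATASET_NAME = "boolq"
```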
@@ -2,11 +2,11 @@
 import json
 import random
 
-from lm_eval.base import Dataset
+from lm_eval.base import Task
 from ..utils import sh
 
-class CoQA(Dataset):
+class CoQA(Task):
     def __init__(self):
         self.download()
 
     def download(self):
......
@@ -5,9 +5,9 @@ from sklearn.metrics import f1_score, matthews_corrcoef
 from tqdm import auto as tqdm_lib
 from . common import HFTask, simple_accuracy_metric, yesno
 from pathlib import Path
-from ..base import Dataset
+from ..base import Task
 
-class DROP(Dataset):
+class DROP(Task):
     DATAFOLDER = Path(__file__).parent / "../../data/drop"
 
     def __init__(self):
@@ -104,4 +104,4 @@ class DROP(Dataset):
         whether a higher value of the submetric is better
         """
         # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
+        raise NotImplementedError('Evaluation not implemented')
-from lm_eval.base import Dataset, rf, mean
+from lm_eval.base import Task, rf, mean
 from lm_eval.utils import sh
 import json
 import math
 from best_download import download_file
 
-class LAMBADA(Dataset):
+class LAMBADA(Task):
     def download(self):
         sh("mkdir -p data/lambada")
         download_file(
@@ -67,4 +67,4 @@ class LAMBADA(Dataset):
         return {
             'perplexity': False,
             'accuracy': True
-        }
\ No newline at end of file
+        }
 from . common import HFTask
+from itertools import islice
+import random
 
 
 class NaturalQs(HFTask):
+    # TODO: naturalqs has a *really* large train set that huggingface just
+    # automatically downloads even if you don't use it. we should try and only
+    # download the val set and not even bother with the train set.
     DATASET_PATH = "natural_questions"
     DATASET_NAME = None
@@ -25,10 +30,10 @@ class NaturalQs(HFTask):
     def fewshot_examples(self, k):
         # Data is too large to fit in memory. We just sample from the first bit.
-        if self._traindocs is None:
-            self._traindocs = list(islice(self.training_docs(), 0, 100000))
+        if self._training_docs is None:
+            self._training_docs = list(islice(self.training_docs(), 0, 100000))
 
-        return random.sample(self._traindocs, k)
+        return random.sample(self._training_docs, k)
 
     def doc_to_text(self, doc):
         return 'Q: ' + doc['question']['text'] + '\n\n' + 'A: '
@@ -87,4 +92,4 @@ class NaturalQs(HFTask):
         whether a higher value of the submetric is better
         """
         # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
+        raise NotImplementedError('Evaluation not implemented')
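`NaturalQs` overrides `fewshot_examples` because its train split is too large to materialize in full; only the first 100,000 docs are cached and sampled. The same pattern in isolation (a sketch; the generator is a stand-in for a huge training split):

```python
import random
from itertools import islice


def huge_training_stream():
    # Stand-in for a training split too large to hold in memory.
    for i in range(10**9):
        yield {"question": f"q{i}"}


# Cache only the first `cap` docs, then sample few-shot examples from those.
cap = 100000
cached_docs = list(islice(huge_training_stream(), 0, cap))
print(random.sample(cached_docs, 3))
```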
@@ -95,4 +95,4 @@ class OpenBookQA(HFTask):
         whether a higher value of the submetric is better
         """
         # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
+        raise NotImplementedError('Evaluation not implemented')
 import json
 import random
-from lm_eval.base import Dataset, rf, mean
+from lm_eval.base import Task, rf, mean
 from ..utils import sh
 import os
 
-class PiQA(Dataset):
+class PiQA(Task):
     def download(self):
         if not os.path.exists('data/piqa'):
             #TODO: use best_download
@@ -74,4 +74,4 @@ class PiQA(Dataset):
     def higher_is_better(self):
         return {
             'acc': True
-        }
\ No newline at end of file
+        }
 import json
 import random
 import os
-from lm_eval.base import Dataset
+from lm_eval.base import Task
 from ..utils import sh
 
-class QuAC(Dataset):
+class QuAC(Task):
     def __init__(self):
         super().__init__()
@@ -103,4 +103,4 @@ class QuAC(Dataset):
         whether a higher value of the submetric is better
         """
         # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
+        raise NotImplementedError('Evaluation not implemented')
@@ -23,7 +23,8 @@ class RACE(HFTask):
         return True
 
     def _collate_data(self, set):
-        if set in self.cache: return self.cache[set]
+        if set in self.cache:
+            return self.cache[set]
 
         # One big issue with HF's implementation of this dataset: it makes a
         # separate document for each question; meanwhile, in the GPT3 paper it
         # is shown that one document is made per passage.
......
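The comment above explains why `_collate_data` exists: HF's RACE yields one record per question, while the GPT-3 setup uses one document per passage. A minimal sketch of that regrouping (field names follow the HF RACE schema; the helper itself is illustrative, not the repo's implementation):

```python
from collections import defaultdict


def collate_by_passage(hf_examples):
    # Group HF's one-record-per-question examples into one document per
    # passage, keyed on the shared article text.
    by_article = defaultdict(list)
    for ex in hf_examples:
        by_article[ex["article"]].append(
            {"question": ex["question"], "options": ex["options"], "answer": ex["answer"]}
        )
    return [
        {"article": article, "problems": problems}
        for article, problems in by_article.items()
    ]
```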
 import json
 import random
 import os
-from lm_eval.base import Dataset, rf, mean
+from lm_eval.base import Task, rf, mean
 from tqdm import auto as tqdm_lib
 from . common import simple_accuracy_metric
 import numpy as np
 from ..utils import sh
 
-class SATAnalogies(Dataset):
+class SATAnalogies(Task):
     NEEDS_MANUAL_DL = True
 
     def __init__(self):
......
@@ -83,4 +83,4 @@ class SQuAD(HFTask):
         whether a higher value of the submetric is better
         """
         # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
+        raise NotImplementedError('Evaluation not implemented')
 import json
 import random
-from lm_eval.base import Dataset
+from lm_eval.base import Task
 from ..utils import sh
 import csv
 
-class StoryCloze(Dataset):
+class StoryCloze(Task):
     NEEDS_MANUAL_DL = True
 
     def download(self):
@@ -89,4 +89,4 @@ class StoryCloze(Dataset):
         whether a higher value of the submetric is better
         """
         # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
+        raise NotImplementedError('Evaluation not implemented')