gaoqiong / lm-evaluation-harness · Commit 269d3683 (unverified)

Merge branch 'master' into webqs

Authored Feb 02, 2021 by Leo Gao; committed by GitHub on Feb 02, 2021.
Parents: 34eb121f, a1a4a32e
Showing 20 changed files with 113 additions and 50 deletions (+113 / -50).
.github/workflows/python-app.yml   +49  -0
README.md                          +3   -0
lm_eval/base.py                    +19  -13
lm_eval/models/dummy.py            +1   -1
lm_eval/models/gpt2.py             +1   -1
lm_eval/tasks/__init__.py          +1   -1
lm_eval/tasks/arc.py               +1   -1
lm_eval/tasks/arithmetic.py        +2   -2
lm_eval/tasks/common.py            +4   -5
lm_eval/tasks/coqa.py              +2   -2
lm_eval/tasks/drop.py              +3   -3
lm_eval/tasks/lambada.py           +3   -3
lm_eval/tasks/naturalqs.py         +9   -4
lm_eval/tasks/openbookqa.py        +1   -1
lm_eval/tasks/piqa.py              +3   -3
lm_eval/tasks/quac.py              +3   -3
lm_eval/tasks/race.py              +2   -1
lm_eval/tasks/sat.py               +2   -2
lm_eval/tasks/squad.py             +1   -1
lm_eval/tasks/storycloze.py        +3   -3
.github/workflows/python-app.yml  (new file, mode 100644, view file @ 269d3683)

# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Python application

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v2
    - name: Cache
      uses: actions/cache@v2.1.3
      with:
        # A list of files, directories, and wildcard patterns to cache and restore
        path: |
          data
          ~/.cache
        # An explicit key for restoring and saving the cache
        key: evaldata-cache
    - name: Set up Python 3.9
      uses: actions/setup-python@v2
      with:
        python-version: 3.9
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install flake8 pytest pytest-cov
        pip install -e .
        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
    - name: Lint with flake8
      run: |
        # stop the build if there are Python syntax errors or undefined names
        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
        pytest --cov=lm_eval/ tests/
    - name: Upload to codecov
      run: |
        bash <(curl -s https://codecov.io/bash)
\ No newline at end of file
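The lint-and-test sequence above can be approximated locally before pushing. The following is a minimal sketch, not part of the commit, assuming flake8, pytest, and pytest-cov are already installed and the repository is the current directory:

    # Sketch: mirror the "Lint with flake8" and "Test with pytest" CI steps locally.
    import subprocess
    import sys

    def run(cmd):
        # Echo and run one command, returning its exit code, as a CI step would.
        print("+", " ".join(cmd))
        return subprocess.run(cmd).returncode

    rc = run([sys.executable, "-m", "flake8", ".", "--count",
              "--select=E9,F63,F7,F82", "--show-source", "--statistics"])
    if rc == 0:
        rc = run([sys.executable, "-m", "pytest", "--cov=lm_eval/", "tests/"])
    sys.exit(rc)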
README.md  (view file @ 269d3683)

 # Evaluation Harness for Large Language Models
 
+[codecov badge](https://codecov.io/gh/EleutherAI/lm-evaluation-harness)
 
 ## Overview
 The goal of this project is to build a set of tools for evaluating LMs on typical NLU tasks, based on evaluation of GPT-3 as described in https://arxiv.org/pdf/2005.14165.pdf. Following the initial description, this repo should support 3 functions:
...
lm_eval/base.py  (view file @ 269d3683)

@@ -58,10 +58,10 @@ class LM(abc.ABC):
         return cls()
 
-class Dataset(abc.ABC):
+class Task(abc.ABC):
     def __init__(self):
         self.download()
-        self._traindocs = None
+        self._training_docs = None
 
     def download(self):
         """Downloads the task dataset if necessary"""
@@ -71,7 +71,7 @@ class Dataset(abc.ABC):
     def has_training_docs(self):
         """Whether the task has a training set"""
         pass
 
     @abc.abstractmethod
     def has_validation_docs(self):
         """Whether the task has a validation set"""
@@ -84,23 +84,29 @@ class Dataset(abc.ABC):
     def training_docs(self):
         """
         :return: Iterable[obj]
             A iterable of any object, that doc_to_text can handle
         """
         return []
 
     def validation_docs(self):
         """
         :return: Iterable[obj]
             A iterable of any object, that doc_to_text can handle
         """
         return []
 
     def test_docs(self):
         """
         :return: Iterable[obj]
             A iterable of any object, that doc_to_text can handle
         """
         return []
 
-    def fewshot_examples(self, k):
-        if self._traindocs is None:
-            self._traindocs = list(self.training_docs())
-        return random.sample(self._traindocs, k)
+    def fewshot_examples(self, k):
+        if self._training_docs is None:
+            self._training_docs = list(self.training_docs())
+        return random.sample(self._training_docs, k)
 
     @abc.abstractmethod
     def doc_to_text(self, doc):
@@ -123,7 +129,7 @@ class Dataset(abc.ABC):
         part of the document for `doc`.
         """
         pass
 
     @abc.abstractmethod
     def process_results(self, doc, results):
         """Take a single document and the LM results and evaluates, returning a
@@ -161,7 +167,7 @@ class Dataset(abc.ABC):
     def fewshot_context(self, doc, num_fewshot, provide_description):
         raw_description = self.fewshot_description()
         description = (raw_description + "\n===\n\n") if provide_description and raw_description else ""
 
         if num_fewshot == 0:
             labeled_examples = ""
         else:
...
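The behavioural core of the renamed fewshot_examples method is lazy caching of training_docs() plus random sampling. The following is a minimal self-contained sketch of that pattern, using a stand-in class rather than the real lm_eval.base.Task (whose other abstract methods are not shown in this diff):

    import random

    class ToyTask:
        """Stand-in illustrating the _training_docs caching used by Task.fewshot_examples."""

        def __init__(self):
            self._training_docs = None  # populated lazily on first use

        def training_docs(self):
            # In a real task this would yield documents from the training split.
            return ({"question": f"q{i}", "answer": f"a{i}"} for i in range(1000))

        def fewshot_examples(self, k):
            # Materialize the training docs once, then sample k of them per call.
            if self._training_docs is None:
                self._training_docs = list(self.training_docs())
            return random.sample(self._training_docs, k)

    print(ToyTask().fewshot_examples(3))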
lm_eval/models/dummy.py  (view file @ 269d3683)

@@ -20,4 +20,4 @@ class DummyLM(LM):
     def greedy_until(self, requests):
         # TODO: implement
-        pass
\ No newline at end of file
+        pass
lm_eval/models/gpt2.py  (view file @ 269d3683)

@@ -43,4 +43,4 @@ class GPT2LM(LM):
     def greedy_until(self, requests):
         # TODO: implement
-        pass
\ No newline at end of file
+        pass
lm_eval/tasks/__init__.py  (view file @ 269d3683)

@@ -46,7 +46,7 @@ TASK_REGISTRY = {
     "lambada": lambada.LAMBADA,
     "piqa": piqa.PiQA,
-    "triviaqa": triviaqa.TriviaQA,
+    # "triviaqa": triviaqa.TriviaQA,
     # "arc_easy": arc.ARCEasy, # not implemented yet
     # "arc_challenge": arc.ARCChallenge, # not implemented yet
     # "quac": quac.QuAC, # not implemented yet
...
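TASK_REGISTRY is a plain name-to-class mapping, so commenting out an entry (as done for "triviaqa" here) simply makes that name unavailable for lookup. Below is a hedged sketch of the lookup pattern such a registry supports; get_task and the stand-in values are illustrative, not necessarily the helper this repo exposes:

    # Hypothetical lookup helper over a name -> class registry like TASK_REGISTRY.
    TASK_REGISTRY = {
        "lambada": object,  # stand-ins; the real registry maps names to Task subclasses
        "piqa": object,
    }

    def get_task(name):
        try:
            return TASK_REGISTRY[name]()  # instantiate the registered task class
        except KeyError:
            raise KeyError(f"Unknown task {name!r}; available: {sorted(TASK_REGISTRY)}")

    task = get_task("piqa")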
lm_eval/tasks/arc.py  (view file @ 269d3683)

@@ -70,4 +70,4 @@ class ARCEasy(HFTask):
 class ARCChallenge(ARCEasy):
     DATASET_PATH = "ai2_arc"
-    DATASET_NAME = "ARC-Challenge"
\ No newline at end of file
+    DATASET_NAME = "ARC-Challenge"
lm_eval/tasks/arithmetic.py  (view file @ 269d3683)

@@ -2,12 +2,12 @@ import abc
 import json
 import os
 from collections import namedtuple
-from lm_eval.base import Dataset, mean, rf
+from lm_eval.base import Task, mean, rf
 from best_download import download_file
 
 ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])
 
-class Arithmetic(Dataset):
+class Arithmetic(Task):
     directory = 'data/arithmetic/'
 
     def __init__(self):
...
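ArithmeticDoc is an ordinary namedtuple, so each parsed example is just a (context, completion) pair. A small sketch with a hypothetical record (the real task builds these from its downloaded data files):

    from collections import namedtuple

    ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])

    # Hypothetical example record for illustration only.
    doc = ArithmeticDoc(context="Question: What is 12 plus 7?\nAnswer:", completion=" 19")
    print(doc.context, doc.completion)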
lm_eval/tasks/common.py  (view file @ 269d3683)

 import datasets
 import numpy as np
 import random
-from ..base import Dataset
+from ..base import Task
 
-class HFTask(Dataset):
+class HFTask(Task):
     DATASET_PATH = None
     DATASET_NAME = None
 
     def __init__(self):
         self.data = None
         super().__init__()
         self._training_docs = None
 
     def download(self):
         self.data = datasets.load_dataset(path=self.DATASET_PATH, name=self.DATASET_NAME)
...
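HFTask.download is a thin wrapper around datasets.load_dataset, which returns a split-keyed, dict-like object. A sketch of what that call yields, assuming the datasets package is installed and using the "ai2_arc" / "ARC-Challenge" pair from arc.py as the example dataset:

    import datasets

    # Equivalent of HFTask.download() for a concrete DATASET_PATH / DATASET_NAME pair.
    data = datasets.load_dataset(path="ai2_arc", name="ARC-Challenge")

    # The result is keyed by split; tasks then expose these splits via
    # training_docs / validation_docs / test_docs.
    print(list(data.keys()))      # e.g. ['train', 'validation', 'test']
    print(data["validation"][0])  # a single example as a dict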
lm_eval/tasks/coqa.py  (view file @ 269d3683)

@@ -2,11 +2,11 @@
 import json
 import random
-from lm_eval.base import Dataset
+from lm_eval.base import Task
 from ..utils import sh
 
-class CoQA(Dataset):
+class CoQA(Task):
     def __init__(self):
         self.download()
 
     def download(self):
...
lm_eval/tasks/drop.py  (view file @ 269d3683)

@@ -5,9 +5,9 @@ from sklearn.metrics import f1_score, matthews_corrcoef
 from tqdm import auto as tqdm_lib
 from .common import HFTask, simple_accuracy_metric, yesno
 from pathlib import Path
-from ..base import Dataset
+from ..base import Task
 
-class DROP(Dataset):
+class DROP(Task):
     DATAFOLDER = Path(__file__).parent / "../../data/drop"
 
     def __init__(self):
@@ -104,4 +104,4 @@ class DROP(Dataset):
         whether a higher value of the submetric is better
         """
         # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
+        raise NotImplementedError('Evaluation not implemented')
lm_eval/tasks/lambada.py  (view file @ 269d3683)

-from lm_eval.base import Dataset, rf, mean
+from lm_eval.base import Task, rf, mean
 from lm_eval.utils import sh
 import json
 import math
 from best_download import download_file
 
-class LAMBADA(Dataset):
+class LAMBADA(Task):
     def download(self):
         sh("mkdir -p data/lambada")
         download_file(
@@ -67,4 +67,4 @@ class LAMBADA(Dataset):
         return {
             'perplexity': False,
             'accuracy': True
-        }
\ No newline at end of file
+        }
lm_eval/tasks/naturalqs.py  (view file @ 269d3683)

 from .common import HFTask
 from itertools import islice
 import random
 
 class NaturalQs(HFTask):
     # TODO: naturalqs has a *really* large train set that huggingface just
     # automatically downloads even if you dont use it. we should try and only
     # download the val set and not even bother with the train set.
     DATASET_PATH = "natural_questions"
     DATASET_NAME = None
@@ -25,10 +30,10 @@ class NaturalQs(HFTask):
     def fewshot_examples(self, k):
         # Data is too large to fit in memory. We just sample from the first bit.
-        if self._traindocs is None:
-            self._traindocs = list(islice(self.training_docs(), 0, 100000))
+        if self._training_docs is None:
+            self._training_docs = list(islice(self.training_docs(), 0, 100000))
 
-        return random.sample(self._traindocs, k)
+        return random.sample(self._training_docs, k)
 
     def doc_to_text(self, doc):
         return 'Q: ' + doc['question']['text'] + '\n\n' + 'A: '
@@ -87,4 +92,4 @@ class NaturalQs(HFTask):
         whether a higher value of the submetric is better
         """
         # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
+        raise NotImplementedError('Evaluation not implemented')
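The Natural Questions few-shot sampler deliberately materializes only the first 100,000 training docs; itertools.islice is what bounds the memory use. A small self-contained sketch of that pattern with a stand-in generator:

    import random
    from itertools import islice

    def training_docs():
        # Stand-in for a training split too large to hold fully in memory.
        return ({"id": i} for i in range(10_000_000))

    # Take only a bounded prefix of the stream, then sample few-shot examples from it.
    _training_docs = list(islice(training_docs(), 0, 100000))
    examples = random.sample(_training_docs, 5)
    print(examples)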
lm_eval/tasks/openbookqa.py  (view file @ 269d3683)

@@ -95,4 +95,4 @@ class OpenBookQA(HFTask):
         whether a higher value of the submetric is better
         """
         # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
+        raise NotImplementedError('Evaluation not implemented')
lm_eval/tasks/piqa.py  (view file @ 269d3683)

 import json
 import random
-from lm_eval.base import Dataset, rf, mean
+from lm_eval.base import Task, rf, mean
 from ..utils import sh
 import os
 
-class PiQA(Dataset):
+class PiQA(Task):
     def download(self):
         if not os.path.exists('data/piqa'):
             #TODO: use best_download
@@ -74,4 +74,4 @@ class PiQA(Dataset):
     def higher_is_better(self):
         return {
             'acc': True
-        }
\ No newline at end of file
+        }
lm_eval/tasks/quac.py  (view file @ 269d3683)

 import json
 import random
 import os
-from lm_eval.base import Dataset
+from lm_eval.base import Task
 from ..utils import sh
 
-class QuAC(Dataset):
+class QuAC(Task):
     def __init__(self):
         super().__init__()
@@ -103,4 +103,4 @@ class QuAC(Dataset):
         whether a higher value of the submetric is better
         """
         # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
+        raise NotImplementedError('Evaluation not implemented')
lm_eval/tasks/race.py  (view file @ 269d3683)

@@ -23,7 +23,8 @@ class RACE(HFTask):
         return True
 
     def _collate_data(self, set):
-        if set in self.cache: return self.cache[set]
+        if set in self.cache:
+            return self.cache[set]
         # One big issue with HF's implementation of this dataset: it makes a
         # separate document for each question; meanwhile, in the GPT3 paper it
         # is shown that one document is made per passage.
...
lm_eval/tasks/sat.py  (view file @ 269d3683)

 import json
 import random
 import os
-from lm_eval.base import Dataset, rf, mean
+from lm_eval.base import Task, rf, mean
 from tqdm import auto as tqdm_lib
 from .common import simple_accuracy_metric
 import numpy as np
 from ..utils import sh
 
-class SATAnalogies(Dataset):
+class SATAnalogies(Task):
     NEEDS_MANUAL_DL = True
 
     def __init__(self):
...
lm_eval/tasks/squad.py  (view file @ 269d3683)

@@ -83,4 +83,4 @@ class SQuAD(HFTask):
         whether a higher value of the submetric is better
         """
         # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
+        raise NotImplementedError('Evaluation not implemented')
lm_eval/tasks/storycloze.py  (view file @ 269d3683)

 import json
 import random
-from lm_eval.base import Dataset
+from lm_eval.base import Task
 from ..utils import sh
 import csv
 
-class StoryCloze(Dataset):
+class StoryCloze(Task):
     NEEDS_MANUAL_DL = True
 
     def download(self):
@@ -89,4 +89,4 @@ class StoryCloze(Dataset):
         whether a higher value of the submetric is better
         """
         # TODO: implement evaluation.
-        raise NotImplementedError('Evaluation not implemented')
\ No newline at end of file
+        raise NotImplementedError('Evaluation not implemented')