gaoqiong / lm-evaluation-harness · Commits

Commit 38a240ce (Unverified)
Authored Apr 27, 2022 by Stella Biderman; committed Apr 27, 2022 by GitHub

    Merge branch 'master' into PytestUpdate

Parents: c9011859, 33f5572a
Changes: 23

Showing 3 changed files with 168 additions and 2 deletions:

    setup.py                 +7   -2
    templates/new_task.py   +128   -0
    tests/test_gpt2.py       +33   -0
setup.py  (view file @ 38a240ce)

...
@@ -18,8 +18,13 @@ setuptools.setup(
         "License :: OSI Approved :: MIT License",
         "Operating System :: OS Independent",
     ],
-    python_requires='>=3.6',
+    python_requires=">=3.6",
     install_requires=[
+        "promptsource @ git+https://github.com/bigscience-workshop/promptsource@eval-hackathon",
+        "wrapt",
+        "nltk",
+        "jinja2",
+        "black",
         "datasets==2.0.0",
         "click>=7.1",
         "scikit-learn>=0.24.1",
...
@@ -40,7 +45,7 @@ setuptools.setup(
         "openai==0.6.4",
         "jieba==0.42.1",
         "nagisa==0.2.7",
-        "bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt"
+        "bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt",
     ],
     dependency_links=[
         "https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt",
...
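The hunks above pin several dependencies to exact versions (e.g. datasets==2.0.0, jieba==0.42.1, nagisa==0.2.7) and pull promptsource and bleurt from direct URLs. As a minimal sketch that is not part of this commit, the exact pins can be checked after installation using only the standard library; the package names are taken from the diff, and Python 3.8+ is assumed for importlib.metadata:

# Sketch only: verify a few of the exact pins declared in setup.py after
# installation (e.g. after `pip install -e .`). Package names come from the
# diff above; assumes Python 3.8+ for importlib.metadata.
from importlib.metadata import version, PackageNotFoundError

pinned = {
    "datasets": "2.0.0",
    "jieba": "0.42.1",
    "nagisa": "0.2.7",
}

for name, expected in pinned.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        print(f"{name}: not installed")
        continue
    status = "OK" if installed == expected else f"expected {expected}"
    print(f"{name}: {installed} ({status})")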
templates/new_task.py  (new file, 0 → 100644; view file @ 38a240ce)

# TODO: Remove all TODO comments once the implementation is complete.
"""
TODO: Add the Paper Title on this line.
TODO: Add the paper's PDF URL (preferably from arXiv) on this line.

TODO: Write a Short Description of the task.

Homepage: TODO: Add the URL to the task's Homepage here.
"""
from lm_eval.base import PromptSourceTask


# TODO: Add the BibTeX citation for the task.
_CITATION = """
"""


# TODO: Replace `NewTask` with the name of your Task.
class NewTask(PromptSourceTask):
    VERSION = 0
    # TODO: Add the `DATASET_PATH` string. This will be the name of the `Task`
    # dataset as denoted in HuggingFace `datasets`.
    DATASET_PATH = ""
    # TODO: Add the `DATASET_NAME` string. This is the name of a subset within
    # `DATASET_PATH`. If there aren't specific subsets you need, leave this as `None`.
    DATASET_NAME = None

    def has_training_docs(self):
        # TODO: Fill in the return with `True` if the Task has training data; else `False`.
        return False

    def has_validation_docs(self):
        # TODO: Fill in the return with `True` if the Task has validation data; else `False`.
        return False

    def has_test_docs(self):
        # TODO: Fill in the return with `True` if the Task has test data; else `False`.
        return False

    def training_docs(self):
        if self.has_training_docs():
            # We cache training documents in `self._training_docs` for faster
            # few-shot processing. If the data is too large to fit in memory,
            # return the training data as a generator instead of a list.
            if self._training_docs is None:
                # TODO: Return the training document generator from `self.dataset`.
                # If you need to process the data, `map` over the documents with
                # the custom processing function, `self._process_doc`. E.g.
                # `map(self._process_doc, self.dataset["train"])`
                # In most cases you can leave this as is unless the dataset split is
                # named differently than the default `"train"`.
                self._training_docs = list(self.dataset["train"])
            return self._training_docs

    def validation_docs(self):
        if self.has_validation_docs():
            # TODO: Return the validation document generator from `self.dataset`.
            # If you need to process the data, `map` over the documents with the
            # custom processing function, `self._process_doc`. E.g.
            # `map(self._process_doc, self.dataset["validation"])`
            # In most cases you can leave this as is unless the dataset split is
            # named differently than the default `"validation"`.
            return self.dataset["validation"]

    def test_docs(self):
        if self.has_test_docs():
            # TODO: Return the test document generator from `self.dataset`.
            # If you need to process the data, `map` over the documents with the
            # custom processing function, `self._process_doc`. E.g.
            # `map(self._process_doc, self.dataset["test"])`
            # In most cases you can leave this as is unless the dataset split is
            # named differently than the default `"test"`.
            return self.dataset["test"]

    def stopping_criteria(self):
        # TODO: Denote the string where the generation should be split.
        # For example, for `coqa`, this is '\nQ:' and for `drop` '.'.
        # NOTE: You may delete this function if the task does not require generation.
        return None

    def construct_requests(self, doc, ctx):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or
            test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        # TODO: Construct your language model requests with the request factory, `rf`,
        # and return them as an iterable.
        return []

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluate them, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        # TODO: For each (sub)metric in the task evaluation, add a key-value pair
        # with the metric name as key and the corresponding metric result as value
        # for the current `doc`.
        return {}

    def aggregation(self):
        """
        :returns: {str: [metric_score] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metric scores.
        """
        # TODO: For each (sub)metric in the task evaluation, add a key-value pair
        # with the metric name as key and an aggregation function as value which
        # determines how to combine results from each document in the dataset.
        # Check `lm_eval.metrics` to find built-in aggregation functions.
        return {}

    def higher_is_better(self):
        # TODO: For each (sub)metric in the task evaluation, add a key-value pair
        # with the metric name as key and a `bool` value determining whether or
        # not higher values of that metric are deemed better.
        return {}
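To illustrate how the template is typically filled in, here is a hypothetical, partial completion for an imagined dataset. The dataset path and split choices below are placeholders rather than part of this commit; only the overridden methods mirror the template above.

# Hypothetical example of a partially completed template. `my_org/my_dataset`
# is a placeholder HuggingFace `datasets` path; the method bodies follow the
# template's own suggestions.
from lm_eval.base import PromptSourceTask


class MyDatasetTask(PromptSourceTask):
    VERSION = 0
    DATASET_PATH = "my_org/my_dataset"  # placeholder dataset name
    DATASET_NAME = None

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        if self.has_training_docs():
            # Cache the training split for faster few-shot sampling.
            if self._training_docs is None:
                self._training_docs = list(self.dataset["train"])
            return self._training_docs

    def validation_docs(self):
        if self.has_validation_docs():
            return self.dataset["validation"]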
tests/test_gpt2.py  (new file, 0 → 100644; view file @ 38a240ce)

import random

import lm_eval.models as models
import pytest
import torch
from transformers import StoppingCriteria


@pytest.mark.parametrize(
    "eos_token,test_input,expected",
    [
        ("not", "i like", "i like to say that I'm not"),
        ("say that", "i like", "i like to say that"),
        ("great", "big science is", "big science is a great"),
        (
            "<|endoftext|>",
            "big science has",
            "big science has been done in the past, but it's not the same as the science of the",
        ),
    ],
)
def test_stopping_criteria(eos_token, test_input, expected):
    random.seed(42)
    torch.random.manual_seed(42)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    gpt2 = models.get_model("gpt2")(device=device)

    context = torch.tensor([gpt2.tokenizer.encode(test_input)])
    stopping_criteria_ids = gpt2.tokenizer.encode(eos_token)

    generations = gpt2._model_generate(
        context,
        max_length=20,
        stopping_criteria_ids=stopping_criteria_ids,
    )
    generations = gpt2.tokenizer.decode(generations[0])
    assert generations == expected
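The test drives generation through gpt2._model_generate with stopping_criteria_ids and checks that the output ends at the requested stop string. As a rough sketch of the underlying idea only, not the harness's actual implementation, a multi-token stop sequence can be expressed through the transformers StoppingCriteria interface that the test imports; the class and variable names below are illustrative:

# Sketch only: one way to stop generation once a given token-id sequence
# appears at the end of the output. The harness's real implementation may
# differ; names here are illustrative, not taken from this commit.
import torch
from transformers import StoppingCriteria, StoppingCriteriaList


class StopOnTokenSequence(StoppingCriteria):
    def __init__(self, stop_ids):
        self.stop_ids = torch.tensor(stop_ids)

    def __call__(self, input_ids, scores, **kwargs):
        # Compare the tail of the generated ids against the stop sequence.
        if input_ids.shape[1] < len(self.stop_ids):
            return False
        tail = input_ids[0, -len(self.stop_ids):].cpu()
        return bool(torch.equal(tail, self.stop_ids))


# Hypothetical usage with a HuggingFace model and tokenizer:
# criteria = StoppingCriteriaList([StopOnTokenSequence(tokenizer.encode("say that"))])
# model.generate(input_ids, max_length=20, stopping_criteria=criteria)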