Unverified commit 38a240ce authored by Stella Biderman, committed by GitHub

Merge branch 'master' into PytestUpdate

parents c9011859 33f5572a
@@ -18,8 +18,13 @@ setuptools.setup(
         "License :: OSI Approved :: MIT License",
         "Operating System :: OS Independent",
     ],
-    python_requires='>=3.6',
+    python_requires=">=3.6",
     install_requires=[
+        "promptsource @ git+https://github.com/bigscience-workshop/promptsource@eval-hackathon",
+        "wrapt",
+        "nltk",
+        "jinja2",
+        "black",
         "datasets==2.0.0",
         "click>=7.1",
         "scikit-learn>=0.24.1",
@@ -40,7 +45,7 @@ setuptools.setup(
         "openai==0.6.4",
         "jieba==0.42.1",
         "nagisa==0.2.7",
-        "bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt"
+        "bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt",
     ],
     dependency_links=[
         "https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt",
...
# TODO: Remove all TODO comments once the implementation is complete.
"""
TODO: Add the Paper Title on this line.
TODO: Add the paper's PDF URL (preferably from arXiv) on this line.
TODO: Write a Short Description of the task.
Homepage: TODO: Add the URL to the task's Homepage here.
"""
from lm_eval.base import PromptSourceTask
# TODO: Add the BibTeX citation for the task.
_CITATION = """
"""
# TODO: Replace `NewTask` with the name of your Task.
class NewTask(PromptSourceTask):
    VERSION = 0
    # TODO: Add the `DATASET_PATH` string. This will be the name of the `Task`
    # dataset as denoted in HuggingFace `datasets`.
    DATASET_PATH = ""
    # TODO: Add the `DATASET_NAME` string. This is the name of a subset within
    # `DATASET_PATH`. If there aren't specific subsets you need, leave this as `None`.
    DATASET_NAME = None
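    # Illustrative example (hypothetical choice of task): for the BoolQ subset of
    # SuperGLUE on the HuggingFace Hub these would be `DATASET_PATH = "super_glue"`
    # and `DATASET_NAME = "boolq"`.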

    def has_training_docs(self):
        # TODO: Fill in the return with `True` if the Task has training data; else `False`.
        return False

    def has_validation_docs(self):
        # TODO: Fill in the return with `True` if the Task has validation data; else `False`.
        return False

    def has_test_docs(self):
        # TODO: Fill in the return with `True` if the Task has test data; else `False`.
        return False

    def training_docs(self):
        if self.has_training_docs():
            # We cache training documents in `self._training_docs` for faster
            # few-shot processing. If the data is too large to fit in memory,
            # return the training data as a generator instead of a list.
            if self._training_docs is None:
                # TODO: Return the training document generator from `self.dataset`.
                # If you need to process the data, `map` over the documents with
                # the custom processing function, `self._process_doc`. E.g.
                # `map(self._process_doc, self.dataset["train"])`
                # In most cases you can leave this as is unless the dataset split is
                # named differently than the default `"train"`.
                self._training_docs = list(self.dataset["train"])
            return self._training_docs

    def validation_docs(self):
        if self.has_validation_docs():
            # TODO: Return the validation document generator from `self.dataset`.
            # If you need to process the data, `map` over the documents with the
            # custom processing function, `self._process_doc`. E.g.
            # `map(self._process_doc, self.dataset["validation"])`
            # In most cases you can leave this as is unless the dataset split is
            # named differently than the default `"validation"`.
            return self.dataset["validation"]

    def test_docs(self):
        if self.has_test_docs():
            # TODO: Return the test document generator from `self.dataset`.
            # If you need to process the data, `map` over the documents with the
            # custom processing function, `self._process_doc`. E.g.
            # `map(self._process_doc, self.dataset["test"])`
            # In most cases you can leave this as is unless the dataset split is
            # named differently than the default `"test"`.
            return self.dataset["test"]

    def stopping_criteria(self):
        # TODO: Denote the string where the generation should be split.
        # For example, for `coqa` this is '\nQ:' and for `drop` it is '.'.
        # NOTE: You may delete this function if the task does not require generation.
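        # Illustrative sketch for a question-answering task that should stop at
        # the next question turn:
        #     return "\nQ:"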
        return None

    def construct_requests(self, doc, ctx):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or
            test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        # TODO: Construct your language model requests with the request factory, `rf`,
        # and return them as an iterable.
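        # Illustrative sketch (hypothetical; assumes `rf` is imported from
        # `lm_eval.base`, and exact call signatures may differ by harness version).
        # A generative task might return a single greedy-generation request:
        #     return rf.greedy_until(ctx, self.stopping_criteria())
        # while a yes/no classification task might return one loglikelihood
        # request per answer string:
        #     ll_yes, _ = rf.loglikelihood(ctx, " yes")
        #     ll_no, _ = rf.loglikelihood(ctx, " no")
        #     return ll_yes, ll_no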
        return []

    def process_results(self, doc, results):
        """Take a single document and the LM results and evaluate, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        """
        # TODO: For each (sub)metric in the task evaluation, add a key-value pair
        # with the metric name as key and the corresponding metric result as value
        # for the current `doc`.
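        # Illustrative sketch (hypothetical field names), continuing the yes/no
        # example above where `results` is `(ll_yes, ll_no)`:
        #     ll_yes, ll_no = results
        #     pred = ll_yes > ll_no
        #     gold = doc["label"] == 1
        #     return {"acc": pred == gold}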
        return {}

    def aggregation(self):
        """
        :returns: {str: [metric_score] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metric scores
        """
        # TODO: For each (sub)metric in the task evaluation, add a key-value pair
        # with the metric name as key and an aggregation function as value which
        # determines how to combine results from each document in the dataset.
        # Check `lm_eval.metrics` to find built-in aggregation functions.
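        # Illustrative sketch: with `from lm_eval.metrics import mean` at the top
        # of the file, per-document accuracies could be averaged with
        #     return {"acc": mean}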
        return {}

    def higher_is_better(self):
        # TODO: For each (sub)metric in the task evaluation, add a key-value pair
        # with the metric name as key and a `bool` value determining whether or
        # not higher values of that metric are deemed better.
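        # Illustrative sketch for an accuracy-style metric:
        #     return {"acc": True}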
        return {}
\ No newline at end of file
import random

import lm_eval.models as models
import pytest
import torch
from transformers import StoppingCriteria


@pytest.mark.parametrize(
    "eos_token,test_input,expected",
    [
        ("not", "i like", "i like to say that I'm not"),
        ("say that", "i like", "i like to say that"),
        ("great", "big science is", "big science is a great"),
        (
            "<|endoftext|>",
            "big science has",
            "big science has been done in the past, but it's not the same as the science of the",
        ),
    ],
)
def test_stopping_criteria(eos_token, test_input, expected):
    random.seed(42)
    torch.random.manual_seed(42)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    gpt2 = models.get_model("gpt2")(device=device)
    context = torch.tensor([gpt2.tokenizer.encode(test_input)])
    stopping_criteria_ids = gpt2.tokenizer.encode(eos_token)

    generations = gpt2._model_generate(
        context,
        max_length=20,
        stopping_criteria_ids=stopping_criteria_ids,
    )
    generations = gpt2.tokenizer.decode(generations[0])
    assert generations == expected
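# To run only this test (assuming a standard pytest setup for the repository):
#     python -m pytest -k test_stopping_criteria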