Unverified Commit 4f0410a4 authored by Jonathan Tow, committed by GitHub

Add `TextSynth` API (#299)

* Add `TextSynth` API
parent ddd97430
from . import gpt2
from . import gpt3
from . import textsynth
from . import dummy
MODEL_REGISTRY = {
"hf": gpt2.HFLM,
"gpt2": gpt2.GPT2LM,
"gpt3": gpt3.GPT3LM,
"textsynth": textsynth.TextSynthLM,
"dummy": dummy.DummyLM,
}
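# With the new "textsynth" entry, the backend is resolvable by name, e.g. via
# `models.get_model("textsynth")` as used in the tests below (`get_model` is
# assumed to look model names up in MODEL_REGISTRY).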
......
""" TextSynth API
Implementation provided by Fabrice Bellard:
https://github.com/EleutherAI/lm-evaluation-harness/issues/295
In order to use the API, you must have a valid TextSynth account and
enough credits.
Example usage:
python main.py --model textsynth --model_args engine=gptj_6B --no_cache --tasks piqa
Homepage: https://textsynth.com/index.html
"""
import logging
import os
import requests as _requests
import time
from tqdm import tqdm
from lm_eval.base import BaseLM
logger = logging.getLogger(__name__)
def textsynth_completion(**kwargs):
"""Query TextSynth API for completion.
Retry with back-off until they respond.
"""
backoff_time = 3
while True:
try:
return _requests.post(**kwargs)
except _requests.exceptions.RequestException:
import traceback
traceback.print_exc()
time.sleep(backoff_time)
backoff_time *= 1.5
class TextSynthLM(BaseLM):
def __init__(self, engine, truncate=False):
"""
:param engine: str
TextSynth API engine (e.g. `gptj_6B`)
:param truncate: bool
            Truncate the input if it is too long (if False and the input is too long, an error is raised)
"""
super().__init__()
self.engine = engine
self.truncate = truncate
self.api_url = "https://api.textsynth.com"
# Read from environment variable TEXTSYNTH_API_SECRET_KEY
self.api_key = os.environ["TEXTSYNTH_API_SECRET_KEY"]
@property
def eot_token_id(self):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
raise NotImplementedError()
@property
def max_length(self):
# NOTE: Turn on truncation to avoid errors on long inputs.
return 2048
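    # Illustrative: construct with `TextSynthLM(engine="gptj_6B", truncate=True)`
    # to truncate long inputs instead of raising an error.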
@property
def max_gen_toks(self):
return 256
@property
def batch_size(self):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
raise NotImplementedError()
@property
def device(self):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
raise NotImplementedError()
def tok_encode(self, string: str):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
raise NotImplementedError()
def tok_decode(self, tokens):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
raise NotImplementedError()
def loglikelihood(self, requests):
res = []
for context, continuation in tqdm(requests):
response = textsynth_completion(
url=self.api_url + "/v1/engines/" + self.engine + "/logprob",
headers={"Authorization": "Bearer " + self.api_key},
json={"context": context, "continuation": continuation},
)
resp = response.json()
if "logprob" in resp:
logprob = resp["logprob"]
is_greedy = resp["is_greedy"]
res.append((logprob, is_greedy))
else:
logger.error(
f"The following response does not contain `logprobs`. Got:\n{resp}"
)
assert False
return res
def loglikelihood_rolling(self, requests):
# TODO: The TextSynth API does not support tokenized inputs so we cannot
# manually partition long contexts into smaller rolling windows as
# done for other models derived from `BaseLM`. Override this method
# with a windowing scheme that works for direct string inputs.
raise NotImplementedError(
"`loglikelihood_rolling` is currently not supported due to lack of "
"input tokenization support from TextSynth."
)
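    # Hypothetical sketch (not part of this commit) of a string-based windowing
    # scheme: chunk the raw text and score each chunk against the text that
    # precedes it, using the same `/logprob` endpoint as `loglikelihood`. The
    # character budgets below are illustrative assumptions, not measured limits.
    def _loglikelihood_rolling_sketch(self, text, chunk_chars=2000, ctx_chars=6000):
        total_logprob = 0.0
        for start in range(0, len(text), chunk_chars):
            continuation = text[start : start + chunk_chars]
            # Use the preceding characters (up to a fixed budget) as context;
            # the first chunk is scored with an empty context.
            context = text[max(0, start - ctx_chars) : start]
            logprob, _ = self.loglikelihood([(context, continuation)])[0]
            total_logprob += logprob
        return total_logprob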
def greedy_until(self, requests):
if not requests:
return []
res = []
for request in tqdm(requests):
inp = request[0]
until = request[1]
response = textsynth_completion(
url=self.api_url + "/v1/engines/" + self.engine + "/completions",
headers={"Authorization": "Bearer " + self.api_key},
json={
"prompt": inp,
"max_tokens": self.max_gen_toks,
"top_k": 1,
"stop": until,
},
)
resp = response.json()
if "text" in resp:
s = resp["text"]
res.append(s)
else:
logger.error(
f"The following response does not contain generated `text`. "
"Got:\n{resp}"
)
assert False
return res
def _model_call(self, inps):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def _model_generate(self, context, max_length, eos_token_id):
# Isn't used because we override greedy_until
raise NotImplementedError()
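# Illustrative usage sketch (not part of this commit): mirrors the CLI example
# in the module docstring and assumes a valid TEXTSYNTH_API_SECRET_KEY with
# sufficient credits; the helper name below is hypothetical.
def _example_textsynth_usage():
    lm = TextSynthLM(engine="gptj_6B", truncate=False)
    # Score a single (context, continuation) pair.
    ((logprob, is_greedy),) = lm.loglikelihood(
        [("The quick brown fox jumps over the lazy", " dog")]
    )
    # Greedily complete until one of the stop sequences is generated.
    (completion,) = lm.greedy_until(
        [("The quick brown fox jumps over the lazy", [".", "\n"])]
    )
    return logprob, is_greedy, completion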
import lm_eval.models as models
import pytest
import os
import json
import openai
import mock
import pickle
import hashlib
def mock_completion(**kwargs):
    # Mock completion function.
    # Loads a cached, pickled response if one exists; otherwise it actually
    # queries the OpenAI API and caches the result for later runs.
os.makedirs("tests/testdata", exist_ok=True)
hash = hashlib.sha256(
json.dumps(kwargs, sort_keys=True).encode("utf-8")
).hexdigest()
fname = f"tests/testdata/gpt3_test_{hash}.pkl"
if os.path.exists(fname):
with open(fname, "rb") as fh:
return pickle.load(fh)
ret = openai.Completion.create(**kwargs)
ret.api_key = ""
with open(fname, "wb") as fh:
pickle.dump(ret, fh)
return ret
@mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
def test_gpt3():
if "OPENAI_API_SECRET_KEY" not in os.environ:
os.environ["OPENAI_API_SECRET_KEY"] = ""
gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada")
(
(ll_dog, ig_dog),
(ll_cat, ig_cat),
(_, ll_max_0),
(_, ll_max_1),
(_, ll_max_2),
*vals,
) = gpt3.loglikelihood(
[
("The quick brown fox jumps over the lazy", " dog"),
("The quick brown fox jumps over the lazy", " cat"),
("The quick brown fox jumps over the lazy", ", lazy dog"),
("The quick brown fox jumps over the lazy", ", lazy fox"),
(
"The quick brown fox jumps over the lazy",
", lazy fox and they both fall to the ground",
),
(
"""A mult""",
"""ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
),
(
"""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""",
""" (with threshold activation); see § Terminology""",
),
(
"""Multilayer perceptrons are sometimes coll""",
"""oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]""",
),
(
"""An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""",
""" activation function.""",
),
(
"""MLP utilizes a supervised""",
""" learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]""",
),
(
"""Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""",
""" in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """,
),
(
"""Specifically, we train GPT-3, an autoregressive language model with 175""",
""" billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.""",
),
(
"""A mult""",
"""ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
),
("""Hello""", """ World"""),
]
)
assert ll_dog > ll_cat
assert not ig_cat
assert ig_dog
assert not ll_max_0
assert not ll_max_1
assert not ll_max_2
# test empty context
gpt3.loglikelihood([("", "test")])
(gen,) = gpt3.greedy_until(
[("The quick brown fox jumps over the lazy", [".", "\n"])]
)
assert gen == " dog"
print([x[0] for x in vals])
targets = [
-34.848301606999996,
-47.148329679999996,
-45.44380149599999,
-5.285246016,
-133.97821690686004,
-321.2616693239001,
-658.0299524401041,
-34.848301606999996,
-7.525115,
]
for (pred, _), tgt in zip(vals, targets):
assert pred == pytest.approx(tgt, rel=1e-3)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
def test_gpt3_perplexity():
if "OPENAI_API_SECRET_KEY" not in os.environ:
os.environ["OPENAI_API_SECRET_KEY"] = ""
gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada")
test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
tgt = -84.38819608
assert perplexity == pytest.approx(tgt, rel=1e-3)
# Hack: modify gpt3 to have shorter context length to induce rolling windows
with mock.patch.object(
models.gpt3.GPT3LM, "max_length", new_callable=mock.PropertyMock
) as mock_max_length:
mock_max_length.return_value = 5
gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada")
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
tgt = -101.81967209999999
assert perplexity == pytest.approx(tgt, rel=1e-3)
import hashlib
import json
import openai
import os
import pickle
import pytest
import unittest.mock as mock
import lm_eval.models as models
LOGLIKELIHOOD_TEST_CASES = [
("The quick brown fox jumps over the lazy", " dog"),
("The quick brown fox jumps over the lazy", " cat"),
("The quick brown fox jumps over the lazy", ", lazy dog"),
("The quick brown fox jumps over the lazy", ", lazy fox"),
(
"The quick brown fox jumps over the lazy",
", lazy fox and they both fall to the ground",
),
(
"""A mult""",
"""ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
),
(
"""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""",
""" (with threshold activation); see § Terminology""",
),
(
"""Multilayer perceptrons are sometimes coll""",
"""oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]""",
),
(
"""An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""",
""" activation function.""",
),
(
"""MLP utilizes a supervised""",
""" learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]""",
),
(
"""Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""",
""" in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """,
),
(
"""Specifically, we train GPT-3, an autoregressive language model with 175""",
""" billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.""",
),
(
"""A mult""",
"""ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
),
("""Hello""", """ World"""),
]
# Test HuggingFace Models (GPT-2)
def test_gpt2():
gpt2 = models.get_model("gpt2").create_from_arg_string("device=cpu")
(
......
@@ -12,51 +66,7 @@ def test_gpt2():
(_, ll_max_1),
(_, ll_max_2),
*vals,
) = gpt2.loglikelihood(
[
("The quick brown fox jumps over the lazy", " dog"),
("The quick brown fox jumps over the lazy", " cat"),
("The quick brown fox jumps over the lazy", ", lazy dog"),
("The quick brown fox jumps over the lazy", ", lazy fox"),
(
"The quick brown fox jumps over the lazy",
", lazy fox and they both fall to the ground",
),
(
"""A mult""",
"""ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
),
(
"""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""",
""" (with threshold activation); see § Terminology""",
),
(
"""Multilayer perceptrons are sometimes coll""",
"""oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]""",
),
(
"""An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""",
""" activation function.""",
),
(
"""MLP utilizes a supervised""",
""" learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]""",
),
(
"""Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""",
""" in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """,
),
(
"""Specifically, we train GPT-3, an autoregressive language model with 175""",
""" billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.""",
),
(
"""A mult""",
"""ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
),
("""Hello""", """ World"""),
]
)
) = gpt2.loglikelihood(LOGLIKELIHOOD_TEST_CASES)
assert ll_dog > ll_cat
assert not ig_cat
......
@@ -145,3 +155,169 @@ def test_gpt2_perplexity():
]
)
assert perplexity == pytest.approx(tgt, rel=1e-3)
# Test OpenAI Models (GPT-3)
def openai_mock_completion(**kwargs):
    # Mock completion function.
    # Loads a cached, pickled response if one exists; otherwise it actually
    # queries the OpenAI API and caches the result for later runs.
os.makedirs("tests/testdata", exist_ok=True)
hash = hashlib.sha256(
json.dumps(kwargs, sort_keys=True).encode("utf-8")
).hexdigest()
fname = f"tests/testdata/gpt3_test_{hash}.pkl"
if os.path.exists(fname):
with open(fname, "rb") as fh:
return pickle.load(fh)
ret = openai.Completion.create(**kwargs)
ret.api_key = ""
with open(fname, "wb") as fh:
pickle.dump(ret, fh)
return ret
@mock.patch("lm_eval.models.gpt3.oa_completion", new=openai_mock_completion)
def test_gpt3():
if "OPENAI_API_SECRET_KEY" not in os.environ:
os.environ["OPENAI_API_SECRET_KEY"] = ""
gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada")
(
(ll_dog, ig_dog),
(ll_cat, ig_cat),
(_, ll_max_0),
(_, ll_max_1),
(_, ll_max_2),
*vals,
) = gpt3.loglikelihood(LOGLIKELIHOOD_TEST_CASES)
assert ll_dog > ll_cat
assert not ig_cat
assert ig_dog
assert not ll_max_0
assert not ll_max_1
assert not ll_max_2
# test empty context
gpt3.loglikelihood([("", "test")])
(gen,) = gpt3.greedy_until(
[("The quick brown fox jumps over the lazy", [".", "\n"])]
)
assert gen == " dog"
print([x[0] for x in vals])
targets = [
-34.848301606999996,
-47.148329679999996,
-45.44380149599999,
-5.285246016,
-133.97821690686004,
-321.2616693239001,
-658.0299524401041,
-34.848301606999996,
-7.525115,
]
for (pred, _), tgt in zip(vals, targets):
assert pred == pytest.approx(tgt, rel=1e-3)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=openai_mock_completion)
def test_gpt3_perplexity():
if "OPENAI_API_SECRET_KEY" not in os.environ:
os.environ["OPENAI_API_SECRET_KEY"] = ""
gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada")
test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
tgt = -84.38819608
assert perplexity == pytest.approx(tgt, rel=1e-3)
# Hack: modify gpt3 to have shorter context length to induce rolling windows
with mock.patch.object(
models.gpt3.GPT3LM, "max_length", new_callable=mock.PropertyMock
) as mock_max_length:
mock_max_length.return_value = 5
gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada")
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
tgt = -101.81967209999999
assert perplexity == pytest.approx(tgt, rel=1e-3)
# Test TextSynth Models (GPT-J)
def textsynth_mock_completion(**kwargs):
    # Mock completion function.
    # Loads a cached, pickled response if one exists; otherwise it actually
    # posts the request to the TextSynth API and caches the result for later runs.
import requests
os.makedirs("tests/testdata", exist_ok=True)
hash = hashlib.sha256(
json.dumps(kwargs, sort_keys=True).encode("utf-8")
).hexdigest()
fname = f"tests/testdata/textsynth_test_{hash}.pkl"
if os.path.exists(fname):
with open(fname, "rb") as fh:
return pickle.load(fh)
ret = requests.post(**kwargs)
with open(fname, "wb") as fh:
pickle.dump(ret, fh)
return ret
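# Illustrative sketch of the cache-key scheme used by the mocks above
# (hypothetical helper, not part of this commit): request kwargs are serialized
# to canonical JSON and hashed, so identical requests reuse the same pickle.
def _example_textsynth_cache_path(**kwargs):
    digest = hashlib.sha256(
        json.dumps(kwargs, sort_keys=True).encode("utf-8")
    ).hexdigest()
    return f"tests/testdata/textsynth_test_{digest}.pkl"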
@mock.patch(
"lm_eval.models.textsynth.textsynth_completion", new=textsynth_mock_completion
)
def test_textsynth():
if "TEXTSYNTH_API_SECRET_KEY" not in os.environ:
os.environ["TEXTSYNTH_API_SECRET_KEY"] = ""
textsynth = models.get_model("textsynth").create_from_arg_string("engine=gptj_6B")
(
(ll_dog, ig_dog),
(ll_cat, ig_cat),
(_, ll_max_0),
(_, ll_max_1),
(_, ll_max_2),
*vals,
) = textsynth.loglikelihood(LOGLIKELIHOOD_TEST_CASES)
assert ll_dog > ll_cat
assert not ig_cat
assert ig_dog
assert not ll_max_0
assert not ll_max_1
assert not ll_max_2
# test empty context
textsynth.loglikelihood([("", "test")])
(gen,) = textsynth.greedy_until(
[("The quick brown fox jumps over the lazy", [".", "\n"])]
)
assert gen == " dog"
print([x[0] for x in vals])
targets = [
-17.90513712817,
-41.83518912287,
-33.82445643841,
-2.377361565302,
-99.53018069754,
-243.5642283598,
-528.6862613790,
-17.90513712817,
-5.041000672142,
]
for (pred, _), tgt in zip(vals, targets):
assert pred == pytest.approx(tgt, rel=1e-3)