import hashlib
import json
import os
import pickle

import mock
import openai
import pytest

import lm_eval.models as models

# Directory holding the pickled completion responses that the tests replay.
_CACHE_DIR = "tests/testdata"

# Shared context for the first five loglikelihood requests in test_gpt3.
_FOX_CONTEXT = "The quick brown fox jumps over the lazy"


def mock_completion(**kwargs):
    """Replacement for ``lm_eval.models.gpt3.oa_completion`` backed by a disk cache.

    The keyword arguments are serialized to JSON with sorted keys and hashed
    with SHA-256; the digest names a pickle file under ``tests/testdata``.
    If that file exists its unpickled content is returned.  Otherwise the
    request is forwarded to ``openai.Completion.create``, the response's
    ``api_key`` attribute is blanked, and the response is pickled to that
    file before being returned.

    Args:
        **kwargs: Arguments forwarded verbatim to ``openai.Completion.create``
            on a cache miss; must be JSON-serializable.

    Returns:
        The cached or freshly fetched completion response.
    """
    os.makedirs(_CACHE_DIR, exist_ok=True)

    # sort_keys keeps the digest independent of keyword-argument order.
    request_blob = json.dumps(kwargs, sort_keys=True).encode("utf-8")
    digest = hashlib.sha256(request_blob).hexdigest()
    fname = f"{_CACHE_DIR}/gpt3_test_{digest}.pkl"

    if os.path.exists(fname):
        # SECURITY NOTE: pickle.load executes arbitrary code from the file.
        # Only acceptable here if the cache files are trusted fixtures.
        with open(fname, "rb") as fh:
            return pickle.load(fh)

    ret = openai.Completion.create(**kwargs)
    # Blank the key on the response object so it is not written to disk.
    ret.api_key = ""
    with open(fname, "wb") as fh:
        pickle.dump(ret, fh)
    return ret


def _make_gpt3():
    """Build the ``engine=ada`` GPT-3 model, defaulting the API key env var to ""."""
    # Only fills in a placeholder when the variable is absent; an existing
    # value is left untouched.
    os.environ.setdefault("OPENAI_API_SECRET_KEY", "")
    return models.get_model("gpt3").create_from_arg_string("engine=ada")


@mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
def test_gpt3():
    """Check loglikelihood and greedy_until results against recorded values."""
    gpt3 = _make_gpt3()

    # NOTE(review): the "ยง" sequence in the third long request below looks
    # like a mis-decoded "§".  It is left exactly as found because the prompt
    # text feeds the cache-key hash in mock_completion -- confirm against the
    # cached fixtures before changing it.
    requests = [
        (_FOX_CONTEXT, " dog"),
        (_FOX_CONTEXT, " cat"),
        (_FOX_CONTEXT, ", lazy dog"),
        (_FOX_CONTEXT, ", lazy fox"),
        (
            _FOX_CONTEXT,
            ", lazy fox and they both fall to the ground",
        ),
        (
            """A mult""",
            """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
        ),
        (
            """The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""",
            """ (with threshold activation); see ยง Terminology""",
        ),
        (
            """Multilayer perceptrons are sometimes coll""",
            """oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]""",
        ),
        (
            # The embedded newline (preceded by a space) is part of the
            # prompt; it is written as an escape so it stays visible.
            "An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. \n"
            "Except for the input nodes, each node is a neuron that uses a nonlinear",
            """ activation function.""",
        ),
        (
            """MLP utilizes a supervised""",
            """ learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]""",
        ),
        (
            """Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""",
            """ in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """,
        ),
        (
            """Specifically, we train GPT-3, an autoregressive language model with 175""",
            # Same as above: the space + newline before "Finally" is part of
            # the continuation text.
            " billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. \n"
            "Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.",
        ),
        (
            """A mult""",
            """ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
        ),
        ("""Hello""", """ World"""),
    ]

    # Each result is a pair; the first element is compared as a
    # log-likelihood below, the second is presumably an "is greedy" flag
    # (per the original ``ig_`` naming) -- verify against GPT3LM.
    (
        (ll_dog, ig_dog),
        (ll_cat, ig_cat),
        (_, ig_lazy_dog),
        (_, ig_lazy_fox),
        (_, ig_lazy_fox_long),
        *vals,
    ) = gpt3.loglikelihood(requests)

    assert ll_dog > ll_cat
    assert not ig_cat
    assert ig_dog
    assert not ig_lazy_dog
    assert not ig_lazy_fox
    assert not ig_lazy_fox_long

    # test empty context
    gpt3.loglikelihood([("", "test")])

    (gen,) = gpt3.greedy_until(
        [("The quick brown fox jumps over the lazy", [".", "\n"])]
    )
    assert gen == " dog"

    # Printed so the recorded targets below can be refreshed from test output.
    print([x[0] for x in vals])

    targets = [
        -34.848301606999996,
        -47.148329679999996,
        -45.44380149599999,
        -5.285246016,
        -133.97821690686004,
        -321.2616693239001,
        -658.0299524401041,
        -34.848301606999996,
        -7.525115,
    ]

    # zip() stops at the shorter input, so without this check a truncated
    # result list would pass while skipping comparisons.
    assert len(vals) == len(targets)
    for (pred, _), tgt in zip(vals, targets):
        assert pred == pytest.approx(tgt, rel=1e-3)


@mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
def test_gpt3_perplexity():
    """Check loglikelihood_rolling with the default and a patched max_length of 5."""
    test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."

    gpt3 = _make_gpt3()
    perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
    tgt = -84.38819608
    assert perplexity == pytest.approx(tgt, rel=1e-3)

    # Hack: modify gpt3 to have shorter context length to induce rolling windows
    with mock.patch.object(
        models.gpt3.GPT3LM, "max_length", new_callable=mock.PropertyMock
    ) as mock_max_length:
        mock_max_length.return_value = 5
        gpt3 = _make_gpt3()
        perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
    tgt = -101.81967209999999
    assert perplexity == pytest.approx(tgt, rel=1e-3)