Unverified Commit 4f0410a4 authored by Jonathan Tow, committed by GitHub

Add `TextSynth` API (#299)

* Add `TextSynth` API
parent ddd97430
from . import gpt2
from . import gpt3
from . import textsynth
from . import dummy
MODEL_REGISTRY = {
"hf": gpt2.HFLM,
"gpt2": gpt2.GPT2LM,
"gpt3": gpt3.GPT3LM,
"textsynth": textsynth.TextSynthLM,
"dummy": dummy.DummyLM,
}
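# With the new "textsynth" entry, the backend is resolvable by name, e.g. via
# `models.get_model("textsynth")` as used in the tests below (`get_model` is
# assumed to look model names up in MODEL_REGISTRY).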
......
""" TextSynth API
Implementation provided by Fabrice Bellard:
https://github.com/EleutherAI/lm-evaluation-harness/issues/295
In order to use the API, you must have a valid TextSynth account and
enough credits.
Example usage:
python main.py --model textsynth --model_args engine=gptj_6B --no_cache --tasks piqa
Homepage: https://textsynth.com/index.html
"""
import logging
import os
import requests as _requests
import time
from tqdm import tqdm
from lm_eval.base import BaseLM
logger = logging.getLogger(__name__)
def textsynth_completion(**kwargs):
"""Query TextSynth API for completion.
Retry with back-off until they respond.
"""
backoff_time = 3
while True:
try:
return _requests.post(**kwargs)
except _requests.exceptions.RequestException:
import traceback
traceback.print_exc()
time.sleep(backoff_time)
backoff_time *= 1.5
class TextSynthLM(BaseLM):
def __init__(self, engine, truncate=False):
"""
:param engine: str
TextSynth API engine (e.g. `gptj_6B`)
:param truncate: bool
            Truncate the input if it is too long (if False and the input is too long, an error is raised)
"""
super().__init__()
self.engine = engine
self.truncate = truncate
self.api_url = "https://api.textsynth.com"
# Read from environment variable TEXTSYNTH_API_SECRET_KEY
self.api_key = os.environ["TEXTSYNTH_API_SECRET_KEY"]
@property
def eot_token_id(self):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
raise NotImplementedError()
@property
def max_length(self):
# NOTE: Turn on truncation to avoid errors on long inputs.
return 2048
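    # Illustrative: construct with `TextSynthLM(engine="gptj_6B", truncate=True)`
    # to truncate long inputs instead of raising an error.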
@property
def max_gen_toks(self):
return 256
@property
def batch_size(self):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
raise NotImplementedError()
@property
def device(self):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
raise NotImplementedError()
def tok_encode(self, string: str):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
raise NotImplementedError()
def tok_decode(self, tokens):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
raise NotImplementedError()
def loglikelihood(self, requests):
res = []
for context, continuation in tqdm(requests):
response = textsynth_completion(
url=self.api_url + "/v1/engines/" + self.engine + "/logprob",
headers={"Authorization": "Bearer " + self.api_key},
json={"context": context, "continuation": continuation},
)
resp = response.json()
if "logprob" in resp:
logprob = resp["logprob"]
is_greedy = resp["is_greedy"]
res.append((logprob, is_greedy))
else:
logger.error(
f"The following response does not contain `logprobs`. Got:\n{resp}"
)
assert False
return res
def loglikelihood_rolling(self, requests):
# TODO: The TextSynth API does not support tokenized inputs so we cannot
# manually partition long contexts into smaller rolling windows as
# done for other models derived from `BaseLM`. Override this method
# with a windowing scheme that works for direct string inputs.
raise NotImplementedError(
"`loglikelihood_rolling` is currently not supported due to lack of "
"input tokenization support from TextSynth."
)
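    # Hypothetical sketch (not part of this commit) of a string-based windowing
    # scheme: chunk the raw text and score each chunk against the text that
    # precedes it, using the same `/logprob` endpoint as `loglikelihood`. The
    # character budgets below are illustrative assumptions, not measured limits.
    def _loglikelihood_rolling_sketch(self, text, chunk_chars=2000, ctx_chars=6000):
        total_logprob = 0.0
        for start in range(0, len(text), chunk_chars):
            continuation = text[start : start + chunk_chars]
            # Use the preceding characters (up to a fixed budget) as context;
            # the first chunk is scored with an empty context.
            context = text[max(0, start - ctx_chars) : start]
            logprob, _ = self.loglikelihood([(context, continuation)])[0]
            total_logprob += logprob
        return total_logprob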
def greedy_until(self, requests):
if not requests:
return []
res = []
for request in tqdm(requests):
inp = request[0]
until = request[1]
response = textsynth_completion(
url=self.api_url + "/v1/engines/" + self.engine + "/completions",
headers={"Authorization": "Bearer " + self.api_key},
json={
"prompt": inp,
"max_tokens": self.max_gen_toks,
"top_k": 1,
"stop": until,
},
)
resp = response.json()
if "text" in resp:
s = resp["text"]
res.append(s)
else:
logger.error(
f"The following response does not contain generated `text`. "
"Got:\n{resp}"
)
assert False
return res
def _model_call(self, inps):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def _model_generate(self, context, max_length, eos_token_id):
# Isn't used because we override greedy_until
raise NotImplementedError()
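# Illustrative usage sketch (not part of this commit): mirrors the CLI example
# in the module docstring and assumes a valid TEXTSYNTH_API_SECRET_KEY with
# sufficient credits; the helper name below is hypothetical.
def _example_textsynth_usage():
    lm = TextSynthLM(engine="gptj_6B", truncate=False)
    # Score a single (context, continuation) pair.
    ((logprob, is_greedy),) = lm.loglikelihood(
        [("The quick brown fox jumps over the lazy", " dog")]
    )
    # Greedily complete until one of the stop sequences is generated.
    (completion,) = lm.greedy_until(
        [("The quick brown fox jumps over the lazy", [".", "\n"])]
    )
    return logprob, is_greedy, completion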
import lm_eval.models as models
import pytest
import os
import json
import openai
import mock
import pickle
import hashlib
def mock_completion(**kwargs):
    # Mock completion function.
    # Loads a cached, pickled response if one exists; otherwise it actually
    # queries the OpenAI API and caches the result for later runs.
os.makedirs("tests/testdata", exist_ok=True)
hash = hashlib.sha256(
json.dumps(kwargs, sort_keys=True).encode("utf-8")
).hexdigest()
fname = f"tests/testdata/gpt3_test_{hash}.pkl"
if os.path.exists(fname):
with open(fname, "rb") as fh:
return pickle.load(fh)
ret = openai.Completion.create(**kwargs)
ret.api_key = ""
with open(fname, "wb") as fh:
pickle.dump(ret, fh)
return ret
@mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
def test_gpt3():
if "OPENAI_API_SECRET_KEY" not in os.environ:
os.environ["OPENAI_API_SECRET_KEY"] = ""
gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada")
(
(ll_dog, ig_dog),
(ll_cat, ig_cat),
(_, ll_max_0),
(_, ll_max_1),
(_, ll_max_2),
*vals,
) = gpt3.loglikelihood(
[
("The quick brown fox jumps over the lazy", " dog"),
("The quick brown fox jumps over the lazy", " cat"),
("The quick brown fox jumps over the lazy", ", lazy dog"),
("The quick brown fox jumps over the lazy", ", lazy fox"),
(
"The quick brown fox jumps over the lazy",
", lazy fox and they both fall to the ground",
),
(
"""A mult""",
"""ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
),
(
"""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""",
""" (with threshold activation); see § Terminology""",
),
(
"""Multilayer perceptrons are sometimes coll""",
"""oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]""",
),
(
"""An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""",
""" activation function.""",
),
(
"""MLP utilizes a supervised""",
""" learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]""",
),
(
"""Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""",
""" in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """,
),
(
"""Specifically, we train GPT-3, an autoregressive language model with 175""",
""" billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.""",
),
(
"""A mult""",
"""ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
),
("""Hello""", """ World"""),
]
)
assert ll_dog > ll_cat
assert not ig_cat
assert ig_dog
assert not ll_max_0
assert not ll_max_1
assert not ll_max_2
# test empty context
gpt3.loglikelihood([("", "test")])
(gen,) = gpt3.greedy_until(
[("The quick brown fox jumps over the lazy", [".", "\n"])]
)
assert gen == " dog"
print([x[0] for x in vals])
targets = [
-34.848301606999996,
-47.148329679999996,
-45.44380149599999,
-5.285246016,
-133.97821690686004,
-321.2616693239001,
-658.0299524401041,
-34.848301606999996,
-7.525115,
]
for (pred, _), tgt in zip(vals, targets):
assert pred == pytest.approx(tgt, rel=1e-3)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=mock_completion)
def test_gpt3_perplexity():
if "OPENAI_API_SECRET_KEY" not in os.environ:
os.environ["OPENAI_API_SECRET_KEY"] = ""
gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada")
test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
tgt = -84.38819608
assert perplexity == pytest.approx(tgt, rel=1e-3)
# Hack: modify gpt3 to have shorter context length to induce rolling windows
with mock.patch.object(
models.gpt3.GPT3LM, "max_length", new_callable=mock.PropertyMock
) as mock_max_length:
mock_max_length.return_value = 5
gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada")
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
tgt = -101.81967209999999
assert perplexity == pytest.approx(tgt, rel=1e-3)
import hashlib
import json
import openai
import os
import pickle
import pytest
import unittest.mock as mock
import lm_eval.models as models
LOGLIKELIHOOD_TEST_CASES = [
("The quick brown fox jumps over the lazy", " dog"),
("The quick brown fox jumps over the lazy", " cat"),
("The quick brown fox jumps over the lazy", ", lazy dog"),
("The quick brown fox jumps over the lazy", ", lazy fox"),
(
"The quick brown fox jumps over the lazy",
", lazy fox and they both fall to the ground",
),
(
"""A mult""",
"""ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
),
(
"""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""",
""" (with threshold activation); see § Terminology""",
),
(
"""Multilayer perceptrons are sometimes coll""",
"""oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]""",
),
(
"""An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""",
""" activation function.""",
),
(
"""MLP utilizes a supervised""",
""" learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]""",
),
(
"""Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""",
""" in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """,
),
(
"""Specifically, we train GPT-3, an autoregressive language model with 175""",
""" billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.""",
),
(
"""A mult""",
"""ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
),
("""Hello""", """ World"""),
]
# Test HuggingFace Models (GPT-2)
def test_gpt2():
gpt2 = models.get_model("gpt2").create_from_arg_string("device=cpu")
(
......
@@ -12,51 +66,7 @@ def test_gpt2():
(_, ll_max_1),
(_, ll_max_2),
*vals,
) = gpt2.loglikelihood(
[
("The quick brown fox jumps over the lazy", " dog"),
("The quick brown fox jumps over the lazy", " cat"),
("The quick brown fox jumps over the lazy", ", lazy dog"),
("The quick brown fox jumps over the lazy", ", lazy fox"),
(
"The quick brown fox jumps over the lazy",
", lazy fox and they both fall to the ground",
),
(
"""A mult""",
"""ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
),
(
"""The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons""",
""" (with threshold activation); see § Terminology""",
),
(
"""Multilayer perceptrons are sometimes coll""",
"""oquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]""",
),
(
"""An MLP consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear""",
""" activation function.""",
),
(
"""MLP utilizes a supervised""",
""" learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]""",
),
(
"""Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic""",
""" in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. """,
),
(
"""Specifically, we train GPT-3, an autoregressive language model with 175""",
""" billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. We discuss broader societal impacts of this finding and of GPT-3 in general.""",
),
(
"""A mult""",
"""ilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)""",
),
("""Hello""", """ World"""),
]
)
) = gpt2.loglikelihood(LOGLIKELIHOOD_TEST_CASES)
assert ll_dog > ll_cat
assert not ig_cat
......
@@ -145,3 +155,169 @@ def test_gpt2_perplexity():
]
)
assert perplexity == pytest.approx(tgt, rel=1e-3)
# Test OpenAI Models (GPT-3)
def openai_mock_completion(**kwargs):
    # Mock completion function.
    # Loads a cached, pickled response if one exists; otherwise it actually
    # queries the OpenAI API and caches the result for later runs.
os.makedirs("tests/testdata", exist_ok=True)
hash = hashlib.sha256(
json.dumps(kwargs, sort_keys=True).encode("utf-8")
).hexdigest()
fname = f"tests/testdata/gpt3_test_{hash}.pkl"
if os.path.exists(fname):
with open(fname, "rb") as fh:
return pickle.load(fh)
ret = openai.Completion.create(**kwargs)
ret.api_key = ""
with open(fname, "wb") as fh:
pickle.dump(ret, fh)
return ret
@mock.patch("lm_eval.models.gpt3.oa_completion", new=openai_mock_completion)
def test_gpt3():
if "OPENAI_API_SECRET_KEY" not in os.environ:
os.environ["OPENAI_API_SECRET_KEY"] = ""
gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada")
(
(ll_dog, ig_dog),
(ll_cat, ig_cat),
(_, ll_max_0),
(_, ll_max_1),
(_, ll_max_2),
*vals,
) = gpt3.loglikelihood(LOGLIKELIHOOD_TEST_CASES)
assert ll_dog > ll_cat
assert not ig_cat
assert ig_dog
assert not ll_max_0
assert not ll_max_1
assert not ll_max_2
# test empty context
gpt3.loglikelihood([("", "test")])
(gen,) = gpt3.greedy_until(
[("The quick brown fox jumps over the lazy", [".", "\n"])]
)
assert gen == " dog"
print([x[0] for x in vals])
targets = [
-34.848301606999996,
-47.148329679999996,
-45.44380149599999,
-5.285246016,
-133.97821690686004,
-321.2616693239001,
-658.0299524401041,
-34.848301606999996,
-7.525115,
]
for (pred, _), tgt in zip(vals, targets):
assert pred == pytest.approx(tgt, rel=1e-3)
@mock.patch("lm_eval.models.gpt3.oa_completion", new=openai_mock_completion)
def test_gpt3_perplexity():
if "OPENAI_API_SECRET_KEY" not in os.environ:
os.environ["OPENAI_API_SECRET_KEY"] = ""
gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada")
test_string = "We study empirical scaling laws for language model performance on the cross-entropy loss."
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
tgt = -84.38819608
assert perplexity == pytest.approx(tgt, rel=1e-3)
# Hack: modify gpt3 to have shorter context length to induce rolling windows
with mock.patch.object(
models.gpt3.GPT3LM, "max_length", new_callable=mock.PropertyMock
) as mock_max_length:
mock_max_length.return_value = 5
gpt3 = models.get_model("gpt3").create_from_arg_string("engine=ada")
perplexity = gpt3.loglikelihood_rolling([(test_string,)])[0]
tgt = -101.81967209999999
assert perplexity == pytest.approx(tgt, rel=1e-3)
# Test TextSynth Models (GPT-J)
def textsynth_mock_completion(**kwargs):
    # Mock completion function.
    # Loads a cached, pickled response if one exists; otherwise it actually
    # posts the request to the TextSynth API and caches the result for later runs.
import requests
os.makedirs("tests/testdata", exist_ok=True)
hash = hashlib.sha256(
json.dumps(kwargs, sort_keys=True).encode("utf-8")
).hexdigest()
fname = f"tests/testdata/textsynth_test_{hash}.pkl"
if os.path.exists(fname):
with open(fname, "rb") as fh:
return pickle.load(fh)
ret = requests.post(**kwargs)
with open(fname, "wb") as fh:
pickle.dump(ret, fh)
return ret
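# Illustrative sketch of the cache-key scheme used by the mocks above
# (hypothetical helper, not part of this commit): request kwargs are serialized
# to canonical JSON and hashed, so identical requests reuse the same pickle.
def _example_textsynth_cache_path(**kwargs):
    digest = hashlib.sha256(
        json.dumps(kwargs, sort_keys=True).encode("utf-8")
    ).hexdigest()
    return f"tests/testdata/textsynth_test_{digest}.pkl"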
@mock.patch(
"lm_eval.models.textsynth.textsynth_completion", new=textsynth_mock_completion
)
def test_textsynth():
if "TEXTSYNTH_API_SECRET_KEY" not in os.environ:
os.environ["TEXTSYNTH_API_SECRET_KEY"] = ""
textsynth = models.get_model("textsynth").create_from_arg_string("engine=gptj_6B")
(
(ll_dog, ig_dog),
(ll_cat, ig_cat),
(_, ll_max_0),
(_, ll_max_1),
(_, ll_max_2),
*vals,
) = textsynth.loglikelihood(LOGLIKELIHOOD_TEST_CASES)
assert ll_dog > ll_cat
assert not ig_cat
assert ig_dog
assert not ll_max_0
assert not ll_max_1
assert not ll_max_2
# test empty context
textsynth.loglikelihood([("", "test")])
(gen,) = textsynth.greedy_until(
[("The quick brown fox jumps over the lazy", [".", "\n"])]
)
assert gen == " dog"
print([x[0] for x in vals])
targets = [
-17.90513712817,
-41.83518912287,
-33.82445643841,
-2.377361565302,
-99.53018069754,
-243.5642283598,
-528.6862613790,
-17.90513712817,
-5.041000672142,
]
for (pred, _), tgt in zip(vals, targets):
assert pred == pytest.approx(tgt, rel=1e-3)