Commit 0fa2c7df authored by Matt Hoffner

updates to verify end to end with example

parent b56dee4e
@@ -141,6 +141,15 @@ python main.py \
     --tasks hellaswag
 ```
+GGML quantized models can be loaded by using the `llama-cpp-python` server:
+```bash
+python main.py \
+    --model ggml \
+    --model_args base_url=http://localhost:8000 \
+    --tasks hellaswag
+```
 We support wildcards in task names, for example you can run all of the machine-translated lambada tasks via `--task lambada_openai_mt_*`.
 We currently only support one prompt per task, which we strive to make the "standard" as defined by the benchmark's authors. If you would like to study how varying prompts causes changes in the evaluation score, check out the [BigScience fork](https://github.com/bigscience-workshop/lm-evaluation-harness) of this repo. We are currently working on upstreaming this capability to `main`.
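For reference, `base_url=http://localhost:8000` in the snippet above assumes a `llama-cpp-python` server is already running on its default port. A minimal way to start one (the model path is a placeholder, substitute your own GGML file):

```bash
# Install the server extra, then launch it; it serves an OpenAI-compatible
# API on http://localhost:8000 by default.
pip install 'llama-cpp-python[server]'
python -m llama_cpp.server --model ./models/7B/ggml-model-q4_0.bin
```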
lm_eval/models/ggml.py
@@ -3,16 +3,33 @@ import logging
 import time
 from tqdm import tqdm
 from requests.exceptions import RequestException
+import transformers
 from lm_eval.utils import Reorderer
 from lm_eval.base import BaseLM
 logger = logging.getLogger(__name__)
-def ggml_completion(base_url, retries=3, delay=5, **kwargs):
+class GGMLLM(BaseLM):
+    def __init__(self, base_url, truncate=False):
+        super().__init__()
+        self.base_url = base_url
+        self.truncate = truncate
+        self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2")
+        self.logprobs = 10
+        self.max_length = 1024
+        self.vocab_size = self.tokenizer.vocab_size
+    def ggml_completion(self, base_url, context, continuation=None, stop=None, retries=3, delay=5, **kwargs):
         for _ in range(retries):
             try:
-                response = requests.post(f"{base_url}/v1/completions", json=kwargs)
+                prompt = context
+                if continuation:
+                    prompt += continuation
+                request = {'prompt': prompt, 'logprobs': self.logprobs}
+                if stop is not None:
+                    request['stop'] = stop
+                response = requests.post(f"{base_url}/v1/completions", json=request)
                 response.raise_for_status()
                 return response.json()
             except RequestException as e:
@@ -21,11 +38,6 @@ def ggml_completion(base_url, retries=3, delay=5, **kwargs):
             else:
                 raise Exception(f"Failed to get a valid response after {retries} retries. Last exception: {e}")
-class GGMLLM(BaseLM):
-    def __init__(self, base_url, truncate=False):
-        super().__init__()
-        self.base_url = base_url
-        self.truncate = truncate
     def loglikelihood(self, requests):
         reorderer = Reorderer(requests, len)
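The `ggml_completion` wrapper now builds the JSON payload itself, and `loglikelihood` below consumes token logprobs from the reply. A minimal sketch of that round trip, assuming a llama-cpp-python server on the default port; the response layout shown is an assumption inferred from how the parsing code reads it:

```python
import requests

base_url = "http://localhost:8000"  # assumed local server
payload = {"prompt": "The quick brown fox", "logprobs": 10}
data = requests.post(f"{base_url}/v1/completions", json=payload).json()

# Expected shape, inferred from the parsing code in loglikelihood:
# {"choices": [{"text": "...", "finish_reason": "length",
#               "logprobs": {"tokens": [...], "token_logprobs": [...]}}]}
choice = data["choices"][0]
print(choice["logprobs"]["token_logprobs"][0], choice["finish_reason"])
```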
@@ -33,21 +45,22 @@ class GGMLLM(BaseLM):
         res = []
         for context, continuation in tqdm(requests):
-            response = ggml_completion(self.base_url, context=context, continuation=continuation)
+            response = self.ggml_completion(self.base_url, context=context, continuation=continuation)
             if response and "choices" in response and response["choices"]:
                 choice = response["choices"][0]
                 logprobs = choice.get("logprobs")
-                try:
+                if logprobs and "token_logprobs" in logprobs:
                     logprob = logprobs["token_logprobs"][0]
-                except TypeError:
-                    raise ValueError("Invalid logprobs data. Expected 'logprobs' to contain 'token_logprobs' list.")
                     is_greedy = choice["finish_reason"] == "length"
                     res.append((logprob, is_greedy))
+                else:
+                    logger.warning("Invalid logprobs data. Expected 'logprobs' to contain 'token_logprobs' list.")
             else:
                 logger.error(f"Invalid response for loglikelihood. Response: {response}")
                 assert False
         return reorderer.get_original(res)
     def greedy_until(self, requests):
         if not requests:
             return []
@@ -60,8 +73,7 @@ class GGMLLM(BaseLM):
             inp = request[0]
             request_args = request[1]
             until = request_args["until"]
-            response = ggml_completion(self.base_url, context=inp, stop=until)
-            print(response);
+            response = self.ggml_completion(self.base_url, context=inp, stop=until)
             if response and "choices" in response and response["choices"]:
                 choice = response["choices"][0]
                 if "text" in choice:
@@ -99,6 +111,12 @@ class GGMLLM(BaseLM):
         # Placeholder implementation
         raise NotImplementedError()
+    def tok_encode(self, string: str):
+        return self.tokenizer.encode(string, add_special_tokens=False)
+    def tok_decode(self, tokens):
+        return self.tokenizer.decode(tokens)
     @property
     def batch_size(self):
         # Placeholder implementation
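The new `tok_encode`/`tok_decode` simply delegate to the GPT-2 tokenizer loaded in `__init__`; a quick round trip using the same standard `transformers` calls:

```python
import transformers

# Same tokenizer GGMLLM loads in __init__.
tok = transformers.GPT2TokenizerFast.from_pretrained("gpt2")
ids = tok.encode("GGML quantized models", add_special_tokens=False)
assert tok.decode(ids) == "GGML quantized models"
print(ids)  # GPT-2 BPE token ids
```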
@@ -114,19 +132,10 @@ class GGMLLM(BaseLM):
         # Placeholder implementation
         raise NotImplementedError()
-    @property
     def max_length(self):
-        return 1024
+        return self.max_length
     @property
     def max_gen_toks(self):
         # Placeholder implementation
-        raise NotImplementedError()
-    def tok_encode(self, string: str):
-        # Placeholder implementation
-        raise NotImplementedError()
-    def tok_decode(self, tokens):
-        # Placeholder implementation
-        raise NotImplementedError()
+        raise NotImplementedError()
\ No newline at end of file
@@ -38,7 +38,7 @@ def ggml_completion_mock(base_url, **kwargs):
 class GGMLLMTest(unittest.TestCase):
-    @patch('lm_eval.models.ggml.ggml_completion', side_effect=ggml_completion_mock)
+    @patch('lm_eval.models.ggml.GGMLLM.ggml_completion', side_effect=ggml_completion_mock)
     def test_loglikelihood(self, ggml_completion_mock):
         lm = GGMLLM(base_url)
@@ -50,7 +50,7 @@ class GGMLLMTest(unittest.TestCase):
         expected_res = [(logprob, True) for logprob in [-1.2345, -1.2345]]
         self.assertEqual(res, expected_res)
-    @patch('lm_eval.models.ggml.ggml_completion', side_effect=ggml_completion_mock)
+    @patch('lm_eval.models.ggml.GGMLLM.ggml_completion', side_effect=ggml_completion_mock)
     def test_greedy_until(self, ggml_completion_mock):
         lm = GGMLLM(base_url)
@@ -62,7 +62,7 @@ class GGMLLMTest(unittest.TestCase):
         expected_res = ["generated text until stop1", "generated text until stop1"]
         self.assertEqual(res, expected_res)
-    @patch('lm_eval.models.ggml.ggml_completion', side_effect=ggml_completion_mock)
+    @patch('lm_eval.models.ggml.GGMLLM.ggml_completion', side_effect=ggml_completion_mock)
     def test_loglikelihood_rolling(self, ggml_completion_mock):
         lm = GGMLLM(base_url)
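The tests patch `GGMLLM.ggml_completion` with `ggml_completion_mock`, whose body sits above the visible hunk. Judging from the expectations asserted here, a compatible mock would look roughly like this (a reconstruction, not the file's actual code):

```python
# Reconstructed mock, inferred from the asserted expectations above.
def ggml_completion_mock(base_url, context=None, continuation=None,
                         stop=None, **kwargs):
    if stop is not None:  # greedy_until path
        return {"choices": [{"text": "generated text until stop1"}]}
    return {  # loglikelihood path
        "choices": [{
            "finish_reason": "length",  # -> is_greedy == True
            "logprobs": {"token_logprobs": [-1.2345]},
        }]
    }
```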