llama.py 3.15 KB
Newer Older
Matt Hoffner's avatar
Matt Hoffner committed
1
2
import requests
import json
3
4
5
import logging

from lm_eval.base import BaseLM
Matt Hoffner's avatar
Matt Hoffner committed
6
7
8
9
from tqdm import tqdm
from requests.exceptions import RequestException
import time

10
11
logger = logging.getLogger(__name__)

Matt Hoffner's avatar
Matt Hoffner committed
12
13
14
15
16
17
18
19
20
def llama_completion(base_url, prompt, **kwargs):
    """POST a completion request to a llama.cpp HTTP server.

    Args:
        base_url: Root URL of the server, e.g. ``http://localhost:8000``.
        prompt: Prompt text, sent as the ``prompt`` field of the JSON body.
        **kwargs: Extra fields merged into the JSON payload
            (e.g. ``stop``, ``continuation``).

    Returns:
        The decoded JSON response dict, or ``None`` on any request error
        (best-effort contract preserved from the original implementation).
    """
    # Bug fix: `prompt` was accepted but never included in the request body,
    # so the server received only the extra kwargs.
    payload = {"prompt": prompt, **kwargs}
    try:
        response = requests.post(f"{base_url}/v1/completions", json=payload)
        response.raise_for_status()
        return response.json()
    except RequestException as e:
        # Log through the module logger instead of print() so failures
        # surface via the configured logging setup.
        logger.error("RequestException: %s", e)
        return None

Matt Hoffner's avatar
Matt Hoffner committed
21
class LlamaCppLM(BaseLM):
    """lm-eval harness adapter backed by a llama.cpp HTTP server.

    Scoring and generation are delegated to the server's
    ``/v1/completions`` endpoint via the module-level ``llama_completion``
    helper; the local model/tokenizer hooks required by ``BaseLM`` are
    intentionally left unimplemented because inference happens server-side.
    """

    def __init__(self, base_url, truncate=False):
        """
        Args:
            base_url: Root URL of the llama.cpp server,
                e.g. ``http://localhost:8000``.
            truncate: Whether prompts should be truncated. Stored but not
                currently used by the methods in this class.
        """
        super().__init__()
        self.base_url = base_url
        self.truncate = truncate

    def loglikelihood(self, requests):
        """Score (context, continuation) pairs against the server.

        Args:
            requests: Iterable of ``(context, continuation)`` string pairs.

        Returns:
            List of ``(logprob, is_greedy)`` tuples, one per request.

        Raises:
            RuntimeError: If the server returns a missing or malformed
                response for any request.
        """
        res = []
        for context, continuation in tqdm(requests):
            response = llama_completion(
                self.base_url, context, continuation=continuation
            )
            # Debug-level logging instead of print(): verbosity is now
            # controlled by the logging configuration.
            logger.debug("Loglikelihood response: %s", response)
            if response and response.get("choices"):
                choice = response["choices"][0]
                logprobs = choice.get("logprobs")
                if logprobs and logprobs.get("token_logprobs"):
                    logprob = logprobs["token_logprobs"][0]
                else:
                    # Sentinel used when the server omits token logprobs.
                    logprob = -1.2345
                # NOTE(review): treating finish_reason == "length" as the
                # greedy signal looks dubious — confirm against the server's
                # actual semantics.
                is_greedy = choice["finish_reason"] == "length"
                res.append((logprob, is_greedy))
            else:
                logger.error(f"Invalid response for loglikelihood. Response: {response}")
                # Bug fix: raise a real error instead of `assert False` —
                # asserts are stripped under `python -O`, which would let
                # execution fall through silently.
                raise RuntimeError(
                    f"Invalid response for loglikelihood: {response}"
                )
        return res

    def greedy_until(self, requests):
        """Generate greedily until one of the stop sequences is hit.

        Args:
            requests: Iterable of ``(input_text, {"until": [...]})`` pairs.

        Returns:
            List of stripped generated strings; requests whose responses
            are invalid are logged and skipped.
        """
        if not requests:
            return []

        res = []
        for request in tqdm(requests):
            inp = request[0]
            request_args = request[1]
            until = request_args["until"]
            # Bug fix: `llama_completion` is a module-level function, not a
            # method (`self.llama_completion` raises AttributeError), and the
            # prompt for this request is `inp` — not the accumulated results
            # list that was previously passed as `context`.
            response = llama_completion(self.base_url, inp, stop=until)
            logger.debug("Greedy_until response: %s", response)
            if response and "text" in response:
                generated_text = response["text"].strip()
                res.append(generated_text)
            else:
                logger.error(f"Invalid response for greedy_until. Response: {response}")
                continue
        return res

    def _model_call(self, inps):
        # Not needed: inference happens on the remote server.
        raise NotImplementedError()

    def _model_generate(self, context, max_length, eos_token_id):
        # Not needed: generation happens on the remote server.
        raise NotImplementedError()

    @property
    def batch_size(self):
        # Not applicable for the HTTP backend.
        raise NotImplementedError()

    @property
    def device(self):
        # Not applicable for the HTTP backend.
        raise NotImplementedError()

    @property
    def eot_token_id(self):
        # Tokenizer lives on the server; not exposed locally.
        raise NotImplementedError()

    @property
    def max_length(self):
        # Model configuration lives on the server; not exposed locally.
        raise NotImplementedError()

    @property
    def max_gen_toks(self):
        # Not exposed by this adapter.
        raise NotImplementedError()

    def tok_encode(self, string: str):
        # Tokenization happens server-side; no local tokenizer available.
        raise NotImplementedError()

    def tok_decode(self, tokens):
        # Tokenization happens server-side; no local tokenizer available.
        raise NotImplementedError()