import logging
import os
from functools import cached_property
from operator import itemgetter
from typing import Any, Dict, List, Optional, Tuple, Union

from lm_eval.api.registry import register_model
from lm_eval.models.api_models import TemplateAPI
from lm_eval.models.utils import handle_stop_sequences


eval_logger = logging.getLogger(__name__)


@register_model("local-completions")
class LocalCompletionsAPI(TemplateAPI):
    def __init__(
        self,
        base_url=None,
        tokenizer_backend="auto",
        verify_certificate=True,
        ca_cert_path=None,
        auth_token=None,
        **kwargs,
    ):
        # Auto-detect tokenizer backend
        if tokenizer_backend == "auto":
            if base_url:
                from lm_eval.utils import check_remote_tokenizer_support

                if check_remote_tokenizer_support(
                    base_url,
                    verify_certificate=verify_certificate,
                    ca_cert_path=ca_cert_path,
                    auth_token=auth_token,
                ):
                    eval_logger.info(
                        "Auto-detected remote tokenizer support. Using remote tokenizer backend."
                    )
                    tokenizer_backend = "remote"
                else:
                    eval_logger.info(
                        "Remote tokenizer not supported. Using huggingface tokenizer backend."
                    )
                    tokenizer_backend = "huggingface"
            else:
                eval_logger.warning(
                    "No base_url provided. Using huggingface tokenizer backend."
                )
                tokenizer_backend = "huggingface"

        super().__init__(
            base_url=base_url,
            tokenizer_backend=tokenizer_backend,
            verify_certificate=verify_certificate,
            ca_cert_path=ca_cert_path,
            auth_token=auth_token,
            **kwargs,
        )

    def _create_payload(
        self,
        messages: Union[List[List[int]], List[dict], List[str], str],
        generate=False,
        gen_kwargs: Optional[dict] = None,
        seed: int = 1234,
        eos=None,
        **kwargs,
    ) -> dict:
        if generate:
            gen_kwargs = gen_kwargs or {}
            gen_kwargs.pop("do_sample", False)
            if "max_tokens" in gen_kwargs:
                max_tokens = gen_kwargs.pop("max_tokens")
            else:
                max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
            temperature = gen_kwargs.pop("temperature", 0)
            stop = handle_stop_sequences(gen_kwargs.pop("until", None), eos)
            return {
                "prompt": messages,
                "model": self.model,
                "max_tokens": max_tokens,
                "temperature": temperature,
                "stop": stop,
                "seed": seed,
                **gen_kwargs,
            }
        else:
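            # Scoring request: echo=True returns logprobs for the prompt itself;
            # max_tokens=1 keeps the actual generation to a single throwaway token.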
            return {
                "model": self.model,
                "prompt": messages,
                "temperature": 0,
                "max_tokens": 1,
                "logprobs": 1,
                "seed": seed,
                "echo": True,
            }

    @staticmethod
    def parse_logprobs(
        outputs: Union[Dict, List[Dict]],
        tokens: List[List[int]] = None,
        ctxlens: List[int] = None,
        **kwargs,
    ) -> List[Tuple[float, bool]]:
        res = []
        if not isinstance(outputs, list):
            outputs = [outputs]
        for out in outputs:
            for choice, ctxlen in zip(
                sorted(out["choices"], key=itemgetter("index")), ctxlens
            ):
                assert ctxlen > 0, "Context length must be greater than 0"
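                # With echo=True and max_tokens=1, token_logprobs covers every
                # prompt token plus the one sampled token; [ctxlen:-1] keeps the
                # continuation span and drops that trailing sample.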
                logprobs = sum(choice["logprobs"]["token_logprobs"][ctxlen:-1])
                tokens_logprobs = choice["logprobs"]["token_logprobs"][ctxlen:-1]
                top_logprobs = choice["logprobs"]["top_logprobs"][ctxlen:-1]
                is_greedy = True
                for tok, top in zip(tokens_logprobs, top_logprobs):
                    if tok != max(top.values()):
                        is_greedy = False
                        break
                res.append((logprobs, is_greedy))
        return res
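
    # Sketch of the response shape parse_logprobs consumes (legacy OpenAI
    # completions schema; the values below are illustrative, not real output):
    #
    #   {"choices": [{"index": 0,
    #                 "logprobs": {"token_logprobs": [None, -0.12, -2.31],
    #                              "top_logprobs": [None, {"a": -0.12}, {"b": -0.9}]}}]}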

    @staticmethod
    def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
        res = []
        if not isinstance(outputs, list):
            outputs = [outputs]
        for out in outputs:
            # Order results by choice index so they line up with the input batch.
            tmp = [None] * len(out["choices"])
            for choice in out["choices"]:
                tmp[choice["index"]] = choice["text"]
            res.extend(tmp)
        return res
lintangsutawika's avatar
lintangsutawika committed
135

    @property
    def api_key(self):
        return os.environ.get("OPENAI_API_KEY", "")
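
# Usage sketch (hedged): `local-completions` targets any OpenAI-compatible
# completions endpoint; the model name and URL below are placeholders, not
# defaults of this module:
#
#   lm_eval --model local-completions \
#       --tasks lambada_openai \
#       --model_args model=my-model,base_url=http://localhost:8000/v1/completions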


@register_model("local-chat-completions")
class LocalChatCompletion(LocalCompletionsAPI):
    """
    Minimal chat-completions wrapper.
    - Only accepts messages as list[dict].
    - No tokenization or template logic.
    - Use with --apply_chat_template or ensure upstream formats messages correctly.
    """

    def __init__(
        self,
        base_url=None,
        verify_certificate=True,
        ca_cert_path=None,
        auth_token=None,
        **kwargs,
    ):
        super().__init__(
            base_url=base_url,
            tokenizer_backend=None,
            tokenized_requests=None,
            verify_certificate=verify_certificate,
            ca_cert_path=ca_cert_path,
            auth_token=auth_token,
            **kwargs,
        )
        if self._batch_size > 1:
            eval_logger.warning(
                "Chat completions does not support batching. Defaulting to batch size 1."
            )
            self._batch_size = 1

    def _create_payload(
        self,
        messages: List[Dict],
        generate=False,
        gen_kwargs: Optional[dict] = None,
        seed=1234,
        eos=None,
        **kwargs,
    ) -> dict:
        assert isinstance(messages, list) and all(
            isinstance(m, dict) for m in messages
        ), (
            "LocalChatCompletion expects messages as list[dict]. "
            "If you see this error, ensure --apply_chat_template is set or upstream code formats messages correctly."
        )
        gen_kwargs = gen_kwargs or {}
        gen_kwargs.pop("do_sample", False)
        if "max_tokens" in gen_kwargs:
            max_tokens = gen_kwargs.pop("max_tokens")
        else:
            max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
        temperature = gen_kwargs.pop("temperature", 0)
        stop = handle_stop_sequences(gen_kwargs.pop("until", None), eos)
        if not isinstance(stop, (list, tuple)):
            stop = [stop]
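        # The OpenAI chat schema accepts at most four stop sequences, hence stop[:4].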
        return {
            "messages": messages,
            "model": self.model,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stop": stop[:4],
            "seed": seed,
            **gen_kwargs,
        }

    @staticmethod
    def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
        res = []
        if not isinstance(outputs, list):
            outputs = [outputs]
        for out in outputs:
            # Order results by choice index so they line up with the input batch.
            tmp = [None] * len(out["choices"])
            for choice in out["choices"]:
                tmp[choice["index"]] = choice["message"]["content"]
            res.extend(tmp)
        return res

    def tok_encode(
        self,
        string: Union[str, Any],
        left_truncate_len=None,
        add_special_tokens=None,
        **kwargs,
    ) -> Union[List[str], List[int], Any]:
        return string

    def loglikelihood(self, requests, **kwargs):
        raise NotImplementedError(
            "Loglikelihood is not supported for chat completions. Consider using the completions API instead."
        )
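
# Usage sketch (hedged): chat endpoints need chat-formatted prompts, so pass
# --apply_chat_template; model name and URL below are placeholders:
#
#   lm_eval --model local-chat-completions \
#       --tasks gsm8k \
#       --model_args model=my-model,base_url=http://localhost:8000/v1/chat/completions \
#       --apply_chat_template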


@register_model("openai-completions")
class OpenAICompletionsAPI(LocalCompletionsAPI):
    def __init__(
        self,
        base_url="https://api.openai.com/v1/completions",
        tokenizer_backend="tiktoken",
        **kwargs,
    ):
        super().__init__(
            base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
        )

    @cached_property
    def api_key(self):
        """Override this property to return the API key for the API request."""
        key = os.environ.get("OPENAI_API_KEY", None)
        if key is None:
            raise ValueError(
                "API key not found. Please set the `OPENAI_API_KEY` environment variable."
            )
        return key

    def loglikelihood(self, requests, **kwargs):
        assert self.model in [
            "babbage-002",
            "davinci-002",
        ], (
            "Prompt loglikelihoods are only supported by OpenAI's API for "
            "babbage-002 and davinci-002."
        )
        return super().loglikelihood(requests, **kwargs)

    def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
        return ""


@register_model("openai-chat-completions")
class OpenAIChatCompletion(LocalChatCompletion):
    def __init__(
        self,
        base_url="https://api.openai.com/v1/chat/completions",
        tokenizer_backend=None,
        tokenized_requests=False,
        **kwargs,
    ):
        if "o1" in kwargs.get("model", ""):
            eval_logger.warning(
                "o1 models do not support `stop` and only support temperature=1"
            )

        super().__init__(
            base_url=base_url,
            tokenizer_backend=tokenizer_backend,
            tokenized_requests=tokenized_requests,
            **kwargs,
        )

    @cached_property
    def api_key(self):
        """Override this property to return the API key for the API request."""
        key = os.environ.get("OPENAI_API_KEY", None)
        if key is None:
            raise ValueError(
                "API key not found. Please set the `OPENAI_API_KEY` environment variable."
            )
        return key

    def loglikelihood(self, requests, **kwargs):
        raise NotImplementedError(
            "Loglikelihood (and therefore `multiple_choice`-type tasks) is not supported for chat completions as OpenAI does not provide prompt logprobs. See https://github.com/EleutherAI/lm-evaluation-harness/issues/942#issuecomment-1777836312 or https://github.com/EleutherAI/lm-evaluation-harness/issues/1196 for more background on this limitation."
        )

    def _create_payload(
        self,
        messages: List[Dict],
        generate=False,
        gen_kwargs: Optional[dict] = None,
        seed=1234,
        eos="<|endoftext|>",
        **kwargs,
    ) -> dict:
        assert not isinstance(messages, str), (
            "chat-completions require the --apply_chat_template flag."
        )
        gen_kwargs = gen_kwargs or {}
        gen_kwargs.pop("do_sample", False)
        if "max_tokens" in gen_kwargs:
            max_tokens = gen_kwargs.pop("max_tokens")
        else:
            max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
        temperature = gen_kwargs.pop("temperature", 0)
        stop = handle_stop_sequences(gen_kwargs.pop("until", ["<|endoftext|>"]), eos)
        if not isinstance(stop, (list, tuple)):
            stop = [stop]
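        # Newer OpenAI chat models deprecate `max_tokens` in favor of
        # `max_completion_tokens`, which the o-series reasoning models require.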
        output = {
            "messages": messages,
            "model": self.model,
            "max_completion_tokens": max_tokens,
            "temperature": temperature,
            "stop": stop[:4],
            "seed": seed,
            **gen_kwargs,
        }
        # Reasoning-model quirks: these models reject `stop` and fixed
        # temperatures. Match "gpt-5" rather than a bare "5", which would also
        # catch e.g. gpt-3.5 models.
        if "o1" in self.model or "gpt-5" in self.model:
            output.pop("stop")
            output["temperature"] = 1
        elif "o3" in self.model:
            output.pop("temperature")
        return output
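
# Usage sketch (hedged): the model name is a placeholder; requires OPENAI_API_KEY:
#
#   lm_eval --model openai-chat-completions \
#       --tasks gsm8k \
#       --model_args model=gpt-4o-mini \
#       --apply_chat_template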