openai_completions.py 18 KB
Newer Older
Jason Phang's avatar
gpt3  
Jason Phang committed
1
import os
lintangsutawika's avatar
lintangsutawika committed
2
import time
Baber Abbasi's avatar
Baber Abbasi committed
3
from typing import List, Tuple, Optional
lintangsutawika's avatar
update  
lintangsutawika committed
4
5
6

import copy
from collections import defaultdict
Leo Gao's avatar
Leo Gao committed
7
from tqdm import tqdm
lintangsutawika's avatar
update  
lintangsutawika committed
8

lintangsutawika's avatar
lintangsutawika committed
9
from lm_eval import utils
10
11
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
Leo Gao's avatar
Leo Gao committed
12

lintangsutawika's avatar
update  
lintangsutawika committed
13

Baber Abbasi's avatar
Baber Abbasi committed
14
def get_result(response, ctxlen: int) -> Tuple[float, bool]:
    """Process results from OpenAI API response.

    :param response:
        A single completion choice of an OpenAI API response. Its
        ``logprobs`` attribute is read for ``tokens``, ``token_logprobs`` and
        ``top_logprobs`` (assumed to be index-aligned, one entry per token).
    :param ctxlen: int
        Length of context (so we can slice them away and only keep the predictions)
    :return:
        continuation_logprobs: float
            Sum of the log probabilities of the continuation tokens
        is_greedy: bool
            whether argmax matches given continuation exactly
    """
    logprobs = response.logprobs
    continuation_logprobs = sum(logprobs.token_logprobs[ctxlen:])

    is_greedy = True
    continuation = zip(logprobs.tokens[ctxlen:], logprobs.top_logprobs[ctxlen:])
    for token, top_tokens in continuation:
        # `top_tokens` maps candidate token text -> logprob. The continuation
        # is greedy only if the token that was actually scored is the
        # highest-scoring candidate, so compare token *text* (not the token's
        # logprob) against the argmax key.
        top_token = max(top_tokens, key=top_tokens.get)
        if top_token != token:
            is_greedy = False
            break

    return continuation_logprobs, is_greedy


def oa_completion(**kwargs):
    """Send a completion request to the OpenAI API, retrying until it succeeds.

    All keyword arguments are forwarded unchanged to
    ``openai.completions.create`` and its return value is returned as-is.

    Whenever the call raises ``openai.OpenAIError`` the traceback is printed,
    the function sleeps and then tries again. The sleep starts at 3 seconds
    and is multiplied by 1.5 after every failed attempt; there is no upper
    bound on the number of attempts.

    :raises Exception: if the ``openai`` or ``tiktoken`` package cannot be imported
    """
    try:
        import openai, tiktoken  # noqa: E401
    except ModuleNotFoundError:
        raise Exception(
            "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. "
            "please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
        )

    import traceback

    delay = 3
    while True:
        try:
            return openai.completions.create(**kwargs)
        except openai.OpenAIError:
            # report the failure, then fall through to the back-off below
            traceback.print_exc()
        time.sleep(delay)
        delay *= 1.5


67
@register_model("openai-completions")
class OpenaiCompletionsLM(LM):
    """LM implementation backed by the OpenAI text-completions endpoint.

    Strings are tokenized locally with ``tiktoken`` and sent to the API as
    token-id prompts through ``oa_completion``.
    """

    # maximum number of prompts sent to the API in a single request
    REQ_CHUNK_SIZE = 20
    # value returned by `max_length` when no explicit `max_length` was given
    _DEFAULT_MAX_LENGTH = 2048

    def __init__(
        self,
        model: str = "text-davinci-003",
        truncate: bool = False,
        max_gen_toks: int = 256,
        batch_size: int = 1,
        seed: int = 1234,
        max_length: Optional[int] = None,
    ) -> None:
        """Set up the tokenizer and API key for `model`.

        :param model: str
            OpenAI API model name (e.g. text-davinci-003); also used to look
            up the matching tiktoken encoding
        :param truncate: bool
            Stored as ``self.truncate``; not consulted anywhere else in this class
        :param max_gen_toks: int
            Default number of tokens to generate in `generate_until`
        :param batch_size: int
            Accepted but not used by this class
        :param seed: int
            Seed forwarded with every API request
        :param max_length: Optional[int]
            Maximum sequence length; falls back to `_DEFAULT_MAX_LENGTH` when unset
        :raises Exception: if `openai` or `tiktoken` cannot be imported
        :raises KeyError: if the OPENAI_API_KEY environment variable is not set
        """
        super().__init__()
        self.seed = seed
        try:
            import openai, tiktoken  # noqa: E401
        except ModuleNotFoundError:
            # Adjacent literals instead of a backslash continuation so that
            # source indentation does not leak into the message.
            raise Exception(
                "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. "
                "please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
            )
        self.model = model
        self.tokenizer = tiktoken.encoding_for_model(self.model)
        self.vocab_size = self.tokenizer.n_vocab
        self.truncate = truncate
        self.end_of_text_token_id = self.tokenizer.eot_token
        self._max_gen_toks = max_gen_toks
        self._max_length = max_length

        # Read from environment variable OPENAI_API_KEY
        openai.api_key = os.environ["OPENAI_API_KEY"]

    @property
    def eot_token_id(self):
        return self.end_of_text_token_id

    @property
    def max_length(self) -> int:
        if self._max_length:
            return self._max_length
        else:
            return self._DEFAULT_MAX_LENGTH

    @property
    def max_gen_toks(self) -> int:
        return self._max_gen_toks

    @property
    def batch_size(self):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    @property
    def device(self):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    def tok_encode(self, string: str) -> List[int]:
        return self.tokenizer.encode(string)

    def tok_decode(self, tokens: List[int]) -> str:
        return self.tokenizer.decode(tokens)

    def _encode_pair(
        self, context: str, continuation: str
    ) -> Tuple[List[int], List[int]]:
        """Tokenize `context` and `continuation` consistently with their concatenation.

        Trailing whitespace of `context` is moved to the front of
        `continuation` before encoding. The continuation tokens are whatever
        follows the first ``len(context_enc)`` tokens of the jointly encoded
        string.
        """
        n_spaces = len(context) - len(context.rstrip())
        if n_spaces > 0:
            continuation = context[-n_spaces:] + continuation
            context = context[:-n_spaces]
        whole_enc = self.tok_encode(context + continuation)
        context_enc = self.tok_encode(context)
        context_enc_len = len(context_enc)
        continuation_enc = whole_enc[context_enc_len:]
        return context_enc, continuation_enc

    def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
        """Score ``(context, continuation)`` request pairs.

        Each request's ``args`` is tokenized and handed to
        `_loglikelihood_tokens`; an empty context is replaced by the single
        end-of-text token.
        """
        new_reqs = []
        for context, continuation in [req.args for req in requests]:
            if context == "":
                # end of text as context
                context_enc, continuation_enc = [self.eot_token_id], self.tok_encode(
                    continuation
                )
            else:
                context_enc, continuation_enc = self._encode_pair(context, continuation)

            new_reqs.append(((context, continuation), context_enc, continuation_enc))

        return self._loglikelihood_tokens(new_reqs)

    def _loglikelihood_tokens(
        self, requests, disable_tqdm: bool = False
    ) -> List[Tuple[float, bool]]:
        """Score pre-tokenized requests via echoed completions.

        :param requests:
            Iterable of ``(cache_key, context_enc, continuation_enc)`` tuples;
            ``cache_key`` may be None, in which case nothing is cached
        :param disable_tqdm: bool
            Suppress the progress bar
        :return:
            One ``(continuation_logprob, is_greedy)`` pair per request, in the
            order the requests were given
        """
        res = []

        def _collate(x):
            # this doesn't efficiently handle last-token differences yet, but those are kinda annoying because
            # it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations
            # we care about, and so we need some kind of backup for when it isn't
            toks = x[1] + x[2]
            return -len(toks), tuple(toks)

        re_ord = utils.Reorderer(requests, _collate)

        for chunk in tqdm(
            list(utils.chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE)),
            disable=disable_tqdm,
        ):
            inps = []
            ctxlens = []
            for cache_key, context_enc, continuation_enc in chunk:
                # max_length+1 because the API takes up to 2049 tokens, including the first context token
                inp = (context_enc + continuation_enc)[-(self.max_length + 1) :]
                # TODO: the logic is much simpler if we just look at the length of continuation tokens
                # Shrink the context length by however many tokens the
                # left-truncation above removed.
                ctxlen = len(context_enc) - max(
                    0, len(context_enc) + len(continuation_enc) - (self.max_length + 1)
                )

                inps.append(inp)
                ctxlens.append(ctxlen)

            # echo=True with max_tokens=0: nothing is generated, the API only
            # returns log probabilities for the prompt tokens themselves.
            response = oa_completion(
                model=self.model,
                prompt=inps,
                echo=True,
                max_tokens=0,
                temperature=0.0,
                logprobs=10,
                seed=self.seed,
            )

            for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip(
                response.choices, ctxlens, chunk
            ):
                answer = get_result(resp, ctxlen)

                res.append(answer)

                # partial caching
                if cache_key is not None:
                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)
        return re_ord.get_original(res)

    def generate_until(self, requests) -> List[str]:
        """Generate a completion for every ``(context, gen_kwargs)`` request.

        Consecutive requests (after reordering) that share equal generation
        kwargs are sent together, at most `REQ_CHUNK_SIZE` per API call. From
        the kwargs, ``until`` supplies the stop sequences (default
        ``["<|endoftext|>"]``), ``max_gen_toks`` overrides the instance
        default, ``do_sample`` is dropped, ``temperature`` defaults to 0 and
        everything else is forwarded to the API. The returned text is cut at
        the first occurrence of each stop sequence.

        :return: generated strings in the order of `requests`
        """
        if not requests:
            return []
        res = []
        requests = [req.args for req in requests]

        def _collate(x):
            toks = self.tok_encode(x[0])
            return len(toks), x[0]

        re_ord = utils.Reorderer(requests, _collate)

        def sameuntil_chunks(xs, size):
            # Yield (requests, gen_kwargs) runs of at most `size` consecutive
            # requests whose gen_kwargs compare equal.
            ret = []
            lastuntil = xs[0][1]
            for x in xs:
                if len(ret) >= size or x[1] != lastuntil:
                    yield ret, lastuntil
                    ret = []
                    lastuntil = x[1]
                ret.append(x)

            if ret:
                yield ret, lastuntil

        # todo: more intelligent batching for heterogeneous `until`
        for chunk, request_args in tqdm(
            list(sameuntil_chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE))
        ):
            # Work on a private copy: the dict belongs to the request and may
            # be shared by several requests (e.g. repeats), so popping keys in
            # place would strip `until` for every later chunk that reuses it.
            request_args = copy.deepcopy(request_args)
            until = request_args.pop("until", ["<|endoftext|>"])
            request_args.pop("do_sample", None)
            # `max_gen_toks` is not an API parameter; consume it here instead
            # of forwarding it as an unknown keyword.
            max_gen_toks = request_args.pop("max_gen_toks", self.max_gen_toks)
            request_args["temperature"] = request_args.get("temperature", 0)

            inps = []
            for context, _ in chunk:
                context_enc = self.tok_encode(context)
                # keep only the rightmost tokens so prompt + generation fits
                inp = context_enc[-(self.max_length - max_gen_toks) :]
                inps.append(inp)

            response = oa_completion(
                model=self.model,
                prompt=inps,
                max_tokens=max_gen_toks,
                stop=until,
                seed=self.seed,
                **request_args,
            )

            # A bare string would otherwise be iterated character by
            # character below; `until` itself is left untouched for the
            # request and the cache key.
            stop_terms = [until] if isinstance(until, str) else until

            for resp, (context, args_) in zip(response.choices, chunk):
                s = resp.text

                for term in stop_terms:
                    if len(term) > 0:
                        s = s.split(term)[0]

                # partial caching
                self.cache_hook.add_partial(
                    "generate_until", (context, {"until": until}), s
                )

                res.append(s)
        return re_ord.get_original(res)

    def _model_call(self, inps):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    def _model_generate(self, context, max_length, eos_token_id):
        # Isn't used because we override generate_until
        raise NotImplementedError()

    def loglikelihood_rolling(self, requests) -> List[float]:
        """Return the total log-likelihood of each request string.

        Every string is tokenized, split into windows by
        ``utils.get_rolling_token_windows`` (prefixed with the end-of-text
        token), scored with `_loglikelihood_tokens`, and the per-window
        log-likelihoods are summed.
        """
        loglikelihoods = []

        for (string,) in tqdm([req.args for req in requests]):
            rolling_token_windows = list(
                map(
                    utils.make_disjoint_window,
                    utils.get_rolling_token_windows(
                        token_list=self.tok_encode(string),
                        prefix_token=self.eot_token_id,
                        max_seq_len=self.max_length,
                        context_len=1,
                    ),
                )
            )

            # TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
            # Prepend a None cache key so each window has the
            # (cache_key, context_enc, continuation_enc) shape.
            rolling_token_windows = [(None,) + x for x in rolling_token_windows]

            string_nll = self._loglikelihood_tokens(
                rolling_token_windows,
                disable_tqdm=True,
            )

            # discard is_greedy
            string_nll = [x[0] for x in string_nll]

            string_nll = sum(string_nll)
            loglikelihoods.append(string_nll)
        return loglikelihoods


324
def oa_chat_completion(client, **kwargs):
    """Query OpenAI API for chat completion.

    Retry with back-off until they respond

    :param client:
        Object whose ``chat.completions.create`` is called synchronously
        with ``**kwargs``
    :return:
        Whatever ``client.chat.completions.create`` returns
    :raises Exception: if the ``openai`` or ``tiktoken`` package cannot be imported

    The wait between attempts starts at 3 seconds and is multiplied by 1.5
    after every ``openai.OpenAIError``; there is no limit on the number of
    attempts.
    """
    try:
        import openai, tiktoken  # noqa: E401
    except ModuleNotFoundError:
        raise Exception(
            "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
        )

    backoff_time = 3
    while True:
        try:
            return client.chat.completions.create(**kwargs)
        except openai.OpenAIError:
            import traceback

            traceback.print_exc()
            time.sleep(backoff_time)
            backoff_time *= 1.5


353
354
@register_model("openai-chat-completions")
class OpenaiChatCompletionsLM(LM):
    """LM implementation backed by the OpenAI chat-completions endpoint.

    Only `generate_until` is supported; both log-likelihood methods raise
    ``NotImplementedError``.
    """

    def __init__(
        self, model: str = "gpt-3.5-turbo", truncate: bool = False, batch_size: int = 1
    ) -> None:
        """Set up the tokenizer and API client for `model`.

        :param model: str
            OpenAI API model (e.g. gpt-3.5-turbo)
        :param truncate: bool
            Stored as ``self.truncate``; not consulted anywhere else in this class
        :param batch_size: int
            Accepted but not used by this class
        :raises Exception: if `openai` or `tiktoken` cannot be imported
        """
        super().__init__()
        try:
            import openai, tiktoken  # noqa: E401
        except ModuleNotFoundError:
            # Adjacent literals instead of a backslash continuation so that
            # source indentation does not leak into the message.
            raise Exception(
                "attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. "
                "please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
            )
        self.model = model
        # fixed sampling parameters sent with every chat request
        self.frequency_penalty = 0
        self.logit_bias = None
        self.n = 1
        self.presence_penalty = 0
        self.temperature = 1
        self.top_p = 1
        self.tokenizer = tiktoken.encoding_for_model(self.model)
        self.vocab_size = self.tokenizer.n_vocab
        self.truncate = truncate
        self.end_of_text_token_id = self.tokenizer.eot_token

        # Read from environment variable OPENAI_API_KEY
        self.client = openai.OpenAI()  # openai.AsyncOpenAI()

    @property
    def eot_token_id(self):
        return self.end_of_text_token_id

    @property
    def max_length(self) -> int:
        # Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
        return 2048

    @property
    def max_gen_toks(self) -> int:
        return 256

    @property
    def batch_size(self):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    @property
    def device(self):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    def tok_encode(self, string: str) -> List[int]:
        return self.tokenizer.encode(string)

    def tok_decode(self, tokens: List[int]) -> str:
        return self.tokenizer.decode(tokens)

    def _encode_pair(
        self, context: str, continuation: str
    ) -> Tuple[List[int], List[int]]:
        """Tokenize `context` and `continuation` consistently with their concatenation.

        Trailing whitespace of `context` is moved to the front of
        `continuation` before encoding. The continuation tokens are whatever
        follows the first ``len(context_enc)`` tokens of the jointly encoded
        string.
        """
        n_spaces = len(context) - len(context.rstrip())
        if n_spaces > 0:
            continuation = context[-n_spaces:] + continuation
            context = context[:-n_spaces]
        whole_enc = self.tok_encode(context + continuation)
        context_enc = self.tok_encode(context)
        context_enc_len = len(context_enc)
        continuation_enc = whole_enc[context_enc_len:]
        return context_enc, continuation_enc

    def generate_until(self, requests) -> List[str]:
        """Generate a chat reply for every ``(context, gen_kwargs)`` request.

        Requests are grouped by the string form of their generation kwargs
        and sent one at a time, each as a single user message. From the
        kwargs, ``until`` (a string or list of strings) gives the sequences
        at which the reply is cut, and ``max_gen_toks`` overrides the default
        generation length.

        :return: generated strings in the order of `requests`
        :raises ValueError:
            if a request's generation kwargs are not a dict, or its
            ``until`` is neither a string nor a list
        """
        res = defaultdict(list)
        re_ords = {}

        def _collate(x):
            toks = self.tok_encode(x[0])
            return -len(toks), x[0]

        # we group requests by their generation_kwargs,
        # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
        # in the same batch.
        grouper = utils.Grouper(requests, lambda x: str(x.args[1]))
        for key, reqs in grouper.get_grouped().items():
            # within each set of reqs for given kwargs, we reorder by token length, descending.
            re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate)

        pbar = tqdm(total=len(requests), disable=(self.rank != 0))
        for key, re_ord in re_ords.items():
            # n needs to be 1 because messages in
            # chat completion are not batch but
            # is regarded as a single conversation.
            chunks = utils.chunks(re_ord.get_reordered(), n=1)
            for chunk in chunks:
                contexts, all_gen_kwargs = zip(*chunk)
                inps = [{"role": "user", "content": context} for context in contexts]

                gen_kwargs = all_gen_kwargs[0]
                if not isinstance(gen_kwargs, dict):
                    # Report the offending value itself; `kwargs` is not
                    # bound yet on this path.
                    raise ValueError(
                        f"Expected `kwargs` to be of type `dict` but got {gen_kwargs}"
                    )

                kwargs = copy.deepcopy(gen_kwargs)  # edge case for repeats > 1
                until = None
                if "until" in kwargs:
                    until = kwargs.pop("until")
                    if isinstance(until, str):
                        # wrap the stop string itself, not the kwargs dict
                        until = [until]
                    elif not isinstance(until, list):
                        raise ValueError(
                            f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
                        )

                max_gen_toks = kwargs.pop("max_gen_toks", self.max_gen_toks)

                # NOTE(review): whatever is left in `kwargs` (e.g. a
                # per-request temperature) is not forwarded; the request
                # always uses the instance-level sampling settings below.
                response = oa_chat_completion(
                    client=self.client,
                    messages=inps,
                    model=self.model,
                    frequency_penalty=self.frequency_penalty,
                    # logit_bias=self.logit_bias,
                    max_tokens=max_gen_toks,
                    n=self.n,
                    presence_penalty=self.presence_penalty,
                    temperature=self.temperature,
                    top_p=self.top_p,
                )

                for resp, (context, args_) in zip(response.choices, chunk):
                    s = resp.message.content

                    if until is not None:
                        for term in until:
                            if len(term) > 0:
                                s = s.split(term)[0]

                    res[key].append(s)

                    self.cache_hook.add_partial(
                        "generate_until", (context, {"until": until}), s
                    )
                    pbar.update(1)
            # reorder this group of results back to original unsorted form
            res[key] = re_ord.get_original(res[key])

        pbar.close()

        return grouper.get_original(res)

    def loglikelihood(self, requests):
        raise NotImplementedError("No support for logits.")

    def loglikelihood_rolling(self, requests):
        raise NotImplementedError("No support for logits.")