import os
import numpy as np
import transformers
from lm_eval.base import BaseLM
from lm_eval import utils
from tqdm import tqdm
import time


def get_result(response, ctxlen):
    """Process results from OpenAI API response.

    :param response: dict
        OpenAI API Response
    :param ctxlen: int
        Length of context (so we can slice them away and only keep the predictions)
    :return:
        continuation_logprobs: float
            Sum of log probabilities of continuation tokens
        is_greedy: bool
            Whether argmax matches given continuation exactly
    """
    is_greedy = True
    logprobs = response["logprobs"]["token_logprobs"][:-1]
    continuation_logprobs = sum(logprobs[ctxlen:])

    for i in range(ctxlen, len(response["logprobs"]["tokens"][:-1])):
        token = response["logprobs"]["tokens"][:-1][i]
        top_tokens = response["logprobs"]["top_logprobs"][:-1][i]
        top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x])
        if top_token != token:
            is_greedy = False
            break

    return continuation_logprobs, is_greedy


class _goose:
    # Minimal container mimicking the `choices` attribute of an OpenAI
    # Completion response, used when batched prompts are fanned out as
    # individual requests.
    choices: list


def oa_completion(**kwargs):
    """Query OpenAI API for completion.

    Retry with back-off until they respond.
    """
    import openai

    backoff_time = 3

    # When given a batch of prompts, dispatch them as individual single-prompt
    # requests in parallel (via dask) and stitch the choices back together
    # into a single response-like object.
    if isinstance(kwargs["prompt"], list) and len(kwargs["prompt"]) > 1:
        import dask

        res = []
        for pmpt in kwargs["prompt"]:
            k = kwargs.copy()
            k["prompt"] = [pmpt]
            res.append(dask.delayed(oa_completion)(**k))
        r = dask.compute(*res)
        ob = _goose()
        ob.choices = [x.choices[0] for x in r]
        return ob

    while True:
        try:
            return openai.Completion.create(**kwargs)
        except openai.error.OpenAIError:
            import traceback

            traceback.print_exc()
            time.sleep(backoff_time)
            backoff_time *= 1.5


class GPT3LM(BaseLM):

    REQ_CHUNK_SIZE = 20

    def __init__(self, engine, truncate=False, api_key=None, pass_strings=False):
        """
        :param engine: str
            OpenAI API engine (e.g. davinci)
        :param truncate: bool
            Truncate input if too long (if False and input is too long, throw error)
        """
        super().__init__()

        import openai

        self.engine = engine
        self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
        self.pass_strings = pass_strings
        self.vocab_size = self.tokenizer.vocab_size

        # To make the annoying "Using pad_token, but it is not set yet." error go away
        self.tokenizer.pad_token = "<|endoftext|>"
        assert self.tokenizer.encode('hello\n\nhello') == [31373, 198, 198, 31373]
        self.truncate = truncate
        self.end_of_text_token_id = self.tokenizer.convert_tokens_to_ids(["<|endoftext|>"])[0]

        # Read from environment variable OPENAI_API_SECRET_KEY
        openai.api_key = api_key or os.environ["OPENAI_API_SECRET_KEY"]

    @property
    def eot_token_id(self):
        return self.tokenizer.eos_token_id

    @property
    def max_length(self):
        # Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
        return 2048

    @property
    def max_gen_toks(self):
        return 256

    @property
    def batch_size(self):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    @property
    def device(self):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    def tok_encode(self, string: str):
        return self.tokenizer.encode(string, add_special_tokens=False)

    def tok_decode(self, tokens):
        return self.tokenizer.decode(tokens)

    def _loglikelihood_tokens(self, requests, disable_tqdm=False):
        res = []

        def _collate(x):
            # this doesn't efficiently handle last-token differences yet, but those are kinda annoying because
            # it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations
            # we care about and so we need some kind of backup for when it isn't
            toks = x[1] + x[2]
            return -len(toks), tuple(toks)

        reord = utils.Reorderer(requests, _collate)

        for chunk in tqdm(list(utils.chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE)), disable=disable_tqdm):
            inps = []
            ctxlens = []
            for cache_key, context_enc, continuation_enc in chunk:
                # max_length+1 because the API takes up to 2049 tokens, including the first context token
                inp = (context_enc + continuation_enc)[-(self.max_length + 1):]
                # TODO: the logic is much simpler if we just look at the length of continuation tokens
                ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - (self.max_length + 1))

                if self.pass_strings:
                    inp = self.tok_decode(inp)
                inps.append(inp)
                ctxlens.append(ctxlen)

            response = oa_completion(
                engine=self.engine,
                prompt=inps,
                echo=True,
                max_tokens=1,
                logprobs=10,
            )

            for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip(response.choices, ctxlens, chunk):
                answer = get_result(resp, ctxlen)

                res.append(answer)

                # partial caching
                if cache_key is not None:
                    self.cache_hook.add_partial("loglikelihood", cache_key, answer)

        return reord.get_original(res)

    def greedy_until(self, requests):
        if not requests:
            return []
        res = []

        def _collate(x):
            toks = self.tok_encode(x[0])
            return len(toks), x[0]

        reord = utils.Reorderer(requests, _collate)

        def sameuntil_chunks(xs, size):
            ret = []
            lastuntil = xs[0][1]
            for x in xs:
                if len(ret) >= size or x[1] != lastuntil:
                    yield ret, lastuntil
                    ret = []
                    lastuntil = x[1]
                ret.append(x)

            if ret:
                yield ret, lastuntil

        # todo: more intelligent batching for heterogeneous `until`
        for chunk, until in tqdm(list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))):
            inps = []
            for context, _ in chunk:
                context_enc = self.tok_encode(context)
                inp = context_enc[-(self.max_length - self.max_gen_toks):]
                inps.append(self.tok_decode(inp))

            response = oa_completion(
                engine=self.engine,
                prompt=inps,
                max_tokens=self.max_gen_toks,
                temperature=0.0,
                stop=until,
            )

            for resp, (context, until_) in zip(response.choices, chunk):
                s = resp['text']

                for term in until_:
                    s = s.split(term)[0]

                # partial caching
                self.cache_hook.add_partial("greedy_until", (context, until_), s)

                res.append(s)

        return reord.get_original(res)
    def _model_call(self, inps):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    def _model_generate(self, context, max_length, eos_token_id):
        # Isn't used because we override greedy_until
        raise NotImplementedError()


class GooseAILM(GPT3LM):

    def __init__(self, engine, truncate=False, api_key=None, force_pile_tokenizer=False):
        super().__init__(
            engine,
            truncate=truncate,
            api_key=api_key or os.environ["GOOSEAI_API_SECRET_KEY"],
            pass_strings=True,
        )
        self.REQ_CHUNK_SIZE = 1

        import openai
        openai.api_base = "https://api.goose.ai/v1"

        from best_download import download_file

        if engine == "gpt-neo-20b" or force_pile_tokenizer:
            download_file(
                "http://eaidata.bmk.sh/data/pile_tokenizer.json",
                expected_checksum="d27f071586925d23ef1c4acdee28fb8bf5d99c4a9d638b4e3b08812e3eae6ee7",
                local_file="pile_tokenizer.json",
            )
            self.tokenizer = transformers.PreTrainedTokenizerFast(tokenizer_file="pile_tokenizer.json")

    @property
    def max_length(self):
        # Note: this is temporary, will be raised to 2048 in the future
        return 1023

    @property
    def eot_token_id(self):
        return self.tokenizer.eos_token_id

    @property
    def max_gen_toks(self):
        return 64
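

# Minimal usage sketch (not part of the original module): it assumes
# OPENAI_API_SECRET_KEY is set in the environment and that the engine name is
# one your account can access; the prompt/continuation strings are purely
# illustrative. `_loglikelihood_tokens` takes (cache_key, context_enc,
# continuation_enc) tuples of already-encoded token ids and returns
# (logprob_sum, is_greedy) pairs, as implemented above.
if __name__ == "__main__":
    lm = GPT3LM(engine="davinci")  # hypothetical engine choice
    context_enc = lm.tok_encode("The capital of France is")
    continuation_enc = lm.tok_encode(" Paris")
    [(logprob, is_greedy)] = lm._loglikelihood_tokens(
        [(None, context_enc, continuation_enc)], disable_tqdm=True
    )
    print(logprob, is_greedy)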