Unverified Commit b0f67f2c authored by Hailey Schoelkopf, committed by GitHub

Merge pull request #736 from baberabb/big-refactor_opai

[Refactor] fixed openai
parents 6c753760 ca386392
......@@ -36,15 +36,19 @@ The LM class enforces a common interface via which we can extract responses from
```python
class MyCustomLM(LM):
#...
def loglikelihood(self, requests):
def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
#...
def loglikelihood_rolling(self, requests):
def loglikelihood_rolling(self, requests: list[Instance]) -> list[tuple[float, bool]]:
#...
def greedy_until(self, requests):
def greedy_until(self, requests: list[Instance]) -> list[str]:
#...
#...
```
Where `Instance` is a dataclass defined in [`lm_eval.api.instance`](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/api/instance.py) with property `args` which returns a tuple of (context, continuation).
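For instance, a minimal stub might unpack each `Instance` via its `args` property. The `DummyLM` class and its constant return values below are purely illustrative, not part of the harness:

```python
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM


class DummyLM(LM):
    """Illustrative stub: returns fixed scores instead of querying a real model."""

    def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
        results = []
        for request in requests:
            context, continuation = request.args  # unpack (context, continuation)
            results.append((-1.0, False))  # placeholder (logprob, isgreedy)
        return results

    def loglikelihood_rolling(self, requests: list[Instance]) -> list[tuple[float, bool]]:
        raise NotImplementedError  # omitted in this sketch

    def greedy_until(self, requests: list[Instance]) -> list[str]:
        return ["" for _ in requests]  # placeholder generations
```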
We support
......
import abc
import os
from typing import Union
from typing import Union, List, Tuple
from sqlitedict import SqliteDict
import json
import hashlib
......@@ -25,31 +25,32 @@ class LM(abc.ABC):
self.cache_hook = CacheHook(None)
@abc.abstractmethod
def loglikelihood(self, requests):
def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
"""Compute log-likelihood of generating a continuation from a context.
Downstream tasks should attempt to use loglikelihood instead of other
LM calls whenever possible.
:param requests: list
A list of pairs (context, continuation)
context: str
:param requests: list[Instance]
A list of Instance objects, with property `args` which returns a tuple (context, continuation).
`context: str`
Context string. Implementations of LM must be able to handle an
empty context string.
continuation: str
`continuation: str`
The continuation over which log likelihood will be calculated. If
there is a word boundary, the space should be in the continuation.
For example, context="hello" continuation=" world" is correct.
:return: list
:return: list[tuple[float, bool]]
A list of pairs (logprob, isgreedy)
logprob: float
The log probability of `continuation`
isgreedy:
Whether `continuation` would be generated by greedy sampling from `context`
`logprob: float`
The log probability of `continuation`.
`isgreedy`:
Whether `continuation` would be generated by greedy sampling from `context`.
"""
pass
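To make the word-boundary rule above concrete: the scored string is always `context + continuation`, so a boundary space must travel with the continuation. A small illustration (the split shown is the point, not any particular tokenizer):

```python
# Both pairs concatenate to the same string, but only the first split
# scores the continuation tokens correctly:
good = ("hello", " world")  # correct: the space rides with the continuation
bad = ("hello ", "world")   # wrong: a BPE tokenizer would fold the trailing
                            # space into the context's final token(s)
assert "".join(good) == "".join(bad) == "hello world"
```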
@abc.abstractmethod
def loglikelihood_rolling(self, requests):
def loglikelihood_rolling(self, requests) -> List[Tuple[float, bool]]:
"""Compute full log-likelihood of a string, with no truncation, for perplexity computation
- We will use the full max context length of the model.
- For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
......@@ -77,11 +78,11 @@ class LM(abc.ABC):
1. Each token is predicted exactly once
2. For the last pair, we provide the full context, but only score the last two tokens
:param requests: list
A list of strings
:param requests: list[Instance]
A list of Instance objects with property `args` which returns a tuple (context, continuation).
string: str
String for which we are computing per-token loglikelihood
:return: list
:return: list[tuple[float, bool]]
A list of pairs (logprob, isgreedy)
logprob: float
The log probability of `continuation`
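A hand-rolled sketch of the windowing idea above, for intuition only; the harness's real helpers in `lm_eval.utils` additionally refill the final window with preceding context so its tail is scored with maximal context:

```python
def rolling_windows(tokens: list[int], prefix_token: int, max_len: int):
    """Toy version: inputs are the tokens shifted right by one (seeded with a
    prefix token) and chunked so each token is predicted exactly once."""
    shifted = [prefix_token] + tokens[:-1]  # input at position i predicts tokens[i]
    return [
        (shifted[start : start + max_len], tokens[start : start + max_len])
        for start in range(0, len(tokens), max_len)
    ]
```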
......@@ -92,17 +93,17 @@ class LM(abc.ABC):
# TODO: Add an optional max length
@abc.abstractmethod
def greedy_until(self, requests):
def greedy_until(self, requests) -> List[str]:
"""Generate greedily until a stopping sequence
:param requests: list
A list of pairs (context, until)
:param requests: list[Instance]
A list of Instance objects with property `args` which returns a tuple (context, until).
context: str
Context string
until: [str]
The string sequences to generate until. These string sequences
may each span across multiple tokens, or may be part of one token.
:return: list
:return: list[str]
A list of continuation strings
continuation: str
The generated continuation.
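Applying the stop sequences to the decoded text usually amounts to truncating at the first occurrence of any `until` string, e.g. this minimal helper sketch (the function name is illustrative):

```python
def truncate_at_stop(generation: str, until: list[str]) -> str:
    """Cut the generated text at the first occurrence of any stop sequence."""
    for term in until:
        if term:  # skip empty stop strings
            generation = generation.split(term)[0]
    return generation


# truncate_at_stop("A. Paris\nB. London", ["\n"]) == "A. Paris"
```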
......
......@@ -85,7 +85,9 @@ def simple_evaluate(
1234
) # TODO: this may affect training runs that are run with evaluation mid-run.
assert tasks != [], "No tasks specified"
assert (
tasks != []
), "No tasks specified, or no tasks found. Please verify the task names."
if isinstance(model, str):
if model_args is None:
......
import os
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from tqdm import tqdm
import time
from lm_eval.logger import eval_logger
from typing import List, Literal, Any
from typing import List, Any, Tuple
def anthropic_completion(
......@@ -15,10 +14,25 @@ def anthropic_completion(
temperature: float,
stop: List[str],
**kwargs: Any,
):
"""Query Anthropic API for completion.
Retry with back-off until they respond
) -> str:
"""Wrapper function around the Anthropic completion API client with exponential back-off
in case of RateLimitError.
params:
client: anthropic.Anthropic
Anthropic API client
model: str
Anthropic model, e.g. 'claude-instant-v1' or 'claude-2'
prompt: str
Prompt to feed to the model
max_tokens_to_sample: int
Maximum number of tokens to sample from the model
temperature: float
Sampling temperature
stop: List[str]
List of stop sequences
kwargs: Any
Additional model_args to pass to the API client
"""
try:
......@@ -29,7 +43,7 @@ def anthropic_completion(
please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e .[anthropic]`",
)
backoff_time = 3
backoff_time: float = 3
while True:
try:
response = client.completions.create(
......@@ -94,15 +108,15 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
@property
def eot_token_id(self):
# Not sure but anthropic.AI_PROMPT -> [203, 203, 50803, 30]
# Not sure but anthropic.HUMAN_PROMPT ?
raise NotImplementedError("No idea about anthropic tokenization.")
@property
def max_length(self):
def max_length(self) -> int:
return 2048
@property
def max_gen_toks(self):
def max_gen_toks(self) -> int:
return self.max_tokens_to_sample
@property
......@@ -124,14 +138,15 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
raise NotImplementedError("No support for logits.")
def greedy_until(self, requests):
def greedy_until(self, requests) -> List[str]:
if not requests:
return []
requests = [req.args for req in requests]
_requests: List[Tuple[str, dict]] = [req.args for req in requests]
res = []
for request in tqdm(requests):
for request in tqdm(_requests):
try:
inp = request[0]
request_args = request[1]
......@@ -145,16 +160,16 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
prompt=inp,
max_tokens_to_sample=max_gen_toks,
temperature=temperature, # TODO: implement non-greedy sampling for Anthropic
stop=until,
stop=until, # type: ignore
**self.kwargs,
)
res.append(response)
self.cache_hook.add_partial("greedy_until", request, response)
except anthropic.APIConnectionError as e: # noqa: F821
except anthropic.APIConnectionError as e: # type: ignore # noqa: F821
eval_logger.critical(f"Server unreachable: {e.__cause__}")
break
except anthropic.APIStatusError as e: # noqa: F821
except anthropic.APIStatusError as e: # type: ignore # noqa: F821
eval_logger.critical(f"API error {e.status_code}: {e.message}")
break
......
import os
import time
import transformers
import numpy as np
from typing import List, Tuple
from tqdm import tqdm
from lm_eval import utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
def get_result(response, ctxlen):
def get_result(response: dict, ctxlen: int) -> Tuple[float, bool]:
"""Process results from OpenAI API response.
:param response: dict
......@@ -43,7 +40,13 @@ def oa_completion(**kwargs):
Retry with back-off until they respond
"""
import openai
try:
import openai, tiktoken # noqa: E401
except ModuleNotFoundError:
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
)
backoff_time = 3
while True:
......@@ -61,7 +64,12 @@ def oa_completion(**kwargs):
class OpenaiCompletionsLM(LM):
REQ_CHUNK_SIZE = 20
def __init__(self, engine, truncate=False):
def __init__(
self,
engine: str = "text-davinci-003",
truncate: bool = False,
batch_size: int = 1,
):
"""
:param engine: str
......@@ -70,28 +78,25 @@ class OpenaiCompletionsLM(LM):
Truncate input if too long (if False and input is too long, throw error)
"""
super().__init__()
import openai
try:
import openai, tiktoken # noqa: E401
except ModuleNotFoundError:
raise Exception(
"attempted to use 'openai' LM type, but package `openai` or `tiktoken` are not installed. \
please install these via `pip install lm-eval[openai]` or `pip install -e .[openai]`",
)
self.engine = engine
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2")
self.vocab_size = self.tokenizer.vocab_size
# to make the annoying "Using pad_token, but it is not set yet." error go away
self.tokenizer.pad_token = "<|endoftext|>"
assert self.tokenizer.encode("hello\n\nhello") == [31373, 198, 198, 31373]
self.tokenizer = tiktoken.encoding_for_model(self.engine)
self.vocab_size = self.tokenizer.n_vocab
self.truncate = truncate
self.end_of_text_token_id = self.tokenizer.convert_tokens_to_ids(
["<|endoftext|>"]
)[0]
self.end_of_text_token_id = self.tokenizer.eot_token
# Read from environment variable OPENAI_API_SECRET_KEY
openai.api_key = os.environ["OPENAI_API_SECRET_KEY"]
@property
def eot_token_id(self):
return self.tokenizer.eos_token_id
return self.end_of_text_token_id
@property
def max_length(self):
......@@ -112,19 +117,49 @@ class OpenaiCompletionsLM(LM):
# Isn't used because we override _loglikelihood_tokens
raise NotImplementedError()
def tok_encode(self, string: str):
return self.tokenizer.encode(string, add_special_tokens=False)
def tok_encode(self, string: str) -> List[int]:
return self.tokenizer.encode(string)
def tok_decode(self, tokens):
def tok_decode(self, tokens: List[int]) -> str:
return self.tokenizer.decode(tokens)
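With the switch to `tiktoken`, an encode/decode round-trip through the new tokenizer looks like this (the engine name is only an example):

```python
import tiktoken

enc = tiktoken.encoding_for_model("text-davinci-003")
tokens = enc.encode("hello\n\nhello")
assert enc.decode(tokens) == "hello\n\nhello"
print(enc.n_vocab, enc.eot_token)  # vocab size and end-of-text token id
```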
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
def _encode_pair(
self, context: str, continuation: str
) -> Tuple[List[int], List[int]]:
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
whole_enc = self.tok_encode(context + continuation)
context_enc = self.tok_encode(context)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc
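In effect, a context with trailing whitespace is re-split so the space is encoded with the continuation, and the continuation ids are sliced out of the jointly encoded string, keeping BPE merges at the boundary consistent. For example (assuming `OPENAI_API_SECRET_KEY` is set so the class can be constructed):

```python
lm = OpenaiCompletionsLM(engine="text-davinci-003")
ctx_enc, cont_enc = lm._encode_pair("hello ", "world")
# Behaves like encoding ("hello", " world"): the trailing space moved into
# the continuation, and cont_enc is tok_encode("hello world") with the
# context prefix stripped.
assert lm.tok_decode(ctx_enc + cont_enc) == "hello world"
```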
def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
new_reqs = []
for context, continuation in [req.args for req in requests]:
if context == "":
# end of text as context
context_enc, continuation_enc = [self.eot_token_id], self.tok_encode(
continuation
)
else:
context_enc, continuation_enc = self._encode_pair(context, continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
return self._loglikelihood_tokens(new_reqs)
def _loglikelihood_tokens(
self, requests, disable_tqdm=False
) -> List[Tuple[float, bool]]:
res = []
def _collate(x):
# this doesn't efficiently handle last-token differences yet, but those are kinda annoying because
# it's not guaranteed that the 100 or so logprobs we get to see actually contain all the continuations
# we care about and so we need some kind of backup for when it isn't
# we care about, and so we need some kind of backup for when it isn't
toks = x[1] + x[2]
return -len(toks), tuple(toks)
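The sort key orders requests longest-first, so the most expensive batches surface memory problems early, and identical token sequences end up adjacent. A quick illustration with the `(cache_key, context_enc, continuation_enc)` tuples built in `loglikelihood` above:

```python
reqs = [(None, [1, 2], [3]), (None, [7], [8, 9, 10])]
keys = [(-len(c + k), tuple(c + k)) for _, c, k in reqs]
# keys == [(-3, (1, 2, 3)), (-4, (7, 8, 9, 10))], so the 4-token request
# sorts ahead of the 3-token one:
print(sorted(reqs, key=lambda x: (-len(x[1] + x[2]), tuple(x[1] + x[2]))))
```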
......@@ -166,13 +201,13 @@ class OpenaiCompletionsLM(LM):
# partial caching
if cache_key is not None:
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return re_ord.get_original(res)
def greedy_until(self, requests):
def greedy_until(self, requests) -> List[str]:
if not requests:
return []
res = []
requests = [req.args for req in requests]
def _collate(x):
toks = self.tok_encode(x[0])
......@@ -203,12 +238,7 @@ class OpenaiCompletionsLM(LM):
inp = context_enc[-(self.max_length - self.max_gen_toks) :]
inps.append(inp)
try:
until = request_args["until"][
0
] # TODO: does this handle a list of stop seqs correctly?
except KeyError:
until = "<|endoftext|>"
until = request_args.get("until", ["<|endoftext|>"])
response = oa_completion(
engine=self.engine,
......@@ -222,7 +252,7 @@ class OpenaiCompletionsLM(LM):
for resp, (context, args_) in zip(response.choices, chunk):
s = resp["text"]
until_ = args_.get(["until"], [])
until_ = args_.get("until", ["<|endoftext|>"])
for term in until_:
if len(term) > 0:
......@@ -234,7 +264,6 @@ class OpenaiCompletionsLM(LM):
)
res.append(s)
return re_ord.get_original(res)
def _model_call(self, inps):
......@@ -244,3 +273,34 @@ class OpenaiCompletionsLM(LM):
def _model_generate(self, context, max_length, eos_token_id):
# Isn't used because we override greedy_until
raise NotImplementedError()
def loglikelihood_rolling(self, requests) -> List[float]:
loglikelihoods = []
for (string,) in tqdm([req.args for req in requests]):
rolling_token_windows = list(
map(
utils.make_disjoint_window,
utils.get_rolling_token_windows(
token_list=self.tok_encode(string),
prefix_token=self.eot_token_id,
max_seq_len=self.max_length,
context_len=1,
),
)
)
# TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
string_nll = self._loglikelihood_tokens(
rolling_token_windows,
disable_tqdm=True,
)
# discard is_greedy
string_nll = [x[0] for x in string_nll]
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
return loglikelihoods
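To inspect what those helpers produce, one can call them on a toy token list (the prefix id 50256 is GPT-2's end-of-text token; the exact window contents are defined in `lm_eval.utils`):

```python
from lm_eval import utils

windows = [
    utils.make_disjoint_window(w)
    for w in utils.get_rolling_token_windows(
        token_list=list(range(10)),
        prefix_token=50256,  # GPT-2 / tiktoken end-of-text id
        max_seq_len=4,
        context_len=1,
    )
]
for context, continuation in windows:
    print(context, continuation)  # disjoint (context, continuation) chunks
```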
......@@ -36,7 +36,6 @@ setuptools.setup(
"evaluate>=0.4.0",
"jsonlines",
"numexpr",
"openai>=0.6.4",
"omegaconf>=2.2",
"peft>=0.2.0",
"pybind11>=2.6.2",
......@@ -67,5 +66,6 @@ setuptools.setup(
],
"gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
"anthropic": ["anthropic"],
"openai": ["openai", "tiktoken"],
},
)