Unverified commit 42dc2448 authored by Baber Abbasi, committed by GitHub

Refactor API models (#2008)



* refactor pad_token handling to fn

* fix docs

* add pad_token_handling to vllm

* start on API superclass

* don't detokenize the returned logits

* streamline vllm tokenizer

* add type hint

* pre-commit

* seems to be in working order

* add model to init

* refactor api models

* nit

* cleanup

* add pbar

* fix type hints

* change optional dependencies

* json encode chat template

* add type hints

* deal with different prompt input requirements

* nits

* fix

* cache inside async

* fix

* fix

* nits

* nits

* nits

* nit

* fixup

* fixup

* nit

* add dummy retry

* add dummy retry

* handle imports; skip failing test

* add type hint

* add tests

* add dependency to tests

* add package names to exception

* nit

* docs; type hints

* handle api key

* nit

* tokenizer bug

* fix tokenizer

* nit

* nit

* add better error messages

* nit

* remove decorator

* CI: install api dep

* revert evaluator.py

* consolidate

* consolidate

* nits

* nit

* fix typealias

* nit

* nit

* nit

* Update lm_eval/models/api_models.py

typo

Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>

* Update lm_eval/models/openai_completions.py

Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>

* Update lm_eval/models/anthropic_llms.py

Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>

* Update lm_eval/models/api_models.py

Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>

* fix typo

* add news section

* add info for API

* pre-commit

* typo

* fix bug: unpack loglikelihood requests

* fix bug: shared gen_kwargs mutated

* nit: handle copy properly

* Update README.md

* Update README.md

* Update README.md

* Update api_models.py

* Update README.md

---------
Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com>
parent 4a62757d
@@ -56,7 +56,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -e '.[dev,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install -e '.[dev,sentencepiece,api]' --extra-index-url https://download.pytorch.org/whl/cpu
           # Install optional git dependencies
           # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
           # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
@@ -84,7 +84,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install -e '.[dev,optimum,deepsparse,sparseml]' --extra-index-url https://download.pytorch.org/whl/cpu
+          pip install -e '.[dev,optimum,deepsparse,sparseml,api]' --extra-index-url https://download.pytorch.org/whl/cpu
       - name: Test with pytest
         run: python -m pytest tests/models --showlocals -s -vv
       - name: Archive artifacts
......
This diff is collapsed.
@@ -55,7 +55,7 @@ class LM(abc.ABC):
         pass

     @abc.abstractmethod
-    def loglikelihood_rolling(self, requests) -> List[Tuple[float]]:
+    def loglikelihood_rolling(self, requests) -> List[float]:
         """Compute full log-likelihood of a string, with no truncation, for perplexity computation
         - We will use the full max context length of the model.
         - For inputs that exceed the max context length, we divide the tokenized string into chunks of up to
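To make the chunking behavior concrete, here is a minimal sketch of the rolling-window idea described in that docstring (illustrative only, not the harness's actual windowing utility; `rolling_windows` is a hypothetical helper and this simplified version conditions each chunk on at most one preceding token):

```python
from typing import List, Tuple


def rolling_windows(
    tokens: List[int], max_seq_len: int
) -> List[Tuple[List[int], List[int]]]:
    """Split `tokens` into (context, target) windows so every token is
    scored exactly once and no window exceeds the model's context length."""
    windows = []
    for start in range(0, len(tokens), max_seq_len):
        target = tokens[start : start + max_seq_len]
        # The first window has an empty context (a BOS/prefix token in
        # practice); later windows condition on the preceding token.
        context = tokens[max(0, start - 1) : start]
        windows.append((context, target))
    return windows
```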
@@ -101,14 +101,13 @@
         """Generate greedily until a stopping sequence

         :param requests: list[Instance]
-            A list of Instance objects with property `args` which returns a tuple (context, until).
+            A list of Instance objects with property `args` which returns a tuple (context, gen_kwargs).
             context: str
                 Context string
-            until: [str]
-                The string sequences to generate until. These string sequences
-                may each span across multiple tokens, or may be part of one token.
+            gen_kwargs: dict
+                A dictionary of keyword arguments to pass to the generation function e.g. top_k, until, etc.
         :return: list[str]
-            A list of strings continuation
+            A list of model generated continuations.
             continuation: str
                 The generated continuation.
         """
@@ -325,14 +324,19 @@ class TemplateLM(LM):
         return self.eot_token_id

     @abc.abstractmethod
-    def tok_encode(self, string: str, **kwargs):
+    def tok_encode(self, string: str, **kwargs) -> List[int]:
+        """
+        Tokenize a string using the model's tokenizer and return a list of token IDs.
+        """
         pass

     @abc.abstractmethod
-    def _loglikelihood_tokens(self, requests, **kwargs):
+    def _loglikelihood_tokens(self, requests, **kwargs) -> List[Tuple[float, bool]]:
         pass

-    def _encode_pair(self, context, continuation):
+    def _encode_pair(
+        self, context: str, continuation: str
+    ) -> Tuple[List[int], List[int]]:
         n_spaces = len(context) - len(context.rstrip())
         if n_spaces > 0:
             continuation = context[-n_spaces:] + continuation
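The whitespace shuffling in `_encode_pair` matters because BPE-style tokenizers fold a leading space into the next word, so splitting on the raw string boundary would change tokenization. A small sketch of the effect (assumes the `transformers` GPT-2 tokenizer, purely for illustration):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
context, continuation = "The capital of France is ", "Paris"

# Move the context's trailing space onto the continuation, as _encode_pair does.
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
    continuation = context[-n_spaces:] + continuation  # " Paris"
    context = context[:-n_spaces]                      # "The capital of France is"

# " Paris" encodes to the leading-space token GPT-2 would actually produce
# after this context, whereas "Paris" alone encodes differently.
print(tok.encode(" Paris"), tok.encode("Paris"))
```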
@@ -373,7 +377,7 @@
     @abc.abstractmethod
     def loglikelihood_rolling(
         self, requests, disable_tqdm: bool = False
-    ) -> List[Tuple[float, bool]]:
+    ) -> List[float]:
         pass

     @abc.abstractmethod
......
 from . import (
     anthropic_llms,
+    api_models,
     dummy,
     gguf,
     huggingface,
......
-from typing import Any, List, Tuple
+import os
+from functools import cached_property
+from typing import Any, Dict, List, Tuple, Union

 from tqdm import tqdm

 from lm_eval import utils
 from lm_eval.api.model import LM
 from lm_eval.api.registry import register_model
+from lm_eval.models.openai_completions import LocalCompletionsAPI
 from lm_eval.models.utils import retry_on_specific_exceptions
@@ -138,7 +141,7 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
         return messages()

-@register_model("anthropic")
+@register_model("anthropic-completions")
 class AnthropicLM(LM):
     REQ_CHUNK_SIZE = 20  # TODO: not used
@@ -271,90 +274,89 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
 @register_model("anthropic-chat", "anthropic-chat-completions")
-class AnthropicChatLM(AnthropicLM):
-    REQ_CHUNK_SIZE = 20  # TODO: not used
+class AnthropicChat(LocalCompletionsAPI):
     def __init__(
         self,
-        model: str,
-        batch_size: int = 1,
-        max_tokens: int = 256,
-        temperature: float = 0,  # defaults to 1
-        **kwargs,  # top_p, top_k, etc.
-    ) -> None:
-        """Anthropic API wrapper.
-        :param model: str
-            Anthropic model e.g. 'claude-3-opus-20240229', 'claude-3-sonnet-20240229'
-        :param max_tokens: int
-            Maximum number of tokens to sample from the model
-        :param temperature: float
-            Sampling temperature
-        :param kwargs: Any
-            Additional model_args to pass to the API client
-        """
-        super().__init__()
-        try:
-            import anthropic
-        except ModuleNotFoundError:
-            raise Exception(
-                "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
-please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
-            )
-        self.model = model
-        # defaults to os.environ.get("ANTHROPIC_API_KEY")
-        self.client = anthropic.Anthropic()
-        self.temperature = temperature
-        self.max_tokens = max_tokens
-        self.tokenizer = self.client.get_tokenizer()
-        self.kwargs = kwargs
-
-    @property
-    def max_gen_toks(self) -> int:
-        return self.max_tokens
-
-    def generate_until(self, requests) -> List[str]:
-        try:
-            import anthropic
-        except ModuleNotFoundError:
-            raise Exception(
-                "attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
-please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
-            )
-        if not requests:
-            return []
-        _requests: List[Tuple[str, dict]] = [req.args for req in requests]
+        base_url="https://api.anthropic.com/v1/messages",
+        tokenizer_backend=None,
+        **kwargs,
+    ):
+        super().__init__(
+            base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
+        )
+        eval_logger.warning(
+            "Chat completions does not support batching. Defaulting to batch size 1."
+        )
+        self._batch_size = 1
+        self.anthropic_version = "2023-06-01"
+        eval_logger.warning(
+            f"Using Anthropic Version: {self.anthropic_version}. Confirm the current version here: https://docs.anthropic.com/en/api/versioning"
+        )
+
+    @cached_property
+    def api_key(self):
+        """Override this property to return the API key for the API request."""
+        key = os.environ.get("ANTHROPIC_API_KEY", None)
+        if key is None:
+            raise ValueError(
+                "API key not found. Please set the ANTHROPIC_API_KEY environment variable."
+            )
+        return key
+    @cached_property
+    def header(self):
+        return {
+            "x-api-key": f"{self.api_key}",
+            "anthropic-version": self.anthropic_version,
+        }
+
+    def _create_payload(
+        self, messages: List[Dict], generate=True, gen_kwargs: dict = None, **kwargs
+    ) -> dict:
+        system = (
+            messages[0].get("content") if messages[0].get("role") == "system" else None
+        )
+        if system:
+            messages = messages[1:]
+        gen_kwargs.pop("do_sample", False)
+        max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
+        temperature = gen_kwargs.pop("temperature", 0)
+        stop = gen_kwargs.pop("until", ["\n\nHuman:"])
+        if not isinstance(stop, list):
+            stop = [stop]
+        out = {
+            "messages": messages,
+            "model": self.model,
+            "max_tokens": max_tokens,
+            "temperature": temperature,
+            "stop_sequences": stop,
+            **gen_kwargs,
+        }
+        if system:
+            out["system"] = system
+        return out
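To sanity-check the payload construction above: a chat with a leading system turn would be translated roughly as follows (hypothetical values; note `gen_kwargs` is consumed destructively via `pop`):

```python
# Hypothetical input and the payload _create_payload would build from it.
messages = [
    {"role": "system", "content": "You are a terse assistant."},
    {"role": "user", "content": "Name the largest planet."},
]
gen_kwargs = {"max_gen_toks": 64}

# The system turn is lifted out of `messages` into a top-level "system" field,
# and unset knobs fall back to their defaults:
expected_payload = {
    "messages": [{"role": "user", "content": "Name the largest planet."}],
    "model": "claude-3-sonnet-20240229",  # whatever model was configured
    "max_tokens": 64,
    "temperature": 0,
    "stop_sequences": ["\n\nHuman:"],
    "system": "You are a terse assistant.",
}
```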
+    def parse_generations(
+        self, outputs: Union[Dict, List[Dict]], **kwargs
+    ) -> List[str]:
         res = []
-        for request in tqdm(_requests):
-            try:
-                inp = request[0]
-                request_args = request[1]
-                # generation_kwargs
-                until = request_args.get("until")
-                max_tokens = request_args.get("max_gen_toks", self.max_length)
-                temperature = request_args.get("temperature", self.temperature)
-                response = anthropic_chat(
-                    client=self.client,
-                    model=self.model,
-                    prompt=inp,
-                    max_tokens=max_tokens,
-                    temperature=temperature,  # TODO: implement non-greedy sampling for Anthropic
-                    stop=until,  # type: ignore
-                    **self.kwargs,
-                )
-                res.append(response)
-                self.cache_hook.add_partial("generate_until", request, response)
-            except anthropic.APIConnectionError as e:  # type: ignore # noqa: F821
-                eval_logger.critical(f"Server unreachable: {e.__cause__}")
-                break
-            except anthropic.APIStatusError as e:  # type: ignore # noqa: F821
-                eval_logger.critical(f"API error {e.status_code}: {e.message}")
-                break
+        if not isinstance(outputs, list):
+            outputs = [outputs]
+        for out in outputs:
+            for choices in out["content"]:
+                res.append(choices["text"])
         return res
+    def tok_encode(
+        self,
+        string: str,
+        left_truncate_len=None,
+        add_special_tokens=None,
+        **kwargs,
+    ) -> List[str]:
+        return [string]
+
+    def _loglikelihood_tokens(self, requests, **kwargs):
+        raise NotImplementedError(
+            "Anthropic Chat Completions API does not support the return of loglikelihoods."
+        )
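With the refactor, this backend is reached through the model registry like any other; a hypothetical smoke test (assumes `ANTHROPIC_API_KEY` is set and that `model` is forwarded to the API superclass, as the other API models do):

```python
from lm_eval.api.registry import get_model

# Look up the class registered above and instantiate it by its registered name.
AnthropicChatCls = get_model("anthropic-chat")
lm = AnthropicChatCls(model="claude-3-sonnet-20240229")
# Generation works; loglikelihood requests raise NotImplementedError (see above).
```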
This diff is collapsed.
This diff is collapsed.
@@ -57,7 +57,7 @@ Homepage = "https://github.com/EleutherAI/lm-evaluation-harness"
 Repository = "https://github.com/EleutherAI/lm-evaluation-harness"

 [project.optional-dependencies]
 anthropic = ["anthropic"]
+api = ["requests", "aiohttp", "tenacity", "tqdm", "tiktoken"]
 dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"]
 deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
 gptq = ["auto-gptq[triton]>=0.6.0"]
@@ -67,7 +67,6 @@ neuronx = ["optimum[neuronx]"]
 mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
 math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
 multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
-openai = ["openai==1.3.9", "tiktoken"]
 optimum = ["optimum[openvino]"]
 promptsource = ["promptsource>=0.2.3"]
 sentencepiece = ["sentencepiece>=0.1.98"]
......
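Per the "add package names to exception" commit above, modules that need the new `api` extra can guard their imports and point users at it; a sketch of that pattern (not the harness's exact wording):

```python
# Guard optional imports and name the extra that provides them.
try:
    import aiohttp  # noqa: F401
    import requests  # noqa: F401
    import tenacity  # noqa: F401
except ModuleNotFoundError as e:
    raise ModuleNotFoundError(
        f"Package `{e.name}` is missing but required for API model support. "
        "Install it via `pip install 'lm-eval[api]'` or `pip install -e '.[api]'`."
    ) from e
```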
from unittest.mock import MagicMock, patch

import pytest

from lm_eval.models.openai_completions import LocalCompletionsAPI


@pytest.fixture
def api():
    return LocalCompletionsAPI(
        base_url="http://test-url.com", tokenizer_backend=None, model="gpt-3.5-turbo"
    )


@pytest.fixture
def api_tokenized():
    return LocalCompletionsAPI(
        base_url="http://test-url.com",
        model="EleutherAI/pythia-1b",
        tokenizer_backend="huggingface",
    )


def test_create_payload_generate(api):
    messages = ["Generate a story"]
    gen_kwargs = {
        "max_tokens": 100,
        "temperature": 0.7,
        "until": ["The End"],
        "do_sample": True,
    }
    payload = api._create_payload(messages, generate=True, gen_kwargs=gen_kwargs)

    assert payload == {
        "prompt": ["Generate a story"],
        "model": "gpt-3.5-turbo",
        "max_tokens": 100,
        "temperature": 0.7,
        "stop": ["The End"],
    }


def test_create_payload_loglikelihood(api):
    messages = ["The capital of France is"]
    payload = api._create_payload(messages, generate=False, gen_kwargs=None)

    assert payload == {
        "model": "gpt-3.5-turbo",
        "prompt": ["The capital of France is"],
        "max_tokens": 1,
        "logprobs": 1,
        "echo": True,
    }


@pytest.mark.parametrize(
    "input_messages, generate, gen_kwargs, expected_payload",
    [
        (
            ["Hello, how are"],
            True,
            {"max_gen_toks": 100, "temperature": 0.7},
            {
                "prompt": "Hello, how are",
                "model": "gpt-3.5-turbo",
                "max_tokens": 100,
                "temperature": 0.7,
                "stop": ["<|endoftext|>"],
            },
        ),
        (
            ["Hello, how are", "you"],
            True,
            {},
            {
                "prompt": "Hello, how are",
                "model": "gpt-3.5-turbo",
                "max_tokens": 256,
                "temperature": 0,
                "stop": ["<|endoftext|>"],
            },
        ),
    ],
)
def test_model_generate_call_usage(
    api, input_messages, generate, gen_kwargs, expected_payload
):
    with patch("requests.post") as mock_post:
        mock_response = MagicMock()
        mock_response.json.return_value = {"result": "success"}
        mock_post.return_value = mock_response

        # Act
        result = api.model_call(
            input_messages, generate=generate, gen_kwargs=gen_kwargs
        )

        # Assert
        mock_post.assert_called_once()
        _, kwargs = mock_post.call_args
        assert "json" in kwargs
        assert kwargs["json"] == expected_payload
        assert result == {"result": "success"}


@pytest.mark.parametrize(
    "input_messages, generate, gen_kwargs, expected_payload",
    [
        (
            [[1, 2, 3, 4, 5]],
            False,
            None,
            {
                "model": "EleutherAI/pythia-1b",
                "prompt": [[1, 2, 3, 4, 5]],
                "max_tokens": 1,
                "logprobs": 1,
                "echo": True,
            },
        ),
    ],
)
def test_model_tokenized_call_usage(
    api_tokenized, input_messages, generate, gen_kwargs, expected_payload
):
    with patch("requests.post") as mock_post:
        mock_response = MagicMock()
        mock_response.json.return_value = {"result": "success"}
        mock_post.return_value = mock_response

        # Act
        result = api_tokenized.model_call(
            input_messages, generate=generate, gen_kwargs=gen_kwargs
        )

        # Assert
        mock_post.assert_called_once()
        _, kwargs = mock_post.call_args
        assert "json" in kwargs
        assert kwargs["json"] == expected_payload
        assert result == {"result": "success"}
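The mocked tests above pin down `model_call`'s contract: build the payload, POST it as JSON, and return the parsed response. A minimal sketch consistent with that contract (the real method presumably also sends auth headers and retries; names follow the fixtures above):

```python
import requests


def model_call(self, messages, *, generate=True, gen_kwargs=None, **kwargs):
    # Build the request body exactly as the payload tests expect...
    payload = self._create_payload(
        messages, generate=generate, gen_kwargs=gen_kwargs, **kwargs
    )
    # ...POST it as JSON (the tests patch requests.post and check kwargs["json"])...
    response = requests.post(self.base_url, json=payload, headers=self.header)
    # ...and hand back the parsed JSON body.
    return response.json()
```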