Commit 4d3a2c28 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.5' into v0.6.5-dev

parents 92ec5d8e 2d1b9baa
......@@ -5,14 +5,18 @@ import openai
import pytest
import os
import pytest_asyncio
import requests
from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer, models_path_prefix
EMBEDDING_MODEL_NAME = os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")
MODEL_NAME = os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
@pytest.fixture(scope="module")
def embedding_server():
def server():
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
......@@ -20,31 +24,29 @@ def embedding_server():
"--enforce-eager",
"--max-model-len",
"8192",
"--chat-template",
DUMMY_CHAT_TEMPLATE,
]
with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server:
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def embedding_client(embedding_server):
async with embedding_server.get_async_client() as async_client:
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[EMBEDDING_MODEL_NAME],
)
async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
model_name: str):
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
input_texts = [
"The chef prepared a delicious meal.",
]
# test single embedding
embeddings = await embedding_client.embeddings.create(
embeddings = await client.embeddings.create(
model=model_name,
input=input_texts,
encoding_format="float",
......@@ -58,7 +60,7 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
# test using token IDs
input_tokens = [1, 1, 1, 1, 1]
embeddings = await embedding_client.embeddings.create(
embeddings = await client.embeddings.create(
model=model_name,
input=input_tokens,
encoding_format="float",
......@@ -72,18 +74,14 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[EMBEDDING_MODEL_NAME],
)
async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
model_name: str):
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
# test List[str]
input_texts = [
"The cat sat on the mat.", "A feline was resting on a rug.",
"Stars twinkle brightly in the night sky."
]
embeddings = await embedding_client.embeddings.create(
embeddings = await client.embeddings.create(
model=model_name,
input=input_texts,
encoding_format="float",
......@@ -91,11 +89,14 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
assert embeddings.id is not None
assert len(embeddings.data) == 3
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 32
assert embeddings.usage.total_tokens == 32
# test List[List[int]]
input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
[25, 32, 64, 77]]
embeddings = await embedding_client.embeddings.create(
embeddings = await client.embeddings.create(
model=model_name,
input=input_tokens,
encoding_format="float",
......@@ -109,22 +110,70 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[EMBEDDING_MODEL_NAME],
)
async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(server: RemoteOpenAIServer,
client: openai.AsyncOpenAI,
model_name: str):
messages = [{
"role": "user",
"content": "The cat sat on the mat.",
}, {
"role": "assistant",
"content": "A feline was resting on a rug.",
}, {
"role": "user",
"content": "Stars twinkle brightly in the night sky.",
}]
chat_response = requests.post(server.url_for("v1/embeddings"),
json={
"model": model_name,
"messages": messages,
"encoding_format": "float",
})
chat_response.raise_for_status()
chat_embeddings = chat_response.json()
tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
prompt = tokenizer.apply_chat_template(
messages,
chat_template=DUMMY_CHAT_TEMPLATE,
add_generation_prompt=True,
continue_final_message=False,
tokenize=False,
)
completion_response = await client.embeddings.create(
model=model_name,
input=prompt,
encoding_format="float",
# To be consistent with chat
extra_body={"add_special_tokens": False},
)
completion_embeddings = completion_response.model_dump(mode="json")
assert chat_embeddings.pop("id") is not None
assert completion_embeddings.pop("id") is not None
assert chat_embeddings.pop("created") <= completion_embeddings.pop(
"created")
assert chat_embeddings == completion_embeddings
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
model_name: str):
input_texts = [
"Hello my name is",
"The best thing about vLLM is that it supports many different models"
]
responses_float = await embedding_client.embeddings.create(
input=input_texts, model=model_name, encoding_format="float")
responses_float = await client.embeddings.create(input=input_texts,
model=model_name,
encoding_format="float")
responses_base64 = await embedding_client.embeddings.create(
input=input_texts, model=model_name, encoding_format="base64")
responses_base64 = await client.embeddings.create(input=input_texts,
model=model_name,
encoding_format="base64")
decoded_responses_base64_data = []
for data in responses_base64.data:
......@@ -138,10 +187,65 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
1]
# Default response is float32 decoded from base64 by OpenAI Client
responses_default = await embedding_client.embeddings.create(
input=input_texts, model=model_name)
responses_default = await client.embeddings.create(input=input_texts,
model=model_name)
assert responses_float.data[0].embedding == responses_default.data[
0].embedding
assert responses_float.data[1].embedding == responses_default.data[
1].embedding
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
model_name: str):
input_texts = [
"Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
]
# test single embedding
embeddings = await client.embeddings.create(
model=model_name,
input=input_texts,
extra_body={"truncate_prompt_tokens": 10})
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 10
assert embeddings.usage.total_tokens == 10
input_tokens = [
1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
]
embeddings = await client.embeddings.create(
model=model_name,
input=input_tokens,
extra_body={"truncate_prompt_tokens": 10})
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 10
assert embeddings.usage.total_tokens == 10
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
model_name: str):
input_texts = [
"Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
]
with pytest.raises(openai.BadRequestError):
embeddings = await client.embeddings.create(
model=model_name,
input=input_texts,
extra_body={"truncate_prompt_tokens": 8193})
assert "error" in embeddings.object
assert "truncate_prompt_tokens value is greater than max_model_len. "\
"Please, select a smaller truncation size." in embeddings.message
......@@ -71,19 +71,21 @@ EXPECTED_VALUES = {
[("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:request_params_n": [("_count", _NUM_REQUESTS)],
"vllm:request_params_best_of": [("_count", _NUM_REQUESTS)],
"vllm:request_params_max_tokens":
[("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:prompt_tokens": [("_total",
_NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
"vllm:generation_tokens":
[("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
"vllm:generation_tokens": [
("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
],
"vllm:request_success": [("_total", _NUM_REQUESTS)],
}
@pytest.mark.asyncio
async def test_metrics_counts(client: openai.AsyncOpenAI):
base_url = str(client.base_url)[:-3].strip("/")
async def test_metrics_counts(server: RemoteOpenAIServer,
client: openai.AsyncClient):
for _ in range(_NUM_REQUESTS):
# sending a request triggers the metrics to be logged.
await client.completions.create(
......@@ -91,7 +93,7 @@ async def test_metrics_counts(client: openai.AsyncOpenAI):
prompt=_TOKENIZED_PROMPT,
max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST)
response = requests.get(base_url + "/metrics")
response = requests.get(server.url_for("metrics"))
print(response.text)
assert response.status_code == HTTPStatus.OK
......@@ -152,9 +154,9 @@ EXPECTED_METRICS = [
"vllm:request_params_n_sum",
"vllm:request_params_n_bucket",
"vllm:request_params_n_count",
"vllm:request_params_best_of_sum",
"vllm:request_params_best_of_bucket",
"vllm:request_params_best_of_count",
"vllm:request_params_max_tokens_sum",
"vllm:request_params_max_tokens_bucket",
"vllm:request_params_max_tokens_count",
"vllm:num_preemptions_total",
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
......@@ -175,16 +177,15 @@ EXPECTED_METRICS = [
@pytest.mark.asyncio
async def test_metrics_exist(client: openai.AsyncOpenAI):
base_url = str(client.base_url)[:-3].strip("/")
async def test_metrics_exist(server: RemoteOpenAIServer,
client: openai.AsyncClient):
# sending a request triggers the metrics to be logged.
await client.completions.create(model=MODEL_NAME,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0)
response = requests.get(base_url + "/metrics")
response = requests.get(server.url_for("metrics"))
assert response.status_code == HTTPStatus.OK
for metric in EXPECTED_METRICS:
......
......@@ -21,3 +21,38 @@ async def test_empty_prompt():
prompt="",
max_tokens=5,
temperature=0.0)
@pytest.mark.asyncio
async def test_out_of_vocab_token_ids():
model_name = "gpt2"
server_args = ["--enforce-eager"]
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
with pytest.raises(openai.BadRequestError,
match=re.compile('.*out of vocabulary.*')):
await client.completions.create(model=model_name,
prompt=[999999],
max_tokens=5,
temperature=0.0)
@pytest.mark.asyncio
async def test_reject_multistep_with_guided_decoding():
model_name = "gpt2"
server_args = ["--enforce-eager", "--num-scheduler-steps", "8"]
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
with pytest.raises(openai.BadRequestError,
match=re.compile(
'.*Guided decoding .* multi-step decoding.*')):
await client.completions.create(
model=model_name,
prompt="Hello",
max_tokens=5,
temperature=0.0,
extra_body={"response_format": {
"type": "json_object"
}})
import contextlib
import os
from typing import Any, List, NamedTuple
import openai # use the official client for correctness check
import pytest
from ...utils import RemoteOpenAIServer, models_path_prefix
# # any model with a chat template should work here
MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
API_KEY = "abc-123"
ERROR_API_KEY = "abc"
ROOT_PATH = "llm"
@pytest.fixture(scope="module")
def server():
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"float16",
"--enforce-eager",
"--max-model-len",
"4080",
"--root-path", # use --root-path=/llm for testing
"/" + ROOT_PATH,
"--chat-template",
DUMMY_CHAT_TEMPLATE,
]
envs = os.environ.copy()
envs["VLLM_API_KEY"] = API_KEY
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server:
yield remote_server
class TestCase(NamedTuple):
model_name: str
base_url: List[str]
api_key: str
expected_error: Any
@pytest.mark.asyncio
@pytest.mark.parametrize(
"test_case",
[
TestCase(
model_name=MODEL_NAME,
base_url=["v1"], # http://localhost:8000/v1
api_key=ERROR_API_KEY,
expected_error=openai.AuthenticationError),
TestCase(
model_name=MODEL_NAME,
base_url=[ROOT_PATH, "v1"], # http://localhost:8000/llm/v1
api_key=ERROR_API_KEY,
expected_error=openai.AuthenticationError),
TestCase(
model_name=MODEL_NAME,
base_url=["v1"], # http://localhost:8000/v1
api_key=API_KEY,
expected_error=None),
TestCase(
model_name=MODEL_NAME,
base_url=[ROOT_PATH, "v1"], # http://localhost:8000/llm/v1
api_key=API_KEY,
expected_error=None),
],
)
async def test_chat_session_root_path_with_api_key(server: RemoteOpenAIServer,
test_case: TestCase):
saying: str = "Here is a common saying about apple. An apple a day, keeps"
ctx = contextlib.nullcontext()
if test_case.expected_error is not None:
ctx = pytest.raises(test_case.expected_error)
with ctx:
client = openai.AsyncOpenAI(
api_key=test_case.api_key,
base_url=server.url_for(*test_case.base_url),
max_retries=0)
chat_completion = await client.chat.completions.create(
model=test_case.model_name,
messages=[{
"role": "user",
"content": "tell me a common saying"
}, {
"role": "assistant",
"content": saying
}],
extra_body={
"continue_final_message": True,
"add_generation_prompt": False
})
assert chat_completion.id is not None
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.finish_reason == "stop"
message = choice.message
assert len(message.content) > 0
assert message.role == "assistant"
import os
import pytest
import requests
from vllm.entrypoints.openai.protocol import ScoreResponse
from ...utils import RemoteOpenAIServer, models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3")
@pytest.fixture(scope="module")
def server():
args = [
"--enforce-eager",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_text_1_str_text_2_list(server: RemoteOpenAIServer,
model_name: str):
text_1 = "What is the capital of France?"
text_2 = [
"The capital of Brazil is Brasilia.", "The capital of France is Paris."
]
score_response = requests.post(server.url_for("score"),
json={
"model": model_name,
"text_1": text_1,
"text_2": text_2,
})
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
assert score.data[0].score <= 0.01
assert score.data[1].score >= 0.9
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_text_1_list_text_2_list(server: RemoteOpenAIServer,
model_name: str):
text_1 = [
"What is the capital of the United States?",
"What is the capital of France?"
]
text_2 = [
"The capital of Brazil is Brasilia.", "The capital of France is Paris."
]
score_response = requests.post(server.url_for("score"),
json={
"model": model_name,
"text_1": text_1,
"text_2": text_2,
})
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
assert score.data[0].score <= 0.01
assert score.data[1].score >= 0.9
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_text_1_str_text_2_str(server: RemoteOpenAIServer,
model_name: str):
text_1 = "What is the capital of France?"
text_2 = "The capital of France is Paris."
score_response = requests.post(server.url_for("score"),
json={
"model": model_name,
"text_1": text_1,
"text_2": text_2,
})
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
assert score.data[0].score >= 0.9
......@@ -16,15 +16,22 @@ CHAT_TEMPLATE = "Dummy chat template for testing {}"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
@dataclass
class MockHFConfig:
model_type: str = "any"
@dataclass
class MockModelConfig:
task = "generate"
tokenizer = MODEL_NAME
trust_remote_code = False
tokenizer_mode = "auto"
max_model_len = 100
tokenizer_revision = None
embedding_mode = False
multimodal_config = MultiModalConfig()
hf_config = MockHFConfig()
logits_processor_pattern = None
@dataclass
......@@ -43,6 +50,7 @@ async def _async_serving_chat_init():
BASE_MODEL_PATHS,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
lora_modules=None,
prompt_adapters=None,
request_logger=None)
......@@ -64,6 +72,7 @@ def test_serving_chat_should_set_correct_max_tokens():
BASE_MODEL_PATHS,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
lora_modules=None,
prompt_adapters=None,
request_logger=None)
......
......@@ -10,6 +10,8 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse,
LoadLoraAdapterRequest,
UnloadLoraAdapterRequest)
from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
from vllm.lora.request import LoRARequest
from ...utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b")
......@@ -35,6 +37,16 @@ async def _async_serving_engine_init():
return serving_engine
@pytest.mark.asyncio
async def test_serving_model_name():
serving_engine = await _async_serving_engine_init()
assert serving_engine._get_model_name(None) == MODEL_NAME
request = LoRARequest(lora_name="adapter",
lora_path="/path/to/adapter2",
lora_int_id=1)
assert serving_engine._get_model_name(request) == request.lora_name
@pytest.mark.asyncio
async def test_load_lora_adapter_success():
serving_engine = await _async_serving_engine_init()
......
......@@ -6,7 +6,7 @@ import pytest
from ...utils import RemoteOpenAIServer, models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
@pytest.mark.asyncio
......
import openai # use the official client for correctness check
import pytest
import os
import pytest_asyncio
......@@ -57,9 +56,11 @@ async def client(server):
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
indirect=["tokenizer_name"],
)
async def test_tokenize_completions(client: openai.AsyncOpenAI,
model_name: str, tokenizer_name: str):
base_url = str(client.base_url)[:-3].strip("/")
async def test_tokenize_completions(
server: RemoteOpenAIServer,
model_name: str,
tokenizer_name: str,
):
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
tokenizer_mode="fast")
......@@ -67,7 +68,7 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI,
prompt = "vllm1 This is a test prompt."
tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
response = requests.post(base_url + "/tokenize",
response = requests.post(server.url_for("tokenize"),
json={
"add_special_tokens": add_special,
"model": model_name,
......@@ -88,9 +89,11 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI,
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
indirect=["tokenizer_name"],
)
async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
tokenizer_name: str):
base_url = str(client.base_url)[:-3].strip("/")
async def test_tokenize_chat(
server: RemoteOpenAIServer,
model_name: str,
tokenizer_name: str,
):
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
tokenizer_mode="fast")
......@@ -106,17 +109,29 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
"role": "user",
"content": "Can I ask a question? vllm1"
}]
for continue_final in [False, True]:
if add_generation and continue_final:
continue
if continue_final:
conversation.append({
"role": "assistant",
"content": "Sure,"
})
prompt = tokenizer.apply_chat_template(
add_generation_prompt=add_generation,
continue_final_message=continue_final,
conversation=conversation,
tokenize=False)
tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
tokens = tokenizer.encode(prompt,
add_special_tokens=add_special)
response = requests.post(base_url + "/tokenize",
response = requests.post(server.url_for("tokenize"),
json={
"add_generation_prompt":
add_generation,
"continue_final_message":
continue_final,
"add_special_tokens": add_special,
"messages": conversation,
"model": model_name
......@@ -136,17 +151,18 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
indirect=["tokenizer_name"],
)
async def test_detokenize(client: openai.AsyncOpenAI, model_name: str,
tokenizer_name: str):
base_url = str(client.base_url)[:-3].strip("/")
async def test_detokenize(
server: RemoteOpenAIServer,
model_name: str,
tokenizer_name: str,
):
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
tokenizer_mode="fast")
prompt = "This is a test prompt. vllm1"
tokens = tokenizer.encode(prompt, add_special_tokens=False)
print(f"CALLING {base_url} FOR {model_name}")
response = requests.post(base_url + "/detokenize",
response = requests.post(server.url_for("detokenize"),
json={
"model": model_name,
"tokens": tokens
......
from typing import Dict, List
import os
import openai
import pytest
import pytest_asyncio
from vllm.multimodal.utils import encode_video_base64, fetch_video
from ...utils import RemoteOpenAIServer, models_path_prefix, urls_port
MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
MAXIMUM_VIDEOS = 4
# TEST_VIDEO_URLS = [
# "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4",
# "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ElephantsDream.mp4",
# "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerBlazes.mp4",
# "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4",
# ]
TEST_VIDEO_URLS = [
f"http://localhost:{urls_port}/BigBuckBunny.mp4",
f"http://localhost:{urls_port}/ElephantsDream.mp4",
f"http://localhost:{urls_port}/ForBiggerBlazes.mp4",
f"http://localhost:{urls_port}/ForBiggerFun.mp4",
]
@pytest.fixture(scope="module")
def server():
args = [
"--task",
"generate",
"--dtype",
"bfloat16",
"--max-model-len",
"32768",
"--max-num-seqs",
"2",
"--enforce-eager",
"--trust-remote-code",
"--limit-mm-per-prompt",
f"video={MAXIMUM_VIDEOS}",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.fixture(scope="session")
def base64_encoded_video() -> Dict[str, str]:
return {
video_url: encode_video_base64(fetch_video(video_url))
for video_url in TEST_VIDEO_URLS
}
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video(client: openai.AsyncOpenAI,
model_name: str, video_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "video_url",
"video_url": {
"url": video_url
}
},
{
"type": "text",
"text": "What's in this video?"
},
],
}]
# test single completion
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
logprobs=True,
top_logprobs=5)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=6299, total_tokens=6309)
message = choice.message
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})
# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_beamsearch(client: openai.AsyncOpenAI,
model_name: str,
video_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "video_url",
"video_url": {
"url": video_url
}
},
{
"type": "text",
"text": "What's in this video?"
},
],
}]
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
n=2,
max_completion_tokens=10,
logprobs=True,
top_logprobs=5,
extra_body=dict(use_beam_search=True))
assert len(chat_completion.choices) == 2
assert chat_completion.choices[
0].message.content != chat_completion.choices[1].message.content
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded(
client: openai.AsyncOpenAI, model_name: str, video_url: str,
base64_encoded_video: Dict[str, str]):
messages = [{
"role":
"user",
"content": [
{
"type": "video_url",
"video_url": {
"url":
f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
}
},
{
"type": "text",
"text": "What's in this video?"
},
],
}]
# test single completion
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
logprobs=True,
top_logprobs=5)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=6299, total_tokens=6309)
message = choice.message
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 10
assert message.role == "assistant"
messages.append({"role": "assistant", "content": message.content})
# test multi-turn dialogue
messages.append({"role": "user", "content": "express your result in json"})
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded_beamsearch(
client: openai.AsyncOpenAI, model_name: str, video_url: str,
base64_encoded_video: Dict[str, str]):
messages = [{
"role":
"user",
"content": [
{
"type": "video_url",
"video_url": {
"url":
f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
}
},
{
"type": "text",
"text": "What's in this video?"
},
],
}]
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
n=2,
max_completion_tokens=10,
extra_body=dict(use_beam_search=True))
assert len(chat_completion.choices) == 2
assert chat_completion.choices[
0].message.content != chat_completion.choices[1].message.content
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_chat_streaming_video(client: openai.AsyncOpenAI,
model_name: str, video_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "video_url",
"video_url": {
"url": video_url
}
},
{
"type": "text",
"text": "What's in this video?"
},
],
}]
# test single completion
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0,
)
output = chat_completion.choices[0].message.content
stop_reason = chat_completion.choices[0].finish_reason
# test streaming
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0,
stream=True,
)
chunks: List[str] = []
finish_reason_count = 0
async for chunk in stream:
delta = chunk.choices[0].delta
if delta.role:
assert delta.role == "assistant"
if delta.content:
chunks.append(delta.content)
if chunk.choices[0].finish_reason is not None:
finish_reason_count += 1
# finish reason should only return in last block
assert finish_reason_count == 1
assert chunk.choices[0].finish_reason == stop_reason
assert delta.content
assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
"video_urls",
[TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))])
async def test_multi_video_input(client: openai.AsyncOpenAI, model_name: str,
video_urls: List[str]):
messages = [{
"role":
"user",
"content": [
*({
"type": "video_url",
"video_url": {
"url": video_url
}
} for video_url in video_urls),
{
"type": "text",
"text": "What's in this video?"
},
],
}]
if len(video_urls) > MAXIMUM_VIDEOS:
with pytest.raises(openai.BadRequestError): # test multi-video input
await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0,
)
# the server should still work afterwards
completion = await client.completions.create(
model=model_name,
prompt=[0, 0, 0, 0, 0],
max_tokens=5,
temperature=0.0,
)
completion = completion.choices[0].text
assert completion is not None and len(completion) >= 0
else:
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_completion_tokens=10,
temperature=0.0,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
......@@ -32,9 +32,18 @@ TEST_IMAGE_URLS = [
@pytest.fixture(scope="module")
def server():
args = [
"--dtype", "bfloat16", "--max-model-len", "4096", "--max-num-seqs",
"5", "--enforce-eager", "--trust-remote-code", "--limit-mm-per-prompt",
f"image={MAXIMUM_IMAGES}"
"--task",
"generate",
"--dtype",
"bfloat16",
"--max-model-len",
"2048",
"--max-num-seqs",
"5",
"--enforce-eager",
"--trust-remote-code",
"--limit-mm-per-prompt",
f"image={MAXIMUM_IMAGES}",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
......@@ -78,9 +87,10 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
}]
# test single completion
chat_completion = await client.chat.completions.create(model=model_name,
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
max_completion_tokens=10,
logprobs=True,
top_logprobs=5)
assert len(chat_completion.choices) == 1
......@@ -88,7 +98,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=772, total_tokens=782)
completion_tokens=10, prompt_tokens=775, total_tokens=785)
message = choice.message
message = chat_completion.choices[0].message
......@@ -101,12 +111,48 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
max_completion_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI,
model_name: str,
image_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
n=2,
max_completion_tokens=10,
logprobs=True,
top_logprobs=5,
extra_body=dict(use_beam_search=True))
assert len(chat_completion.choices) == 2
assert chat_completion.choices[
0].message.content != chat_completion.choices[1].message.content
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
......@@ -133,9 +179,10 @@ async def test_single_chat_session_image_base64encoded(
}]
# test single completion
chat_completion = await client.chat.completions.create(model=model_name,
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
max_completion_tokens=10,
logprobs=True,
top_logprobs=5)
assert len(chat_completion.choices) == 1
......@@ -143,7 +190,7 @@ async def test_single_chat_session_image_base64encoded(
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=772, total_tokens=782)
completion_tokens=10, prompt_tokens=775, total_tokens=785)
message = choice.message
message = chat_completion.choices[0].message
......@@ -156,12 +203,47 @@ async def test_single_chat_session_image_base64encoded(
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
max_completion_tokens=10,
)
message = chat_completion.choices[0].message
assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded_beamsearch(
client: openai.AsyncOpenAI, model_name: str, image_url: str,
base64_encoded_image: Dict[str, str]):
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url":
f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
}
},
{
"type": "text",
"text": "What's in this image?"
},
],
}]
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
n=2,
max_completion_tokens=10,
extra_body=dict(use_beam_search=True))
assert len(chat_completion.choices) == 2
assert chat_completion.choices[
0].message.content != chat_completion.choices[1].message.content
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
......@@ -188,7 +270,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
max_completion_tokens=10,
temperature=0.0,
)
output = chat_completion.choices[0].message.content
......@@ -198,7 +280,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
stream = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
max_completion_tokens=10,
temperature=0.0,
stream=True,
)
......@@ -249,7 +331,7 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
max_completion_tokens=10,
temperature=0.0,
)
......@@ -266,7 +348,7 @@ async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
chat_completion = await client.chat.completions.create(
model=model_name,
messages=messages,
max_tokens=10,
max_completion_tokens=10,
temperature=0.0,
)
message = chat_completion.choices[0].message
......
from typing import Dict
import os
import pytest
import pytest_asyncio
import requests
from vllm.multimodal.utils import encode_image_base64, fetch_image
from ...utils import VLLM_PATH, RemoteOpenAIServer, models_path_prefix, urls_port
MODEL_NAME = os.path.join(models_path_prefix, "TIGER-Lab/VLM2Vec-Full")
MAXIMUM_IMAGES = 2
vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja"
assert vlm2vec_jinja_path.exists()
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]
TEST_IMAGE_URLS = [
f"http://localhost:{urls_port}/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
f"http://localhost:{urls_port}/Grayscale_8bits_palette_sample_image.png",
f"http://localhost:{urls_port}/1280px-Venn_diagram_rgb.svg.png",
f"http://localhost:{urls_port}/RGBA_comp.png",
]
@pytest.fixture(scope="module")
def server():
args = [
"--task",
"embed",
"--dtype",
"bfloat16",
"--max-model-len",
"2048",
"--max-num-seqs",
"5",
"--enforce-eager",
"--trust-remote-code",
"--limit-mm-per-prompt",
f"image={MAXIMUM_IMAGES}",
"--chat-template",
str(vlm2vec_jinja_path),
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.fixture(scope="session")
def base64_encoded_image() -> Dict[str, str]:
return {
image_url: encode_image_base64(fetch_image(image_url))
for image_url in TEST_IMAGE_URLS
}
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
image_url: str):
messages = [{
"role":
"user",
"content": [
{
"type": "image_url",
"image_url": {
"url": image_url
}
},
{
"type": "text",
"text": "Represent the given image."
},
],
}]
response = requests.post(server.url_for("v1/embeddings"),
json={
"model": model_name,
"messages": messages,
"encoding_format": "float"
})
response.raise_for_status()
embeddings = response.json()
assert embeddings["id"] is not None
assert len(embeddings["data"]) == 1
assert len(embeddings["data"][0]["embedding"]) == 3072
assert embeddings["usage"]["completion_tokens"] == 0
assert embeddings["usage"]["prompt_tokens"] == 765
assert embeddings["usage"]["total_tokens"] == 765
from typing import List
from unittest.mock import MagicMock
import pytest
from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction, run_tool_extraction_streaming)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
# https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1
SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')"
SIMPLE_FUNCTION_CALL = FunctionCall(
name="get_weather",
arguments='{"city": "San Francisco", "metric": "celsius"}',
)
MORE_TYPES_FUNCTION_OUTPUT = (
"register_user(name='John Doe', "
"age=37, "
"address={'city': 'San Francisco', 'state': 'CA'}, "
"role=None, "
"passed_test=True, "
"aliases=['John', 'Johnny'])")
MORE_TYPES_FUNCTION_CALL = FunctionCall(
name="register_user",
arguments='{"name": "John Doe", '
'"age": 37, '
'"address": {"city": "San Francisco", "state": "CA"}, '
'"role": null, '
'"passed_test": true, '
'"aliases": ["John", "Johnny"]}',
)
PARAMETERLESS_FUNCTION_OUTPUT = "get_weather()"
PARAMETERLESS_FUNCTION_CALL = FunctionCall(
name="get_weather",
arguments='{}',
)
EMPTY_DICT_FUNCTION_OUTPUT = "do_something_cool(additional_data={})"
EMPTY_DICT_FUNCTION_CALL = FunctionCall(
name="do_something_cool",
arguments='{"additional_data": {}}',
)
EMPTY_LIST_FUNCTION_OUTPUT = "do_something_cool(steps=[])"
EMPTY_LIST_FUNCTION_CALL = FunctionCall(
name="do_something_cool",
arguments='{"steps": []}',
)
ESCAPED_STRING_FUNCTION_OUTPUT = (
r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')")
ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
name="get_weather",
arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
)
@pytest.mark.parametrize("streaming", [True, False])
def test_no_tool_call(streaming: bool):
mock_tokenizer = MagicMock()
tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
mock_tokenizer)
model_output = "How can I help you today?"
content, tool_calls = run_tool_extraction(tool_parser,
model_output,
streaming=streaming)
assert content == model_output
assert len(tool_calls) == 0
TEST_CASES = [
pytest.param(True,
f"[{SIMPLE_FUNCTION_OUTPUT}]", [SIMPLE_FUNCTION_CALL],
id="simple_streaming"),
pytest.param(False,
f"[{SIMPLE_FUNCTION_OUTPUT}]", [SIMPLE_FUNCTION_CALL],
id="simple_nonstreaming"),
pytest.param(True,
f"[{MORE_TYPES_FUNCTION_OUTPUT}]", [MORE_TYPES_FUNCTION_CALL],
id="more_types_streaming"),
pytest.param(False,
f"[{MORE_TYPES_FUNCTION_OUTPUT}]", [MORE_TYPES_FUNCTION_CALL],
id="more_types_nonstreaming"),
pytest.param(True,
f"[{PARAMETERLESS_FUNCTION_OUTPUT}]",
[PARAMETERLESS_FUNCTION_CALL],
id="parameterless_streaming"),
pytest.param(False,
f"[{PARAMETERLESS_FUNCTION_OUTPUT}]",
[PARAMETERLESS_FUNCTION_CALL],
id="parameterless_nonstreaming"),
pytest.param(True,
f"[{EMPTY_DICT_FUNCTION_OUTPUT}]", [EMPTY_DICT_FUNCTION_CALL],
id="empty_dict_streaming"),
pytest.param(False,
f"[{EMPTY_DICT_FUNCTION_OUTPUT}]", [EMPTY_DICT_FUNCTION_CALL],
id="empty_dict_nonstreaming"),
pytest.param(True,
f"[{EMPTY_LIST_FUNCTION_OUTPUT}]", [EMPTY_LIST_FUNCTION_CALL],
id="empty_list_streaming"),
pytest.param(False,
f"[{EMPTY_LIST_FUNCTION_OUTPUT}]", [EMPTY_LIST_FUNCTION_CALL],
id="empty_list_nonstreaming"),
pytest.param(True,
f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]",
[ESCAPED_STRING_FUNCTION_CALL],
id="escaped_string_streaming"),
pytest.param(False,
f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]",
[ESCAPED_STRING_FUNCTION_CALL],
id="escaped_string_nonstreaming"),
pytest.param(True,
f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]",
[SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
id="parallel_calls_streaming"),
pytest.param(False,
f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]",
[SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL],
id="parallel_calls_nonstreaming"),
]
@pytest.mark.parametrize("streaming, model_output, expected_tool_calls",
TEST_CASES)
def test_tool_call(streaming: bool, model_output: str,
expected_tool_calls: List[FunctionCall]):
mock_tokenizer = MagicMock()
tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
mock_tokenizer)
content, tool_calls = run_tool_extraction(tool_parser,
model_output,
streaming=streaming)
assert content is None
assert len(tool_calls) == len(expected_tool_calls)
for actual, expected in zip(tool_calls, expected_tool_calls):
assert actual.type == "function"
assert actual.function == expected
def test_streaming_tool_call_with_large_steps():
mock_tokenizer = MagicMock()
tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
mock_tokenizer)
model_output_deltas = [
"[get_weather(city='San",
" Francisco', metric='celsius'), "
f"{PARAMETERLESS_FUNCTION_OUTPUT}, "
f"{EMPTY_LIST_FUNCTION_OUTPUT}]",
]
reconstructor = run_tool_extraction_streaming(
tool_parser, model_output_deltas, assert_one_tool_per_delta=False)
assert reconstructor.other_content == ""
assert len(reconstructor.tool_calls) == 3
assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL
assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL
assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL
from typing import Iterable, List, Tuple, Union
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaMessage,
ExtractedToolCallInformation,
FunctionCall, ToolCall)
from vllm.entrypoints.openai.tool_parsers import ToolParser
class StreamingToolReconstructor:
def __init__(self, assert_one_tool_per_delta: bool = True):
self.tool_calls: List[ToolCall] = []
self.other_content: str = ""
self._assert_one_tool_per_delta = assert_one_tool_per_delta
def append_delta(self, delta: DeltaMessage):
if delta.content is not None:
self.other_content += delta.content
else:
assert delta.tool_calls, (
"Streaming results should have either content or tool calls "
"(or both)")
if self._assert_one_tool_per_delta:
# Note: This isn't strictly required by the API and may not be
# possible to adhere to depending on the token space and number of
# tokens per streamed response from the model, but it is required
# by tool_use tests, so we enforce it here by default also.
assert len(delta.tool_calls) < 2, (
"Streaming should include only one tool call per update.")
for call_delta in delta.tool_calls:
assert call_delta.type == "function", (
"Streaming tool calls should only emit function calls. Got "
f"{call_delta.type}")
current_tool_call = self.tool_calls[
call_delta.index] if call_delta.index < len(
self.tool_calls) else None
if current_tool_call:
assert (not call_delta.function.name), (
"Streaming tool calls should emit the full function name "
f"exactly once. Got {call_delta.function.name}")
assert (not call_delta.id), (
"Streaming tool calls must emit function id only once. Got "
f"{call_delta.id}")
assert (call_delta.index == len(self.tool_calls) - 1), (
f"Incorrect index for tool delta. Got {call_delta.index}, "
f"expected {len(self.tool_calls) - 1}")
current_tool_call.function.arguments += (
call_delta.function.arguments)
else:
assert call_delta.id is not None, (
"Streaming tool calls must have an id on first appearance")
assert call_delta.function.name is not None, (
"Streaming tool calls must have a function name on first "
"appearance")
assert call_delta.index == len(self.tool_calls), (
f"Incorrect index for tool delta. Got {call_delta.index}, "
f"expected {len(self.tool_calls)}")
self.tool_calls.append(
ToolCall(id=call_delta.id,
function=FunctionCall(
name=call_delta.function.name,
arguments=call_delta.function.arguments
or "")))
def run_tool_extraction(
tool_parser: ToolParser,
model_output: str,
request: Union[ChatCompletionRequest, None] = None,
streaming: bool = False,
assert_one_tool_per_delta: bool = True,
) -> Tuple[Union[str, None], List[ToolCall]]:
if streaming:
reconstructor = run_tool_extraction_streaming(
tool_parser,
model_output,
request,
assert_one_tool_per_delta=assert_one_tool_per_delta)
return reconstructor.other_content or None, reconstructor.tool_calls
else:
extracted = run_tool_extraction_nonstreaming(tool_parser, model_output,
request)
assert extracted.tools_called == bool(extracted.tool_calls)
return extracted.content, extracted.tool_calls
def run_tool_extraction_nonstreaming(
tool_parser: ToolParser,
model_output: str,
request: Union[ChatCompletionRequest, None] = None
) -> ExtractedToolCallInformation:
request = request or ChatCompletionRequest(messages=[], model="test-model")
return tool_parser.extract_tool_calls(model_output, request)
def run_tool_extraction_streaming(
tool_parser: ToolParser,
model_deltas: Iterable[str],
request: Union[ChatCompletionRequest, None] = None,
assert_one_tool_per_delta: bool = True,
) -> StreamingToolReconstructor:
request = request or ChatCompletionRequest(messages=[], model="test-model")
reconstructor = StreamingToolReconstructor(
assert_one_tool_per_delta=assert_one_tool_per_delta)
previous_text = ""
previous_tokens: List[int] = []
for delta in model_deltas:
token_delta = [
tool_parser.vocab.get(token)
for token in tool_parser.model_tokenizer.tokenize(delta)
if token in tool_parser.vocab
]
current_text = previous_text + delta
current_tokens = previous_tokens + token_delta
delta_message = tool_parser.extract_tool_calls_streaming(
previous_text, current_text, delta, previous_tokens,
current_tokens, token_delta, request)
if delta_message is not None:
reconstructor.append_delta(delta_message)
previous_text = current_text
previous_tokens = current_tokens
return reconstructor
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment