Commit 469e903b authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.2' into v0.8.2-dev

parents 389ebcf7 25f560a6
......@@ -2,7 +2,7 @@
import contextlib
import os
from typing import Any, List, NamedTuple
from typing import Any, NamedTuple
import openai # use the official client for correctness check
import pytest
......@@ -11,7 +11,6 @@ from ...utils import RemoteOpenAIServer, models_path_prefix
# # any model with a chat template should work here
MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
API_KEY = "abc-123"
ERROR_API_KEY = "abc"
ROOT_PATH = "llm"
......@@ -28,8 +27,6 @@ def server():
"4080",
"--root-path", # use --root-path=/llm for testing
"/" + ROOT_PATH,
"--chat-template",
DUMMY_CHAT_TEMPLATE,
]
envs = os.environ.copy()
......@@ -40,7 +37,7 @@ def server():
class TestCase(NamedTuple):
model_name: str
base_url: List[str]
base_url: list[str]
api_key: str
expected_error: Any
......
......@@ -36,10 +36,10 @@ INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/c
INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
INPUT_EMBEDDING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are an unhelpful assistant."}}
INPUT_EMBEDDING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are an unhelpful assistant."}}
{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "Hello world!"}}
{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "Hello world!"}}
{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}"""
INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
......@@ -55,7 +55,7 @@ def test_empty_file():
proc = subprocess.Popen([
sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
input_file.name, "-o", output_file.name, "--model",
os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")
os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
], )
proc.communicate()
proc.wait()
......@@ -115,7 +115,7 @@ def test_embeddings():
proc = subprocess.Popen([
sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
input_file.name, "-o", output_file.name, "--model",
os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")
os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
], )
proc.communicate()
proc.wait()
......
# SPDX-License-Identifier: Apache-2.0
import os
import math
from typing import Any
import pytest
import requests
import torch.nn.functional as F
from torch import tensor
from vllm.entrypoints.openai.protocol import ScoreResponse
from ...utils import RemoteOpenAIServer, models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3")
@pytest.fixture(scope="module")
def server():
args = ["--enforce-eager", "--max-model-len", "100"]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
MODELS = [
{
"name": os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3"),
"is_cross_encoder": True
},
{
"name": "BAAI/bge-base-en-v1.5",
"is_cross_encoder": False
},
]
DTYPE = "half"
def run_transformers(hf_model, model, text_pairs):
if model["is_cross_encoder"]:
return hf_model.predict(text_pairs).tolist()
else:
hf_embeddings = [
hf_model.encode(text_pair) for text_pair in text_pairs
]
return [
F.cosine_similarity(tensor(pair[0]), tensor(pair[1]), dim=0)
for pair in hf_embeddings
]
@pytest.fixture(scope="class", params=MODELS)
def model(request):
yield request.param
@pytest.fixture(scope="class")
def server(model: dict[str, Any]):
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
with RemoteOpenAIServer(model["name"], args) as remote_server:
yield remote_server
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_text_1_str_text_2_list(server: RemoteOpenAIServer, model_name: str):
text_1 = "What is the capital of France?"
text_2 = [
"The capital of Brazil is Brasilia.", "The capital of France is Paris."
]
score_response = requests.post(server.url_for("score"),
json={
"model": model_name,
"text_1": text_1,
"text_2": text_2,
})
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
assert score.data[0].score <= 0.01
assert score.data[1].score >= 0.9
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_text_1_list_text_2_list(server: RemoteOpenAIServer, model_name: str):
text_1 = [
"What is the capital of the United States?",
"What is the capital of France?"
]
text_2 = [
"The capital of Brazil is Brasilia.", "The capital of France is Paris."
]
score_response = requests.post(server.url_for("score"),
json={
"model": model_name,
"text_1": text_1,
"text_2": text_2,
})
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
assert score.data[0].score <= 0.01
assert score.data[1].score >= 0.9
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_text_1_str_text_2_str(server: RemoteOpenAIServer, model_name: str):
text_1 = "What is the capital of France?"
text_2 = "The capital of France is Paris."
score_response = requests.post(server.url_for("score"),
json={
"model": model_name,
"text_1": text_1,
"text_2": text_2,
})
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
assert score.data[0].score >= 0.9
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_score_max_model_len(server: RemoteOpenAIServer, model_name: str):
text_1 = "What is the capital of France?" * 20
text_2 = [
"The capital of Brazil is Brasilia.", "The capital of France is Paris."
]
score_response = requests.post(server.url_for("score"),
json={
"model": model_name,
"text_1": text_1,
"text_2": text_2,
})
assert score_response.status_code == 400
# Assert just a small fragments of the response
assert "Please reduce the length of the input." in \
score_response.text
# Test truncation
score_response = requests.post(server.url_for("score"),
json={
"model": model_name,
"text_1": text_1,
"text_2": text_2,
"truncate_prompt_tokens": 101
})
assert score_response.status_code == 400
assert "Please, select a smaller truncation size." in \
score_response.text
@pytest.fixture(scope="class")
def runner(model: dict[str, Any], hf_runner):
kwargs = {
"dtype": DTYPE,
"is_cross_encoder" if model["is_cross_encoder"]\
else "is_sentence_transformer": True
}
with hf_runner(model["name"], **kwargs) as hf_model:
yield hf_model
class TestModel:
def test_text_1_str_text_2_list(self, server: RemoteOpenAIServer,
model: dict[str, Any], runner):
text_1 = "What is the capital of France?"
text_2 = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris."
]
score_response = requests.post(server.url_for("score"),
json={
"model": model["name"],
"text_1": text_1,
"text_2": text_2,
})
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
text_pairs = [[text_1, text_2[0]], [text_1, text_2[1]]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
def test_text_1_list_text_2_list(self, server: RemoteOpenAIServer,
model: dict[str, Any], runner):
text_1 = [
"What is the capital of the United States?",
"What is the capital of France?"
]
text_2 = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris."
]
score_response = requests.post(server.url_for("score"),
json={
"model": model["name"],
"text_1": text_1,
"text_2": text_2,
})
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 2
vllm_outputs = [d.score for d in score.data]
text_pairs = [[text_1[0], text_2[0]], [text_1[1], text_2[1]]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
def test_text_1_str_text_2_str(self, server: RemoteOpenAIServer,
model: dict[str, Any], runner):
text_1 = "What is the capital of France?"
text_2 = "The capital of France is Paris."
score_response = requests.post(server.url_for("score"),
json={
"model": model["name"],
"text_1": text_1,
"text_2": text_2,
})
score_response.raise_for_status()
score = ScoreResponse.model_validate(score_response.json())
assert score.id is not None
assert score.data is not None
assert len(score.data) == 1
vllm_outputs = [d.score for d in score.data]
text_pairs = [[text_1, text_2]]
hf_outputs = run_transformers(runner, model, text_pairs)
for i in range(len(vllm_outputs)):
assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
def test_score_max_model_len(self, server: RemoteOpenAIServer,
model: dict[str, Any]):
text_1 = "What is the capital of France?" * 20
text_2 = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris."
]
score_response = requests.post(server.url_for("score"),
json={
"model": model["name"],
"text_1": text_1,
"text_2": text_2,
})
assert score_response.status_code == 400
# Assert just a small fragments of the response
assert "Please reduce the length of the input." in \
score_response.text
# Test truncation
score_response = requests.post(server.url_for("score"),
json={
"model": model["name"],
"text_1": text_1,
"text_2": text_2,
"truncate_prompt_tokens": 101
})
assert score_response.status_code == 400
assert "Please, select a smaller truncation size." in \
score_response.text
......@@ -39,6 +39,7 @@ class MockModelConfig:
diff_sampling_param: Optional[dict] = None
allowed_local_media_path: str = ""
encoder_config = None
generation_config: str = "auto"
def get_diff_sampling_param(self):
return self.diff_sampling_param or {}
......
......@@ -9,8 +9,8 @@ import os
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.protocol import (ErrorResponse,
LoadLoraAdapterRequest,
UnloadLoraAdapterRequest)
LoadLoRAAdapterRequest,
UnloadLoRAAdapterRequest)
from vllm.entrypoints.openai.serving_models import (BaseModelPath,
OpenAIServingModels)
from vllm.lora.request import LoRARequest
......@@ -53,7 +53,7 @@ async def test_serving_model_name():
@pytest.mark.asyncio
async def test_load_lora_adapter_success():
serving_models = await _async_serving_models_init()
request = LoadLoraAdapterRequest(lora_name="adapter",
request = LoadLoRAAdapterRequest(lora_name="adapter",
lora_path="/path/to/adapter2")
response = await serving_models.load_lora_adapter(request)
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
......@@ -64,7 +64,7 @@ async def test_load_lora_adapter_success():
@pytest.mark.asyncio
async def test_load_lora_adapter_missing_fields():
serving_models = await _async_serving_models_init()
request = LoadLoraAdapterRequest(lora_name="", lora_path="")
request = LoadLoRAAdapterRequest(lora_name="", lora_path="")
response = await serving_models.load_lora_adapter(request)
assert isinstance(response, ErrorResponse)
assert response.type == "InvalidUserInput"
......@@ -74,14 +74,14 @@ async def test_load_lora_adapter_missing_fields():
@pytest.mark.asyncio
async def test_load_lora_adapter_duplicate():
serving_models = await _async_serving_models_init()
request = LoadLoraAdapterRequest(lora_name="adapter1",
request = LoadLoRAAdapterRequest(lora_name="adapter1",
lora_path="/path/to/adapter1")
response = await serving_models.load_lora_adapter(request)
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(
lora_name='adapter1')
assert len(serving_models.lora_requests) == 1
request = LoadLoraAdapterRequest(lora_name="adapter1",
request = LoadLoRAAdapterRequest(lora_name="adapter1",
lora_path="/path/to/adapter1")
response = await serving_models.load_lora_adapter(request)
assert isinstance(response, ErrorResponse)
......@@ -93,12 +93,12 @@ async def test_load_lora_adapter_duplicate():
@pytest.mark.asyncio
async def test_unload_lora_adapter_success():
serving_models = await _async_serving_models_init()
request = LoadLoraAdapterRequest(lora_name="adapter1",
request = LoadLoRAAdapterRequest(lora_name="adapter1",
lora_path="/path/to/adapter1")
response = await serving_models.load_lora_adapter(request)
assert len(serving_models.lora_requests) == 1
request = UnloadLoraAdapterRequest(lora_name="adapter1")
request = UnloadLoRAAdapterRequest(lora_name="adapter1")
response = await serving_models.unload_lora_adapter(request)
assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(
lora_name='adapter1')
......@@ -108,7 +108,7 @@ async def test_unload_lora_adapter_success():
@pytest.mark.asyncio
async def test_unload_lora_adapter_missing_fields():
serving_models = await _async_serving_models_init()
request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None)
request = UnloadLoRAAdapterRequest(lora_name="", lora_int_id=None)
response = await serving_models.unload_lora_adapter(request)
assert isinstance(response, ErrorResponse)
assert response.type == "InvalidUserInput"
......@@ -118,7 +118,7 @@ async def test_unload_lora_adapter_missing_fields():
@pytest.mark.asyncio
async def test_unload_lora_adapter_not_found():
serving_models = await _async_serving_models_init()
request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter")
request = UnloadLoRAAdapterRequest(lora_name="nonexistent_adapter")
response = await serving_models.unload_lora_adapter(request)
assert isinstance(response, ErrorResponse)
assert response.type == "NotFoundError"
......
......@@ -28,5 +28,12 @@ def test_sleep_mode():
response = requests.post(remote_server.url_for("/sleep"),
data={"level": "1"})
assert response.status_code == 200
response = requests.get(remote_server.url_for("/is_sleeping"))
assert response.status_code == 200
assert response.json().get("is_sleeping") is True
response = requests.post(remote_server.url_for("/wake_up"))
assert response.status_code == 200
response = requests.get(remote_server.url_for("/is_sleeping"))
assert response.status_code == 200
assert response.json().get("is_sleeping") is False
......@@ -3,12 +3,14 @@
# imports for guided decoding tests
import io
import json
from unittest.mock import patch
import librosa
import numpy as np
import openai
import pytest
import soundfile as sf
from openai._base_client import AsyncAPIClient
from vllm.assets.audio import AudioAsset
......@@ -120,3 +122,73 @@ async def test_completion_endpoints():
res = await client.completions.create(model=model_name, prompt="Hello")
assert res.code == 400
assert res.message == "The model does not support Completions API"
@pytest.mark.asyncio
async def test_streaming_response(winning_call):
model_name = "openai/whisper-small"
server_args = ["--enforce-eager"]
transcription = ""
with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client()
res_no_stream = await client.audio.transcriptions.create(
model=model_name,
file=winning_call,
response_format="json",
language="en",
temperature=0.0)
# Unfortunately this only works when the openai client is patched
# to use streaming mode, not exposed in the transcription api.
original_post = AsyncAPIClient.post
async def post_with_stream(*args, **kwargs):
kwargs['stream'] = True
return await original_post(*args, **kwargs)
with patch.object(AsyncAPIClient, "post", new=post_with_stream):
client = remote_server.get_async_client()
res = await client.audio.transcriptions.create(
model=model_name,
file=winning_call,
language="en",
temperature=0.0,
extra_body=dict(stream=True))
# Reconstruct from chunks and validate
async for chunk in res:
# just a chunk
text = chunk.choices[0]['delta']['content']
transcription += text
assert transcription == res_no_stream.text
@pytest.mark.asyncio
async def test_stream_options(winning_call):
model_name = "openai/whisper-small"
server_args = ["--enforce-eager"]
with RemoteOpenAIServer(model_name, server_args) as remote_server:
original_post = AsyncAPIClient.post
async def post_with_stream(*args, **kwargs):
kwargs['stream'] = True
return await original_post(*args, **kwargs)
with patch.object(AsyncAPIClient, "post", new=post_with_stream):
client = remote_server.get_async_client()
res = await client.audio.transcriptions.create(
model=model_name,
file=winning_call,
language="en",
temperature=0.0,
extra_body=dict(stream=True,
stream_include_usage=True,
stream_continuous_usage_stats=True))
final = False
continuous = True
async for chunk in res:
if not len(chunk.choices):
# final usage sent
final = True
else:
continuous = continuous and hasattr(chunk, 'usage')
assert final and continuous
# SPDX-License-Identifier: Apache-2.0
from typing import Dict, List
import os
import openai
import pytest
......@@ -34,8 +32,6 @@ def server():
args = [
"--task",
"generate",
"--dtype",
"bfloat16",
"--max-model-len",
"32768",
"--max-num-seqs",
......@@ -57,7 +53,7 @@ async def client(server):
@pytest.fixture(scope="session")
def base64_encoded_video() -> Dict[str, str]:
def base64_encoded_video() -> dict[str, str]:
return {
video_url: encode_video_base64(fetch_video(video_url))
for video_url in TEST_VIDEO_URLS
......@@ -99,7 +95,7 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI,
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=6299, total_tokens=6309)
completion_tokens=10, prompt_tokens=6287, total_tokens=6297)
message = choice.message
message = chat_completion.choices[0].message
......@@ -159,7 +155,7 @@ async def test_single_chat_session_video_beamsearch(client: openai.AsyncOpenAI,
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded(
client: openai.AsyncOpenAI, model_name: str, video_url: str,
base64_encoded_video: Dict[str, str]):
base64_encoded_video: dict[str, str]):
messages = [{
"role":
......@@ -192,7 +188,7 @@ async def test_single_chat_session_video_base64encoded(
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=6299, total_tokens=6309)
completion_tokens=10, prompt_tokens=6287, total_tokens=6297)
message = choice.message
message = chat_completion.choices[0].message
......@@ -217,7 +213,7 @@ async def test_single_chat_session_video_base64encoded(
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded_beamsearch(
client: openai.AsyncOpenAI, model_name: str, video_url: str,
base64_encoded_video: Dict[str, str]):
base64_encoded_video: dict[str, str]):
messages = [{
"role":
......@@ -287,7 +283,7 @@ async def test_chat_streaming_video(client: openai.AsyncOpenAI,
temperature=0.0,
stream=True,
)
chunks: List[str] = []
chunks: list[str] = []
finish_reason_count = 0
async for chunk in stream:
delta = chunk.choices[0].delta
......@@ -310,7 +306,7 @@ async def test_chat_streaming_video(client: openai.AsyncOpenAI,
"video_urls",
[TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))])
async def test_multi_video_input(client: openai.AsyncOpenAI, model_name: str,
video_urls: List[str]):
video_urls: list[str]):
messages = [{
"role":
......
# SPDX-License-Identifier: Apache-2.0
from typing import Dict, List
import openai
import pytest
import os
......@@ -36,8 +34,6 @@ def server():
args = [
"--task",
"generate",
"--dtype",
"bfloat16",
"--max-model-len",
"2048",
"--max-num-seqs",
......@@ -59,7 +55,7 @@ async def client(server):
@pytest.fixture(scope="session")
def base64_encoded_image() -> Dict[str, str]:
def base64_encoded_image() -> dict[str, str]:
return {
image_url: encode_image_base64(fetch_image(image_url))
for image_url in TEST_IMAGE_URLS
......@@ -161,7 +157,7 @@ async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI,
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded(
client: openai.AsyncOpenAI, model_name: str, image_url: str,
base64_encoded_image: Dict[str, str]):
base64_encoded_image: dict[str, str]):
messages = [{
"role":
......@@ -219,7 +215,7 @@ async def test_single_chat_session_image_base64encoded(
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded_beamsearch(
client: openai.AsyncOpenAI, model_name: str, image_url: str,
base64_encoded_image: Dict[str, str]):
base64_encoded_image: dict[str, str]):
messages = [{
"role":
......@@ -289,7 +285,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
temperature=0.0,
stream=True,
)
chunks: List[str] = []
chunks: list[str] = []
finish_reason_count = 0
async for chunk in stream:
delta = chunk.choices[0].delta
......@@ -312,7 +308,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
"image_urls",
[TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
image_urls: List[str]):
image_urls: list[str]):
messages = [{
"role":
......
# SPDX-License-Identifier: Apache-2.0
from typing import Dict
import os
import pytest
import requests
......@@ -38,8 +36,6 @@ def server():
args = [
"--task",
"embed",
"--dtype",
"bfloat16",
"--max-model-len",
"2048",
"--max-num-seqs",
......@@ -57,7 +53,7 @@ def server():
@pytest.fixture(scope="session")
def base64_encoded_image() -> Dict[str, str]:
def base64_encoded_image() -> dict[str, str]:
return {
image_url: encode_image_base64(fetch_image(image_url))
for image_url in TEST_IMAGE_URLS
......
# SPDX-License-Identifier: Apache-2.0
from typing import List
from unittest.mock import MagicMock
import pytest
......@@ -125,7 +124,7 @@ TEST_CASES = [
@pytest.mark.parametrize("streaming, model_output, expected_tool_calls",
TEST_CASES)
def test_tool_call(streaming: bool, model_output: str,
expected_tool_calls: List[FunctionCall]):
expected_tool_calls: list[FunctionCall]):
mock_tokenizer = MagicMock()
tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
mock_tokenizer)
......
# SPDX-License-Identifier: Apache-2.0
from typing import Iterable, List, Tuple, Union
from collections.abc import Iterable
from typing import Union
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaMessage,
......@@ -12,7 +13,7 @@ from vllm.entrypoints.openai.tool_parsers import ToolParser
class StreamingToolReconstructor:
def __init__(self, assert_one_tool_per_delta: bool = True):
self.tool_calls: List[ToolCall] = []
self.tool_calls: list[ToolCall] = []
self.other_content: str = ""
self._assert_one_tool_per_delta = assert_one_tool_per_delta
......@@ -72,7 +73,7 @@ def run_tool_extraction(
request: Union[ChatCompletionRequest, None] = None,
streaming: bool = False,
assert_one_tool_per_delta: bool = True,
) -> Tuple[Union[str, None], List[ToolCall]]:
) -> tuple[Union[str, None], list[ToolCall]]:
if streaming:
reconstructor = run_tool_extraction_streaming(
tool_parser,
......@@ -106,7 +107,7 @@ def run_tool_extraction_streaming(
reconstructor = StreamingToolReconstructor(
assert_one_tool_per_delta=assert_one_tool_per_delta)
previous_text = ""
previous_tokens: List[int] = []
previous_tokens: list[int] = []
for delta in model_deltas:
token_delta = [
tool_parser.vocab.get(token)
......
......@@ -5,10 +5,13 @@ from typing import Optional
import pytest
import os
from packaging.version import Version
from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.assets.image import ImageAsset
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template,
from vllm.entrypoints.chat_utils import (_resolve_hf_chat_template,
_try_extract_ast, load_chat_template,
parse_chat_messages,
parse_chat_messages_futures,
resolve_chat_template_content_format)
......@@ -22,11 +25,14 @@ from ..utils import VLLM_PATH
EXAMPLES_DIR = VLLM_PATH / "examples"
PHI3V_MODEL_ID = os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")
ULTRAVOX_MODEL_ID = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b")
QWEN2VL_MODEL_ID = os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")
QWEN25VL_MODEL_ID = os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct")
MLLAMA_MODEL_ID = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-11B-Vision-Instruct")
LLAMA_GUARD_MODEL_ID = os.path.join(models_path_prefix, "meta-llama/Llama-Guard-3-1B")
HERMES_MODEL_ID = os.path.join(models_path_prefix, "NousResearch/Hermes-3-Llama-3.1-8B")
@pytest.fixture(scope="function")
......@@ -36,7 +42,7 @@ def phi3v_model_config():
tokenizer=PHI3V_MODEL_ID,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="bfloat16",
dtype="auto",
seed=0,
limit_mm_per_prompt={
"image": 2,
......@@ -60,7 +66,7 @@ def mllama_model_config():
tokenizer=MLLAMA_MODEL_ID,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="bfloat16",
dtype="auto",
seed=0,
limit_mm_per_prompt={
"image": 2,
......@@ -671,7 +677,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
tokenizer=MLLAMA_MODEL_ID,
tokenizer_mode="auto",
trust_remote_code=True,
dtype="bfloat16",
dtype="auto",
seed=0,
limit_mm_per_prompt={
"image": 2,
......@@ -705,25 +711,70 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
vllm_result = apply_hf_chat_template(
tokenizer,
trust_remote_code=model_config.trust_remote_code,
conversation=conversation,
chat_template=None,
tools=None,
add_generation_prompt=True,
)
assert hf_result == vllm_result
@pytest.mark.parametrize(
"model",
[
QWEN2VL_MODEL_ID, # tokenizer.chat_template is of type str
HERMES_MODEL_ID, # tokenizer.chat_template is of type dict
])
@pytest.mark.parametrize("use_tools", [True, False])
def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
"""checks that chat_template is a dict type for HF models."""
# Build the tokenizer group and grab the underlying tokenizer
tokenizer_group = TokenizerGroup(
model,
enable_lora=False,
max_num_seqs=5,
max_input_length=None,
)
tokenizer = tokenizer_group.tokenizer
tools = [{
"type": "function",
"function": {
"name": "dummy_function_name",
"description": "This is a dummy function",
"parameters": sample_json_schema
}
}] if use_tools else None
# Test detecting the tokenizer's chat_template
chat_template = _resolve_hf_chat_template(
tokenizer,
chat_template=None,
tools=tools,
trust_remote_code=True,
)
assert isinstance(chat_template, str)
# yapf: disable
@pytest.mark.parametrize(
("model", "expected_format"),
[(PHI3V_MODEL_ID, "string"),
(QWEN2VL_MODEL_ID, "openai"),
(QWEN25VL_MODEL_ID, "openai"),
(ULTRAVOX_MODEL_ID, "string"),
(MLLAMA_MODEL_ID, "openai"),
(LLAMA_GUARD_MODEL_ID, "openai")],
)
# yapf: enable
def test_resolve_content_format_hf_defined(model, expected_format):
if model == QWEN25VL_MODEL_ID and Version(TRANSFORMERS_VERSION) < Version(
"4.49.0"):
pytest.skip("Qwen2.5-VL requires transformers>=4.49.0")
tokenizer_group = TokenizerGroup(
model,
enable_lora=False,
......@@ -732,7 +783,13 @@ def test_resolve_content_format_hf_defined(model, expected_format):
)
tokenizer = tokenizer_group.tokenizer
chat_template = tokenizer.chat_template
# Test detecting the tokenizer's chat_template
chat_template = _resolve_hf_chat_template(
tokenizer,
chat_template=None,
tools=None,
trust_remote_code=True,
)
assert isinstance(chat_template, str)
print("[TEXT]")
......@@ -742,8 +799,10 @@ def test_resolve_content_format_hf_defined(model, expected_format):
resolved_format = resolve_chat_template_content_format(
None, # Test detecting the tokenizer's chat_template
None,
"auto",
tokenizer,
trust_remote_code=True,
)
assert resolved_format == expected_format
......@@ -793,8 +852,10 @@ def test_resolve_content_format_examples(template_path, expected_format):
resolved_format = resolve_chat_template_content_format(
chat_template,
None,
"auto",
dummy_tokenizer,
trust_remote_code=True,
)
assert resolved_format == expected_format
# SPDX-License-Identifier: Apache-2.0
import asyncio
import tempfile
from pathlib import Path
from ssl import SSLContext
import pytest
from vllm.entrypoints.ssl import SSLCertRefresher
class MockSSLContext(SSLContext):
def __init__(self):
self.load_cert_chain_count = 0
self.load_ca_count = 0
def load_cert_chain(
self,
certfile,
keyfile=None,
password=None,
):
self.load_cert_chain_count += 1
def load_verify_locations(
self,
cafile=None,
capath=None,
cadata=None,
):
self.load_ca_count += 1
def create_file() -> str:
with tempfile.NamedTemporaryFile(dir='/tmp', delete=False) as f:
return f.name
def touch_file(path: str) -> None:
Path(path).touch()
@pytest.mark.asyncio
async def test_ssl_refresher():
ssl_context = MockSSLContext()
key_path = create_file()
cert_path = create_file()
ca_path = create_file()
ssl_refresher = SSLCertRefresher(ssl_context, key_path, cert_path, ca_path)
await asyncio.sleep(1)
assert ssl_context.load_cert_chain_count == 0
assert ssl_context.load_ca_count == 0
touch_file(key_path)
await asyncio.sleep(1)
assert ssl_context.load_cert_chain_count == 1
assert ssl_context.load_ca_count == 0
touch_file(cert_path)
touch_file(ca_path)
await asyncio.sleep(1)
assert ssl_context.load_cert_chain_count == 2
assert ssl_context.load_ca_count == 1
ssl_refresher.stop()
touch_file(cert_path)
touch_file(ca_path)
await asyncio.sleep(1)
assert ssl_context.load_cert_chain_count == 2
assert ssl_context.load_ca_count == 1
# SPDX-License-Identifier: Apache-2.0
from vllm import SamplingParams
from vllm.config import LoadFormat
test_model = "openai-community/gpt2"
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)
def test_model_loader_download_files(vllm_runner):
with vllm_runner(test_model,
load_format=LoadFormat.FASTSAFETENSORS) as llm:
deserialized_outputs = llm.generate(prompts, sampling_params)
assert deserialized_outputs
# SPDX-License-Identifier: Apache-2.0
import glob
import tempfile
import huggingface_hub.constants
import torch
from vllm.model_executor.model_loader.weight_utils import (
download_weights_from_hf, fastsafetensors_weights_iterator,
safetensors_weights_iterator)
def test_fastsafetensors_model_loader():
with tempfile.TemporaryDirectory() as tmpdir:
huggingface_hub.constants.HF_HUB_OFFLINE = False
download_weights_from_hf("openai-community/gpt2",
allow_patterns=["*.safetensors"],
cache_dir=tmpdir)
safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True)
assert len(safetensors) > 0
fastsafetensors_tensors = {}
hf_safetensors_tensors = {}
for name, tensor in fastsafetensors_weights_iterator(
safetensors, True):
fastsafetensors_tensors[name] = tensor
for name, tensor in safetensors_weights_iterator(safetensors, True):
hf_safetensors_tensors[name] = tensor
assert len(fastsafetensors_tensors) == len(hf_safetensors_tensors)
for name, fastsafetensors_tensor in fastsafetensors_tensors.items():
fastsafetensors_tensor = fastsafetensors_tensor.to('cpu')
assert fastsafetensors_tensor.dtype == hf_safetensors_tensors[
name].dtype
assert fastsafetensors_tensor.shape == hf_safetensors_tensors[
name].shape
assert torch.all(
fastsafetensors_tensor.eq(hf_safetensors_tensors[name]))
if __name__ == "__main__":
test_fastsafetensors_model_loader()
# SPDX-License-Identifier: Apache-2.0
from typing import Optional, Tuple, Union
from typing import Optional, Union
import torch
......@@ -9,8 +9,7 @@ from vllm.platforms import current_platform
# Using the default value (240.0) from pytorch will cause accuracy
# issue on dynamic quantization models. Here use 224.0 for rocm.
ROCM_FP8_MAX = 224.0
FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm() \
else torch.float8_e4m3fn
FP8_DTYPE = current_platform.fp8_dtype()
def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
......@@ -19,7 +18,7 @@ def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
def ref_dynamic_per_token_quant(x: torch.tensor,
quant_dtype: torch.dtype,
scale_ub: Optional[torch.tensor] = None) \
-> Tuple[torch.tensor, torch.tensor]:
-> tuple[torch.tensor, torch.tensor]:
assert quant_dtype in [torch.int8, FP8_DTYPE]
if scale_ub is not None:
......@@ -68,7 +67,7 @@ def ref_dynamic_per_token_quant(x: torch.tensor,
# ref_dynamic_per_token_quant, when we have a dynamic_per_tensor int8 quant
# kernel
def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \
-> Tuple[torch.tensor, torch.tensor]:
-> tuple[torch.tensor, torch.tensor]:
fp8_traits = torch.finfo(FP8_DTYPE)
fp8_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \
......
# SPDX-License-Identifier: Apache-2.0
import random
from typing import Type
import pytest
import torch
......@@ -86,7 +85,7 @@ def test_act_and_mul(
@pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode()
def test_activation(
activation: Type[torch.nn.Module],
activation: type[torch.nn.Module],
num_tokens: int,
d: int,
dtype: torch.dtype,
......
# SPDX-License-Identifier: Apache-2.0
import pytest
import torch
from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.allspark_utils import (
ALLSPARK_AMPERE_K_ALIGN, ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
ALLSPARK_AMPERE_N_ALIGN)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
quantize_weights)
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
def is_gptq_allspark_supported(min_capability: int,
max_capability: int) -> bool:
if not current_platform.is_cuda():
return False
capability = current_platform.get_device_capability()
assert capability is not None
return capability.to_int() >= min_capability \
and capability.to_int() <= max_capability
MNK_FACTORS = [
(1, 4, 8),
(13, 17, 67),
(26, 37, 13),
(48, 16, 24),
(67, 13, 88),
(257, 13, 11),
(658, 13, 11),
(1033, 9, 17),
]
DTYPES = [torch.float16, torch.bfloat16]
HAS_ZP_OPTS = [False, True]
def compute_max_diff(output, output_ref):
return torch.mean(torch.abs(output - output_ref)) / torch.mean(
torch.abs(output_ref))
def rand_data(shape, dtype=torch.float16):
return torch.randn(shape, dtype=dtype, device="cuda")
@pytest.mark.skipif(
not is_gptq_allspark_supported(80, 89),
reason="AllSpark Ampere kernel is not supported on this GPU type.")
@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
@pytest.mark.parametrize("group_size", [-1])
@pytest.mark.parametrize("has_zp", HAS_ZP_OPTS)
@pytest.mark.parametrize("dtype", DTYPES)
def test_gptq_allspark_gemm_ampere(mnk_factors, group_size, has_zp, dtype):
m_factor, n_factor, k_factor = mnk_factors
m = m_factor
n = n_factor * ALLSPARK_AMPERE_N_ALIGN
k = k_factor * ALLSPARK_AMPERE_K_ALIGN
input = rand_data((m, k), dtype=dtype)
weight = rand_data((k, n), dtype=dtype)
# Quantize (and apply act_order if provided)
w_ref, qw, s, zp = quantize_weights(weight, scalar_types.uint8b128,
group_size, has_zp)
qw = qw.to(torch.uint8)
if has_zp:
zp = zp.to(dtype)
properties = torch.cuda.get_device_properties(qw.device.index)
sm_count = properties.multi_processor_count
sm_version = properties.major * 10 + properties.minor
n_32align = (n + 32 - 1) // 32 * 32
qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(
qw, s, zp, has_zp)
opcheck(torch.ops._C.rearrange_kn_weight_as_n32k16_order,
(qw, s, zp, has_zp, qw_reorder, s_reorder, zp_reorder, k, n,
n_32align))
opcheck(torch.ops._C.allspark_w8a16_gemm,
(input, qw_reorder, s_reorder, zp_reorder, n, group_size, sm_count,
sm_version, ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, has_zp, True),
test_utils=DEFAULT_OPCHECK_TEST_UTILS)
output = ops.allspark_w8a16_gemm(input, qw_reorder, s_reorder, zp_reorder,
n, group_size, sm_count, sm_version,
ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
has_zp, True)
output_ref = torch.matmul(input, w_ref)
torch.cuda.synchronize()
max_diff = compute_max_diff(output, output_ref)
assert max_diff < 0.04
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment