Commit 469e903b authored by zhuwenwen
Browse files

Merge tag 'v0.8.2' into v0.8.2-dev

parents 389ebcf7 25f560a6
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
import contextlib import contextlib
import os import os
from typing import Any, List, NamedTuple from typing import Any, NamedTuple
import openai # use the official client for correctness check import openai # use the official client for correctness check
import pytest import pytest
...@@ -11,7 +11,6 @@ from ...utils import RemoteOpenAIServer, models_path_prefix ...@@ -11,7 +11,6 @@ from ...utils import RemoteOpenAIServer, models_path_prefix
# # any model with a chat template should work here # # any model with a chat template should work here
MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct") MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
API_KEY = "abc-123" API_KEY = "abc-123"
ERROR_API_KEY = "abc" ERROR_API_KEY = "abc"
ROOT_PATH = "llm" ROOT_PATH = "llm"
...@@ -28,8 +27,6 @@ def server(): ...@@ -28,8 +27,6 @@ def server():
"4080", "4080",
"--root-path", # use --root-path=/llm for testing "--root-path", # use --root-path=/llm for testing
"/" + ROOT_PATH, "/" + ROOT_PATH,
"--chat-template",
DUMMY_CHAT_TEMPLATE,
] ]
envs = os.environ.copy() envs = os.environ.copy()
...@@ -40,7 +37,7 @@ def server(): ...@@ -40,7 +37,7 @@ def server():
class TestCase(NamedTuple): class TestCase(NamedTuple):
model_name: str model_name: str
base_url: List[str] base_url: list[str]
api_key: str api_key: str
expected_error: Any expected_error: Any
......
...@@ -36,10 +36,10 @@ INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/c ...@@ -36,10 +36,10 @@ INPUT_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/c
INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} INVALID_INPUT_BATCH = """{"invalid_field": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}""" {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "NousResearch/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}"""
INPUT_EMBEDDING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}} INPUT_EMBEDDING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are an unhelpful assistant."}} {"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are an unhelpful assistant."}}
{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "Hello world!"}} {"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "Hello world!"}}
{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}""" {"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}"""
INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
...@@ -55,7 +55,7 @@ def test_empty_file(): ...@@ -55,7 +55,7 @@ def test_empty_file():
proc = subprocess.Popen([ proc = subprocess.Popen([
sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i", sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
input_file.name, "-o", output_file.name, "--model", input_file.name, "-o", output_file.name, "--model",
os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct") os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
], ) ], )
proc.communicate() proc.communicate()
proc.wait() proc.wait()
...@@ -115,7 +115,7 @@ def test_embeddings(): ...@@ -115,7 +115,7 @@ def test_embeddings():
proc = subprocess.Popen([ proc = subprocess.Popen([
sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i", sys.executable, "-m", "vllm.entrypoints.openai.run_batch", "-i",
input_file.name, "-o", output_file.name, "--model", input_file.name, "-o", output_file.name, "--model",
os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct") os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
], ) ], )
proc.communicate() proc.communicate()
proc.wait() proc.wait()
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os import os
import math
from typing import Any
import pytest import pytest
import requests import requests
import torch.nn.functional as F
from torch import tensor
from vllm.entrypoints.openai.protocol import ScoreResponse from vllm.entrypoints.openai.protocol import ScoreResponse
from ...utils import RemoteOpenAIServer, models_path_prefix from ...utils import RemoteOpenAIServer, models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3") MODELS = [
{
"name": os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3"),
@pytest.fixture(scope="module") "is_cross_encoder": True
def server(): },
args = ["--enforce-eager", "--max-model-len", "100"] {
"name": "BAAI/bge-base-en-v1.5",
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: "is_cross_encoder": False
},
]
DTYPE = "half"
def run_transformers(hf_model, model, text_pairs):
    """Compute reference scores for ``text_pairs`` with a HuggingFace model.

    Args:
        hf_model: Reference model wrapper. ``predict`` is called when the
            model is a cross-encoder, ``encode`` otherwise.
        model: Model description; only the ``"is_cross_encoder"`` key is read.
        text_pairs: Iterable of two-element text pairs to score.

    Returns:
        A list with one plain Python ``float`` per pair, in input order.
    """
    if model["is_cross_encoder"]:
        # Cross-encoders score each pair directly.
        return hf_model.predict(text_pairs).tolist()

    # Otherwise each side of a pair is embedded and the score is the cosine
    # similarity of the two embeddings. ``.item()`` unwraps the 0-dim tensor
    # so both branches hand back the same element type (Python floats).
    return [
        F.cosine_similarity(tensor(embedded[0]), tensor(embedded[1]),
                            dim=0).item()
        for embedded in (hf_model.encode(text_pair)
                         for text_pair in text_pairs)
    ]
@pytest.fixture(scope="class", params=MODELS)
def model(request):
    """Class-scoped fixture parametrized over the entries of ``MODELS``."""
    # No teardown is needed, so the current parameter is handed back directly.
    return request.param
@pytest.fixture(scope="class")
def server(model: dict[str, Any]):
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
with RemoteOpenAIServer(model["name"], args) as remote_server:
yield remote_server yield remote_server
@pytest.mark.asyncio @pytest.fixture(scope="class")
@pytest.mark.parametrize("model_name", [MODEL_NAME]) def runner(model: dict[str, Any], hf_runner):
def test_text_1_str_text_2_list(server: RemoteOpenAIServer, model_name: str): kwargs = {
text_1 = "What is the capital of France?" "dtype": DTYPE,
text_2 = [ "is_cross_encoder" if model["is_cross_encoder"]\
"The capital of Brazil is Brasilia.", "The capital of France is Paris." else "is_sentence_transformer": True
] }
score_response = requests.post(server.url_for("score"), with hf_runner(model["name"], **kwargs) as hf_model:
json={ yield hf_model
"model": model_name,
"text_1": text_1,
"text_2": text_2, class TestModel:
})
score_response.raise_for_status() def test_text_1_str_text_2_list(self, server: RemoteOpenAIServer,
score = ScoreResponse.model_validate(score_response.json()) model: dict[str, Any], runner):
text_1 = "What is the capital of France?"
assert score.id is not None text_2 = [
assert score.data is not None "The capital of Brazil is Brasilia.",
assert len(score.data) == 2 "The capital of France is Paris."
assert score.data[0].score <= 0.01 ]
assert score.data[1].score >= 0.9
score_response = requests.post(server.url_for("score"),
json={
@pytest.mark.asyncio "model": model["name"],
@pytest.mark.parametrize("model_name", [MODEL_NAME]) "text_1": text_1,
def test_text_1_list_text_2_list(server: RemoteOpenAIServer, model_name: str): "text_2": text_2,
text_1 = [ })
"What is the capital of the United States?", score_response.raise_for_status()
"What is the capital of France?" score = ScoreResponse.model_validate(score_response.json())
]
text_2 = [ assert score.id is not None
"The capital of Brazil is Brasilia.", "The capital of France is Paris." assert score.data is not None
] assert len(score.data) == 2
score_response = requests.post(server.url_for("score"), vllm_outputs = [d.score for d in score.data]
json={
"model": model_name, text_pairs = [[text_1, text_2[0]], [text_1, text_2[1]]]
"text_1": text_1, hf_outputs = run_transformers(runner, model, text_pairs)
"text_2": text_2,
}) for i in range(len(vllm_outputs)):
score_response.raise_for_status() assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
score = ScoreResponse.model_validate(score_response.json())
def test_text_1_list_text_2_list(self, server: RemoteOpenAIServer,
assert score.id is not None model: dict[str, Any], runner):
assert score.data is not None text_1 = [
assert len(score.data) == 2 "What is the capital of the United States?",
assert score.data[0].score <= 0.01 "What is the capital of France?"
assert score.data[1].score >= 0.9 ]
text_2 = [
"The capital of Brazil is Brasilia.",
@pytest.mark.asyncio "The capital of France is Paris."
@pytest.mark.parametrize("model_name", [MODEL_NAME]) ]
def test_text_1_str_text_2_str(server: RemoteOpenAIServer, model_name: str):
text_1 = "What is the capital of France?" score_response = requests.post(server.url_for("score"),
text_2 = "The capital of France is Paris." json={
"model": model["name"],
score_response = requests.post(server.url_for("score"), "text_1": text_1,
json={ "text_2": text_2,
"model": model_name, })
"text_1": text_1, score_response.raise_for_status()
"text_2": text_2, score = ScoreResponse.model_validate(score_response.json())
})
score_response.raise_for_status() assert score.id is not None
score = ScoreResponse.model_validate(score_response.json()) assert score.data is not None
assert len(score.data) == 2
assert score.id is not None
assert score.data is not None vllm_outputs = [d.score for d in score.data]
assert len(score.data) == 1
assert score.data[0].score >= 0.9 text_pairs = [[text_1[0], text_2[0]], [text_1[1], text_2[1]]]
hf_outputs = run_transformers(runner, model, text_pairs)
@pytest.mark.asyncio for i in range(len(vllm_outputs)):
@pytest.mark.parametrize("model_name", [MODEL_NAME]) assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
def test_score_max_model_len(server: RemoteOpenAIServer, model_name: str):
def test_text_1_str_text_2_str(self, server: RemoteOpenAIServer,
text_1 = "What is the capital of France?" * 20 model: dict[str, Any], runner):
text_2 = [ text_1 = "What is the capital of France?"
"The capital of Brazil is Brasilia.", "The capital of France is Paris." text_2 = "The capital of France is Paris."
]
score_response = requests.post(server.url_for("score"),
score_response = requests.post(server.url_for("score"), json={
json={ "model": model["name"],
"model": model_name, "text_1": text_1,
"text_1": text_1, "text_2": text_2,
"text_2": text_2, })
}) score_response.raise_for_status()
assert score_response.status_code == 400 score = ScoreResponse.model_validate(score_response.json())
# Assert just a small fragments of the response
assert "Please reduce the length of the input." in \ assert score.id is not None
score_response.text assert score.data is not None
assert len(score.data) == 1
# Test truncation
score_response = requests.post(server.url_for("score"), vllm_outputs = [d.score for d in score.data]
json={
"model": model_name, text_pairs = [[text_1, text_2]]
"text_1": text_1, hf_outputs = run_transformers(runner, model, text_pairs)
"text_2": text_2,
"truncate_prompt_tokens": 101 for i in range(len(vllm_outputs)):
}) assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
assert score_response.status_code == 400
assert "Please, select a smaller truncation size." in \ def test_score_max_model_len(self, server: RemoteOpenAIServer,
score_response.text model: dict[str, Any]):
text_1 = "What is the capital of France?" * 20
text_2 = [
"The capital of Brazil is Brasilia.",
"The capital of France is Paris."
]
score_response = requests.post(server.url_for("score"),
json={
"model": model["name"],
"text_1": text_1,
"text_2": text_2,
})
assert score_response.status_code == 400
# Assert just a small fragment of the response
assert "Please reduce the length of the input." in \
score_response.text
# Test truncation
score_response = requests.post(server.url_for("score"),
json={
"model": model["name"],
"text_1": text_1,
"text_2": text_2,
"truncate_prompt_tokens": 101
})
assert score_response.status_code == 400
assert "Please, select a smaller truncation size." in \
score_response.text
...@@ -39,6 +39,7 @@ class MockModelConfig: ...@@ -39,6 +39,7 @@ class MockModelConfig:
diff_sampling_param: Optional[dict] = None diff_sampling_param: Optional[dict] = None
allowed_local_media_path: str = "" allowed_local_media_path: str = ""
encoder_config = None encoder_config = None
generation_config: str = "auto"
def get_diff_sampling_param(self): def get_diff_sampling_param(self):
return self.diff_sampling_param or {} return self.diff_sampling_param or {}
......
...@@ -9,8 +9,8 @@ import os ...@@ -9,8 +9,8 @@ import os
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.protocol import (ErrorResponse, from vllm.entrypoints.openai.protocol import (ErrorResponse,
LoadLoraAdapterRequest, LoadLoRAAdapterRequest,
UnloadLoraAdapterRequest) UnloadLoRAAdapterRequest)
from vllm.entrypoints.openai.serving_models import (BaseModelPath, from vllm.entrypoints.openai.serving_models import (BaseModelPath,
OpenAIServingModels) OpenAIServingModels)
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
...@@ -53,7 +53,7 @@ async def test_serving_model_name(): ...@@ -53,7 +53,7 @@ async def test_serving_model_name():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_load_lora_adapter_success(): async def test_load_lora_adapter_success():
serving_models = await _async_serving_models_init() serving_models = await _async_serving_models_init()
request = LoadLoraAdapterRequest(lora_name="adapter", request = LoadLoRAAdapterRequest(lora_name="adapter",
lora_path="/path/to/adapter2") lora_path="/path/to/adapter2")
response = await serving_models.load_lora_adapter(request) response = await serving_models.load_lora_adapter(request)
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter') assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
...@@ -64,7 +64,7 @@ async def test_load_lora_adapter_success(): ...@@ -64,7 +64,7 @@ async def test_load_lora_adapter_success():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_load_lora_adapter_missing_fields(): async def test_load_lora_adapter_missing_fields():
serving_models = await _async_serving_models_init() serving_models = await _async_serving_models_init()
request = LoadLoraAdapterRequest(lora_name="", lora_path="") request = LoadLoRAAdapterRequest(lora_name="", lora_path="")
response = await serving_models.load_lora_adapter(request) response = await serving_models.load_lora_adapter(request)
assert isinstance(response, ErrorResponse) assert isinstance(response, ErrorResponse)
assert response.type == "InvalidUserInput" assert response.type == "InvalidUserInput"
...@@ -74,14 +74,14 @@ async def test_load_lora_adapter_missing_fields(): ...@@ -74,14 +74,14 @@ async def test_load_lora_adapter_missing_fields():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_load_lora_adapter_duplicate(): async def test_load_lora_adapter_duplicate():
serving_models = await _async_serving_models_init() serving_models = await _async_serving_models_init()
request = LoadLoraAdapterRequest(lora_name="adapter1", request = LoadLoRAAdapterRequest(lora_name="adapter1",
lora_path="/path/to/adapter1") lora_path="/path/to/adapter1")
response = await serving_models.load_lora_adapter(request) response = await serving_models.load_lora_adapter(request)
assert response == LORA_LOADING_SUCCESS_MESSAGE.format( assert response == LORA_LOADING_SUCCESS_MESSAGE.format(
lora_name='adapter1') lora_name='adapter1')
assert len(serving_models.lora_requests) == 1 assert len(serving_models.lora_requests) == 1
request = LoadLoraAdapterRequest(lora_name="adapter1", request = LoadLoRAAdapterRequest(lora_name="adapter1",
lora_path="/path/to/adapter1") lora_path="/path/to/adapter1")
response = await serving_models.load_lora_adapter(request) response = await serving_models.load_lora_adapter(request)
assert isinstance(response, ErrorResponse) assert isinstance(response, ErrorResponse)
...@@ -93,12 +93,12 @@ async def test_load_lora_adapter_duplicate(): ...@@ -93,12 +93,12 @@ async def test_load_lora_adapter_duplicate():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_unload_lora_adapter_success(): async def test_unload_lora_adapter_success():
serving_models = await _async_serving_models_init() serving_models = await _async_serving_models_init()
request = LoadLoraAdapterRequest(lora_name="adapter1", request = LoadLoRAAdapterRequest(lora_name="adapter1",
lora_path="/path/to/adapter1") lora_path="/path/to/adapter1")
response = await serving_models.load_lora_adapter(request) response = await serving_models.load_lora_adapter(request)
assert len(serving_models.lora_requests) == 1 assert len(serving_models.lora_requests) == 1
request = UnloadLoraAdapterRequest(lora_name="adapter1") request = UnloadLoRAAdapterRequest(lora_name="adapter1")
response = await serving_models.unload_lora_adapter(request) response = await serving_models.unload_lora_adapter(request)
assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format( assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(
lora_name='adapter1') lora_name='adapter1')
...@@ -108,7 +108,7 @@ async def test_unload_lora_adapter_success(): ...@@ -108,7 +108,7 @@ async def test_unload_lora_adapter_success():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_unload_lora_adapter_missing_fields(): async def test_unload_lora_adapter_missing_fields():
serving_models = await _async_serving_models_init() serving_models = await _async_serving_models_init()
request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None) request = UnloadLoRAAdapterRequest(lora_name="", lora_int_id=None)
response = await serving_models.unload_lora_adapter(request) response = await serving_models.unload_lora_adapter(request)
assert isinstance(response, ErrorResponse) assert isinstance(response, ErrorResponse)
assert response.type == "InvalidUserInput" assert response.type == "InvalidUserInput"
...@@ -118,7 +118,7 @@ async def test_unload_lora_adapter_missing_fields(): ...@@ -118,7 +118,7 @@ async def test_unload_lora_adapter_missing_fields():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_unload_lora_adapter_not_found(): async def test_unload_lora_adapter_not_found():
serving_models = await _async_serving_models_init() serving_models = await _async_serving_models_init()
request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter") request = UnloadLoRAAdapterRequest(lora_name="nonexistent_adapter")
response = await serving_models.unload_lora_adapter(request) response = await serving_models.unload_lora_adapter(request)
assert isinstance(response, ErrorResponse) assert isinstance(response, ErrorResponse)
assert response.type == "NotFoundError" assert response.type == "NotFoundError"
......
...@@ -28,5 +28,12 @@ def test_sleep_mode(): ...@@ -28,5 +28,12 @@ def test_sleep_mode():
response = requests.post(remote_server.url_for("/sleep"), response = requests.post(remote_server.url_for("/sleep"),
data={"level": "1"}) data={"level": "1"})
assert response.status_code == 200 assert response.status_code == 200
response = requests.get(remote_server.url_for("/is_sleeping"))
assert response.status_code == 200
assert response.json().get("is_sleeping") is True
response = requests.post(remote_server.url_for("/wake_up")) response = requests.post(remote_server.url_for("/wake_up"))
assert response.status_code == 200 assert response.status_code == 200
response = requests.get(remote_server.url_for("/is_sleeping"))
assert response.status_code == 200
assert response.json().get("is_sleeping") is False
...@@ -3,12 +3,14 @@ ...@@ -3,12 +3,14 @@
# imports for guided decoding tests # imports for guided decoding tests
import io import io
import json import json
from unittest.mock import patch
import librosa import librosa
import numpy as np import numpy as np
import openai import openai
import pytest import pytest
import soundfile as sf import soundfile as sf
from openai._base_client import AsyncAPIClient
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
...@@ -120,3 +122,73 @@ async def test_completion_endpoints(): ...@@ -120,3 +122,73 @@ async def test_completion_endpoints():
res = await client.completions.create(model=model_name, prompt="Hello") res = await client.completions.create(model=model_name, prompt="Hello")
assert res.code == 400 assert res.code == 400
assert res.message == "The model does not support Completions API" assert res.message == "The model does not support Completions API"
@pytest.mark.asyncio
async def test_streaming_response(winning_call):
    """Streamed transcription chunks must join into the non-streamed text."""
    model_name = "openai/whisper-small"
    server_args = ["--enforce-eager"]
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        # Baseline: a regular (non-streaming) transcription request.
        baseline_client = remote_server.get_async_client()
        res_no_stream = await baseline_client.audio.transcriptions.create(
            model=model_name,
            file=winning_call,
            response_format="json",
            language="en",
            temperature=0.0,
        )

        # The transcription API does not expose streaming, so the openai
        # client's low-level ``post`` is patched to always request it.
        unpatched_post = AsyncAPIClient.post

        async def forced_stream_post(*args, **kwargs):
            return await unpatched_post(*args, **{**kwargs, 'stream': True})

        pieces = []
        with patch.object(AsyncAPIClient, "post", new=forced_stream_post):
            streaming_client = remote_server.get_async_client()
            res = await streaming_client.audio.transcriptions.create(
                model=model_name,
                file=winning_call,
                language="en",
                temperature=0.0,
                extra_body={"stream": True},
            )
            # Collect the text delta carried by each streamed chunk.
            async for chunk in res:
                pieces.append(chunk.choices[0]['delta']['content'])

        # Reassembled stream output must equal the baseline transcription.
        transcription = "".join(pieces)
        assert transcription == res_no_stream.text
@pytest.mark.asyncio
async def test_stream_options(winning_call):
    """Streaming with usage options yields per-chunk usage and a final chunk."""
    model_name = "openai/whisper-small"
    server_args = ["--enforce-eager"]
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        # Patch the openai client's low-level ``post`` so that every request
        # is issued in streaming mode.
        unpatched_post = AsyncAPIClient.post

        async def forced_stream_post(*args, **kwargs):
            return await unpatched_post(*args, **{**kwargs, 'stream': True})

        with patch.object(AsyncAPIClient, "post", new=forced_stream_post):
            client = remote_server.get_async_client()
            stream_options = {
                "stream": True,
                "stream_include_usage": True,
                "stream_continuous_usage_stats": True,
            }
            res = await client.audio.transcriptions.create(
                model=model_name,
                file=winning_call,
                language="en",
                temperature=0.0,
                extra_body=stream_options,
            )

            final = False
            continuous = True
            async for chunk in res:
                if len(chunk.choices) == 0:
                    # A chunk without choices is the final usage message.
                    final = True
                elif continuous and not hasattr(chunk, 'usage'):
                    # A content chunk lacking ``usage`` breaks continuity.
                    continuous = False

            assert final
            assert continuous
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from typing import Dict, List
import os import os
import openai import openai
import pytest import pytest
...@@ -34,8 +32,6 @@ def server(): ...@@ -34,8 +32,6 @@ def server():
args = [ args = [
"--task", "--task",
"generate", "generate",
"--dtype",
"bfloat16",
"--max-model-len", "--max-model-len",
"32768", "32768",
"--max-num-seqs", "--max-num-seqs",
...@@ -57,7 +53,7 @@ async def client(server): ...@@ -57,7 +53,7 @@ async def client(server):
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def base64_encoded_video() -> Dict[str, str]: def base64_encoded_video() -> dict[str, str]:
return { return {
video_url: encode_video_base64(fetch_video(video_url)) video_url: encode_video_base64(fetch_video(video_url))
for video_url in TEST_VIDEO_URLS for video_url in TEST_VIDEO_URLS
...@@ -99,7 +95,7 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI, ...@@ -99,7 +95,7 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI,
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
assert choice.finish_reason == "length" assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage( assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=6299, total_tokens=6309) completion_tokens=10, prompt_tokens=6287, total_tokens=6297)
message = choice.message message = choice.message
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
...@@ -159,7 +155,7 @@ async def test_single_chat_session_video_beamsearch(client: openai.AsyncOpenAI, ...@@ -159,7 +155,7 @@ async def test_single_chat_session_video_beamsearch(client: openai.AsyncOpenAI,
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded( async def test_single_chat_session_video_base64encoded(
client: openai.AsyncOpenAI, model_name: str, video_url: str, client: openai.AsyncOpenAI, model_name: str, video_url: str,
base64_encoded_video: Dict[str, str]): base64_encoded_video: dict[str, str]):
messages = [{ messages = [{
"role": "role":
...@@ -192,7 +188,7 @@ async def test_single_chat_session_video_base64encoded( ...@@ -192,7 +188,7 @@ async def test_single_chat_session_video_base64encoded(
choice = chat_completion.choices[0] choice = chat_completion.choices[0]
assert choice.finish_reason == "length" assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage( assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=6299, total_tokens=6309) completion_tokens=10, prompt_tokens=6287, total_tokens=6297)
message = choice.message message = choice.message
message = chat_completion.choices[0].message message = chat_completion.choices[0].message
...@@ -217,7 +213,7 @@ async def test_single_chat_session_video_base64encoded( ...@@ -217,7 +213,7 @@ async def test_single_chat_session_video_base64encoded(
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded_beamsearch( async def test_single_chat_session_video_base64encoded_beamsearch(
client: openai.AsyncOpenAI, model_name: str, video_url: str, client: openai.AsyncOpenAI, model_name: str, video_url: str,
base64_encoded_video: Dict[str, str]): base64_encoded_video: dict[str, str]):
messages = [{ messages = [{
"role": "role":
...@@ -287,7 +283,7 @@ async def test_chat_streaming_video(client: openai.AsyncOpenAI, ...@@ -287,7 +283,7 @@ async def test_chat_streaming_video(client: openai.AsyncOpenAI,
temperature=0.0, temperature=0.0,
stream=True, stream=True,
) )
chunks: List[str] = [] chunks: list[str] = []
finish_reason_count = 0 finish_reason_count = 0
async for chunk in stream: async for chunk in stream:
delta = chunk.choices[0].delta delta = chunk.choices[0].delta
...@@ -310,7 +306,7 @@ async def test_chat_streaming_video(client: openai.AsyncOpenAI, ...@@ -310,7 +306,7 @@ async def test_chat_streaming_video(client: openai.AsyncOpenAI,
"video_urls", "video_urls",
[TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]) [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))])
async def test_multi_video_input(client: openai.AsyncOpenAI, model_name: str, async def test_multi_video_input(client: openai.AsyncOpenAI, model_name: str,
video_urls: List[str]): video_urls: list[str]):
messages = [{ messages = [{
"role": "role":
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from typing import Dict, List
import openai import openai
import pytest import pytest
import os import os
...@@ -36,8 +34,6 @@ def server(): ...@@ -36,8 +34,6 @@ def server():
args = [ args = [
"--task", "--task",
"generate", "generate",
"--dtype",
"bfloat16",
"--max-model-len", "--max-model-len",
"2048", "2048",
"--max-num-seqs", "--max-num-seqs",
...@@ -59,7 +55,7 @@ async def client(server): ...@@ -59,7 +55,7 @@ async def client(server):
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def base64_encoded_image() -> Dict[str, str]: def base64_encoded_image() -> dict[str, str]:
return { return {
image_url: encode_image_base64(fetch_image(image_url)) image_url: encode_image_base64(fetch_image(image_url))
for image_url in TEST_IMAGE_URLS for image_url in TEST_IMAGE_URLS
...@@ -161,7 +157,7 @@ async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI, ...@@ -161,7 +157,7 @@ async def test_single_chat_session_image_beamsearch(client: openai.AsyncOpenAI,
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded( async def test_single_chat_session_image_base64encoded(
client: openai.AsyncOpenAI, model_name: str, image_url: str, client: openai.AsyncOpenAI, model_name: str, image_url: str,
base64_encoded_image: Dict[str, str]): base64_encoded_image: dict[str, str]):
messages = [{ messages = [{
"role": "role":
...@@ -219,7 +215,7 @@ async def test_single_chat_session_image_base64encoded( ...@@ -219,7 +215,7 @@ async def test_single_chat_session_image_base64encoded(
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
async def test_single_chat_session_image_base64encoded_beamsearch( async def test_single_chat_session_image_base64encoded_beamsearch(
client: openai.AsyncOpenAI, model_name: str, image_url: str, client: openai.AsyncOpenAI, model_name: str, image_url: str,
base64_encoded_image: Dict[str, str]): base64_encoded_image: dict[str, str]):
messages = [{ messages = [{
"role": "role":
...@@ -289,7 +285,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, ...@@ -289,7 +285,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
temperature=0.0, temperature=0.0,
stream=True, stream=True,
) )
chunks: List[str] = [] chunks: list[str] = []
finish_reason_count = 0 finish_reason_count = 0
async for chunk in stream: async for chunk in stream:
delta = chunk.choices[0].delta delta = chunk.choices[0].delta
...@@ -312,7 +308,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI, ...@@ -312,7 +308,7 @@ async def test_chat_streaming_image(client: openai.AsyncOpenAI,
"image_urls", "image_urls",
[TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))]) [TEST_IMAGE_URLS[:i] for i in range(2, len(TEST_IMAGE_URLS))])
async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str, async def test_multi_image_input(client: openai.AsyncOpenAI, model_name: str,
image_urls: List[str]): image_urls: list[str]):
messages = [{ messages = [{
"role": "role":
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from typing import Dict
import os import os
import pytest import pytest
import requests import requests
...@@ -38,8 +36,6 @@ def server(): ...@@ -38,8 +36,6 @@ def server():
args = [ args = [
"--task", "--task",
"embed", "embed",
"--dtype",
"bfloat16",
"--max-model-len", "--max-model-len",
"2048", "2048",
"--max-num-seqs", "--max-num-seqs",
...@@ -57,7 +53,7 @@ def server(): ...@@ -57,7 +53,7 @@ def server():
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def base64_encoded_image() -> Dict[str, str]: def base64_encoded_image() -> dict[str, str]:
return { return {
image_url: encode_image_base64(fetch_image(image_url)) image_url: encode_image_base64(fetch_image(image_url))
for image_url in TEST_IMAGE_URLS for image_url in TEST_IMAGE_URLS
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from typing import List
from unittest.mock import MagicMock from unittest.mock import MagicMock
import pytest import pytest
...@@ -125,7 +124,7 @@ TEST_CASES = [ ...@@ -125,7 +124,7 @@ TEST_CASES = [
@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", @pytest.mark.parametrize("streaming, model_output, expected_tool_calls",
TEST_CASES) TEST_CASES)
def test_tool_call(streaming: bool, model_output: str, def test_tool_call(streaming: bool, model_output: str,
expected_tool_calls: List[FunctionCall]): expected_tool_calls: list[FunctionCall]):
mock_tokenizer = MagicMock() mock_tokenizer = MagicMock()
tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")( tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")(
mock_tokenizer) mock_tokenizer)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from typing import Iterable, List, Tuple, Union from collections.abc import Iterable
from typing import Union
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaMessage, DeltaMessage,
...@@ -12,7 +13,7 @@ from vllm.entrypoints.openai.tool_parsers import ToolParser ...@@ -12,7 +13,7 @@ from vllm.entrypoints.openai.tool_parsers import ToolParser
class StreamingToolReconstructor: class StreamingToolReconstructor:
def __init__(self, assert_one_tool_per_delta: bool = True): def __init__(self, assert_one_tool_per_delta: bool = True):
self.tool_calls: List[ToolCall] = [] self.tool_calls: list[ToolCall] = []
self.other_content: str = "" self.other_content: str = ""
self._assert_one_tool_per_delta = assert_one_tool_per_delta self._assert_one_tool_per_delta = assert_one_tool_per_delta
...@@ -72,7 +73,7 @@ def run_tool_extraction( ...@@ -72,7 +73,7 @@ def run_tool_extraction(
request: Union[ChatCompletionRequest, None] = None, request: Union[ChatCompletionRequest, None] = None,
streaming: bool = False, streaming: bool = False,
assert_one_tool_per_delta: bool = True, assert_one_tool_per_delta: bool = True,
) -> Tuple[Union[str, None], List[ToolCall]]: ) -> tuple[Union[str, None], list[ToolCall]]:
if streaming: if streaming:
reconstructor = run_tool_extraction_streaming( reconstructor = run_tool_extraction_streaming(
tool_parser, tool_parser,
...@@ -106,7 +107,7 @@ def run_tool_extraction_streaming( ...@@ -106,7 +107,7 @@ def run_tool_extraction_streaming(
reconstructor = StreamingToolReconstructor( reconstructor = StreamingToolReconstructor(
assert_one_tool_per_delta=assert_one_tool_per_delta) assert_one_tool_per_delta=assert_one_tool_per_delta)
previous_text = "" previous_text = ""
previous_tokens: List[int] = [] previous_tokens: list[int] = []
for delta in model_deltas: for delta in model_deltas:
token_delta = [ token_delta = [
tool_parser.vocab.get(token) tool_parser.vocab.get(token)
......
...@@ -5,10 +5,13 @@ from typing import Optional ...@@ -5,10 +5,13 @@ from typing import Optional
import pytest import pytest
import os import os
from packaging.version import Version
from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template, from vllm.entrypoints.chat_utils import (_resolve_hf_chat_template,
_try_extract_ast, load_chat_template,
parse_chat_messages, parse_chat_messages,
parse_chat_messages_futures, parse_chat_messages_futures,
resolve_chat_template_content_format) resolve_chat_template_content_format)
...@@ -22,11 +25,14 @@ from ..utils import VLLM_PATH ...@@ -22,11 +25,14 @@ from ..utils import VLLM_PATH
EXAMPLES_DIR = VLLM_PATH / "examples" EXAMPLES_DIR = VLLM_PATH / "examples"
PHI3V_MODEL_ID = os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct") PHI3V_MODEL_ID = os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")
ULTRAVOX_MODEL_ID = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b") ULTRAVOX_MODEL_ID = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b")
QWEN2VL_MODEL_ID = os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct") QWEN2VL_MODEL_ID = os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")
QWEN25VL_MODEL_ID = os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct")
MLLAMA_MODEL_ID = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-11B-Vision-Instruct") MLLAMA_MODEL_ID = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-11B-Vision-Instruct")
LLAMA_GUARD_MODEL_ID = os.path.join(models_path_prefix, "meta-llama/Llama-Guard-3-1B") LLAMA_GUARD_MODEL_ID = os.path.join(models_path_prefix, "meta-llama/Llama-Guard-3-1B")
HERMES_MODEL_ID = os.path.join(models_path_prefix, "NousResearch/Hermes-3-Llama-3.1-8B")
@pytest.fixture(scope="function") @pytest.fixture(scope="function")
...@@ -36,7 +42,7 @@ def phi3v_model_config(): ...@@ -36,7 +42,7 @@ def phi3v_model_config():
tokenizer=PHI3V_MODEL_ID, tokenizer=PHI3V_MODEL_ID,
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=True, trust_remote_code=True,
dtype="bfloat16", dtype="auto",
seed=0, seed=0,
limit_mm_per_prompt={ limit_mm_per_prompt={
"image": 2, "image": 2,
...@@ -60,7 +66,7 @@ def mllama_model_config(): ...@@ -60,7 +66,7 @@ def mllama_model_config():
tokenizer=MLLAMA_MODEL_ID, tokenizer=MLLAMA_MODEL_ID,
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=True, trust_remote_code=True,
dtype="bfloat16", dtype="auto",
seed=0, seed=0,
limit_mm_per_prompt={ limit_mm_per_prompt={
"image": 2, "image": 2,
...@@ -671,7 +677,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url): ...@@ -671,7 +677,7 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
tokenizer=MLLAMA_MODEL_ID, tokenizer=MLLAMA_MODEL_ID,
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=True, trust_remote_code=True,
dtype="bfloat16", dtype="auto",
seed=0, seed=0,
limit_mm_per_prompt={ limit_mm_per_prompt={
"image": 2, "image": 2,
...@@ -705,25 +711,70 @@ def test_multimodal_image_parsing_matches_hf(model, image_url): ...@@ -705,25 +711,70 @@ def test_multimodal_image_parsing_matches_hf(model, image_url):
vllm_result = apply_hf_chat_template( vllm_result = apply_hf_chat_template(
tokenizer, tokenizer,
trust_remote_code=model_config.trust_remote_code,
conversation=conversation, conversation=conversation,
chat_template=None, chat_template=None,
tools=None,
add_generation_prompt=True, add_generation_prompt=True,
) )
assert hf_result == vllm_result assert hf_result == vllm_result
@pytest.mark.parametrize(
    "model",
    [
        QWEN2VL_MODEL_ID,  # tokenizer.chat_template is of type str
        HERMES_MODEL_ID,  # tokenizer.chat_template is of type dict
    ])
@pytest.mark.parametrize("use_tools", [True, False])
def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
    """Check that the resolved chat template is always a ``str``.

    The parametrized models cover a tokenizer whose ``chat_template`` is a
    plain string and one whose ``chat_template`` is a dict; in both cases,
    with or without tools, ``_resolve_hf_chat_template`` must return a string.
    """
    # Build the tokenizer group and grab the underlying tokenizer
    tokenizer_group = TokenizerGroup(
        model,
        enable_lora=False,
        max_num_seqs=5,
        max_input_length=None,
    )
    tokenizer = tokenizer_group.tokenizer

    # A single dummy function tool, or no tools at all
    tools = [{
        "type": "function",
        "function": {
            "name": "dummy_function_name",
            "description": "This is a dummy function",
            "parameters": sample_json_schema
        }
    }] if use_tools else None

    # Test detecting the tokenizer's chat_template
    chat_template = _resolve_hf_chat_template(
        tokenizer,
        chat_template=None,
        tools=tools,
        trust_remote_code=True,
    )
    assert isinstance(chat_template, str)
# yapf: disable # yapf: disable
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model", "expected_format"), ("model", "expected_format"),
[(PHI3V_MODEL_ID, "string"), [(PHI3V_MODEL_ID, "string"),
(QWEN2VL_MODEL_ID, "openai"), (QWEN2VL_MODEL_ID, "openai"),
(QWEN25VL_MODEL_ID, "openai"),
(ULTRAVOX_MODEL_ID, "string"), (ULTRAVOX_MODEL_ID, "string"),
(MLLAMA_MODEL_ID, "openai"), (MLLAMA_MODEL_ID, "openai"),
(LLAMA_GUARD_MODEL_ID, "openai")], (LLAMA_GUARD_MODEL_ID, "openai")],
) )
# yapf: enable # yapf: enable
def test_resolve_content_format_hf_defined(model, expected_format): def test_resolve_content_format_hf_defined(model, expected_format):
if model == QWEN25VL_MODEL_ID and Version(TRANSFORMERS_VERSION) < Version(
"4.49.0"):
pytest.skip("Qwen2.5-VL requires transformers>=4.49.0")
tokenizer_group = TokenizerGroup( tokenizer_group = TokenizerGroup(
model, model,
enable_lora=False, enable_lora=False,
...@@ -732,7 +783,13 @@ def test_resolve_content_format_hf_defined(model, expected_format): ...@@ -732,7 +783,13 @@ def test_resolve_content_format_hf_defined(model, expected_format):
) )
tokenizer = tokenizer_group.tokenizer tokenizer = tokenizer_group.tokenizer
chat_template = tokenizer.chat_template # Test detecting the tokenizer's chat_template
chat_template = _resolve_hf_chat_template(
tokenizer,
chat_template=None,
tools=None,
trust_remote_code=True,
)
assert isinstance(chat_template, str) assert isinstance(chat_template, str)
print("[TEXT]") print("[TEXT]")
...@@ -742,8 +799,10 @@ def test_resolve_content_format_hf_defined(model, expected_format): ...@@ -742,8 +799,10 @@ def test_resolve_content_format_hf_defined(model, expected_format):
resolved_format = resolve_chat_template_content_format( resolved_format = resolve_chat_template_content_format(
None, # Test detecting the tokenizer's chat_template None, # Test detecting the tokenizer's chat_template
None,
"auto", "auto",
tokenizer, tokenizer,
trust_remote_code=True,
) )
assert resolved_format == expected_format assert resolved_format == expected_format
...@@ -793,8 +852,10 @@ def test_resolve_content_format_examples(template_path, expected_format): ...@@ -793,8 +852,10 @@ def test_resolve_content_format_examples(template_path, expected_format):
resolved_format = resolve_chat_template_content_format( resolved_format = resolve_chat_template_content_format(
chat_template, chat_template,
None,
"auto", "auto",
dummy_tokenizer, dummy_tokenizer,
trust_remote_code=True,
) )
assert resolved_format == expected_format assert resolved_format == expected_format
# SPDX-License-Identifier: Apache-2.0
import asyncio
import tempfile
from pathlib import Path
from ssl import SSLContext
import pytest
from vllm.entrypoints.ssl import SSLCertRefresher
class MockSSLContext(SSLContext):
    """SSLContext stand-in that only counts certificate (re)load calls.

    Both overridden loaders ignore their arguments and never read a file;
    each call simply bumps the matching counter so a test can observe how
    many times it was invoked.
    """

    def __init__(self):
        # Only the two call counters are set up here.
        self.load_ca_count = 0
        self.load_cert_chain_count = 0

    def load_cert_chain(
        self,
        certfile,
        keyfile=None,
        password=None,
    ):
        """Record one cert-chain load request; nothing is actually loaded."""
        self.load_cert_chain_count = self.load_cert_chain_count + 1

    def load_verify_locations(
        self,
        cafile=None,
        capath=None,
        cadata=None,
    ):
        """Record one CA load request; nothing is actually loaded."""
        self.load_ca_count = self.load_ca_count + 1
def create_file() -> str:
    """Create an empty temporary file under /tmp and return its path.

    The file is created with ``delete=False``, so it still exists after
    this function returns; removing it is left to the caller.
    """
    handle = tempfile.NamedTemporaryFile(dir='/tmp', delete=False)
    try:
        return handle.name
    finally:
        # Close the descriptor; the file itself stays on disk.
        handle.close()
def touch_file(path: str) -> None:
    """Touch ``path``: create it if missing, otherwise update its mtime."""
    target = Path(path)
    target.touch()
@pytest.mark.asyncio
async def test_ssl_refresher():
    """Exercise SSLCertRefresher against a call-counting mock SSL context.

    The test touches the key, cert and CA files in turn and, after a
    one-second wait each time, checks how many reloads the mock recorded.
    """
    ssl_context = MockSSLContext()
    key_path = create_file()
    cert_path = create_file()
    ca_path = create_file()
    # NOTE(review): presumably the refresher watches these three paths in the
    # background and the 1 s sleeps give it time to react -- confirm against
    # vllm.entrypoints.ssl.
    ssl_refresher = SSLCertRefresher(ssl_context, key_path, cert_path, ca_path)
    await asyncio.sleep(1)
    # Nothing has been modified yet, so no reload is expected.
    assert ssl_context.load_cert_chain_count == 0
    assert ssl_context.load_ca_count == 0

    # Touching the key file is expected to reload the cert chain only.
    touch_file(key_path)
    await asyncio.sleep(1)
    assert ssl_context.load_cert_chain_count == 1
    assert ssl_context.load_ca_count == 0

    # Touching cert and CA together is expected to yield exactly one more
    # cert-chain reload and one CA reload.
    touch_file(cert_path)
    touch_file(ca_path)
    await asyncio.sleep(1)
    assert ssl_context.load_cert_chain_count == 2
    assert ssl_context.load_ca_count == 1

    # After stop(), further modifications must leave the counters unchanged.
    ssl_refresher.stop()

    touch_file(cert_path)
    touch_file(ca_path)
    await asyncio.sleep(1)
    assert ssl_context.load_cert_chain_count == 2
    assert ssl_context.load_ca_count == 1
# SPDX-License-Identifier: Apache-2.0
from vllm import SamplingParams
from vllm.config import LoadFormat
# Model identifier handed to vllm_runner by the test in this file.
test_model = "openai-community/gpt2"

# Prompts passed to llm.generate by the test in this file.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)
def test_model_loader_download_files(vllm_runner):
    """Generate with the FASTSAFETENSORS load format and expect output.

    The model is started through the ``vllm_runner`` fixture; the test only
    checks that ``generate`` returns something truthy.
    """
    runner = vllm_runner(test_model, load_format=LoadFormat.FASTSAFETENSORS)
    with runner as llm:
        generated = llm.generate(prompts, sampling_params)
        assert generated
# SPDX-License-Identifier: Apache-2.0
import glob
import tempfile
import huggingface_hub.constants
import torch
from vllm.model_executor.model_loader.weight_utils import (
download_weights_from_hf, fastsafetensors_weights_iterator,
safetensors_weights_iterator)
def test_fastsafetensors_model_loader():
    """Compare fastsafetensors-loaded weights with the regular safetensors ones.

    Downloads the gpt2 ``*.safetensors`` files into a temporary cache
    directory, loads them through both weight iterators, and asserts that
    every tensor has the same dtype, shape and values.

    The process-wide ``huggingface_hub.constants.HF_HUB_OFFLINE`` flag is
    forced to ``False`` for the download and restored afterwards, so that
    this test does not change the offline setting seen by later tests.
    """
    # Remember the global offline flag so it can be put back in ``finally``.
    saved_offline = huggingface_hub.constants.HF_HUB_OFFLINE
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            huggingface_hub.constants.HF_HUB_OFFLINE = False
            download_weights_from_hf("openai-community/gpt2",
                                     allow_patterns=["*.safetensors"],
                                     cache_dir=tmpdir)
            safetensors = glob.glob(f"{tmpdir}/**/*.safetensors",
                                    recursive=True)
            assert len(safetensors) > 0

            # Collect name -> tensor from each loader.
            fastsafetensors_tensors = {
                name: tensor
                for name, tensor in fastsafetensors_weights_iterator(
                    safetensors, True)
            }
            hf_safetensors_tensors = {
                name: tensor
                for name, tensor in safetensors_weights_iterator(
                    safetensors, True)
            }

            assert len(fastsafetensors_tensors) == len(hf_safetensors_tensors)

            for name, fastsafetensors_tensor in fastsafetensors_tensors.items(
            ):
                # Move to CPU before comparing with the reference tensor.
                fastsafetensors_tensor = fastsafetensors_tensor.to('cpu')
                assert fastsafetensors_tensor.dtype == hf_safetensors_tensors[
                    name].dtype
                assert fastsafetensors_tensor.shape == hf_safetensors_tensors[
                    name].shape
                assert torch.all(
                    fastsafetensors_tensor.eq(hf_safetensors_tensors[name]))
    finally:
        huggingface_hub.constants.HF_HUB_OFFLINE = saved_offline
if __name__ == "__main__":
test_fastsafetensors_model_loader()
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
from typing import Optional, Tuple, Union from typing import Optional, Union
import torch import torch
...@@ -9,8 +9,7 @@ from vllm.platforms import current_platform ...@@ -9,8 +9,7 @@ from vllm.platforms import current_platform
# Using the default value (240.0) from pytorch will cause accuracy # Using the default value (240.0) from pytorch will cause accuracy
# issue on dynamic quantization models. Here use 224.0 for rocm. # issue on dynamic quantization models. Here use 224.0 for rocm.
ROCM_FP8_MAX = 224.0 ROCM_FP8_MAX = 224.0
FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm() \ FP8_DTYPE = current_platform.fp8_dtype()
else torch.float8_e4m3fn
def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor: def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
...@@ -19,7 +18,7 @@ def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor: ...@@ -19,7 +18,7 @@ def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor:
def ref_dynamic_per_token_quant(x: torch.tensor, def ref_dynamic_per_token_quant(x: torch.tensor,
quant_dtype: torch.dtype, quant_dtype: torch.dtype,
scale_ub: Optional[torch.tensor] = None) \ scale_ub: Optional[torch.tensor] = None) \
-> Tuple[torch.tensor, torch.tensor]: -> tuple[torch.tensor, torch.tensor]:
assert quant_dtype in [torch.int8, FP8_DTYPE] assert quant_dtype in [torch.int8, FP8_DTYPE]
if scale_ub is not None: if scale_ub is not None:
...@@ -68,7 +67,7 @@ def ref_dynamic_per_token_quant(x: torch.tensor, ...@@ -68,7 +67,7 @@ def ref_dynamic_per_token_quant(x: torch.tensor,
# ref_dynamic_per_token_quant, when we have a dynamic_per_tensor int8 quant # ref_dynamic_per_token_quant, when we have a dynamic_per_tensor int8 quant
# kernel # kernel
def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \
-> Tuple[torch.tensor, torch.tensor]: -> tuple[torch.tensor, torch.tensor]:
fp8_traits = torch.finfo(FP8_DTYPE) fp8_traits = torch.finfo(FP8_DTYPE)
fp8_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \ fp8_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import random import random
from typing import Type
import pytest import pytest
import torch import torch
...@@ -86,7 +85,7 @@ def test_act_and_mul( ...@@ -86,7 +85,7 @@ def test_act_and_mul(
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode() @torch.inference_mode()
def test_activation( def test_activation(
activation: Type[torch.nn.Module], activation: type[torch.nn.Module],
num_tokens: int, num_tokens: int,
d: int, d: int,
dtype: torch.dtype, dtype: torch.dtype,
......
# SPDX-License-Identifier: Apache-2.0
import pytest
import torch
from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.allspark_utils import (
ALLSPARK_AMPERE_K_ALIGN, ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
ALLSPARK_AMPERE_N_ALIGN)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
quantize_weights)
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
def is_gptq_allspark_supported(min_capability: int,
                               max_capability: int) -> bool:
    """Tell whether the current device capability lies in the given range.

    Returns ``False`` on any non-CUDA platform. On CUDA, the device
    capability (as an int via ``to_int()``) must satisfy
    ``min_capability <= capability <= max_capability``.
    """
    if current_platform.is_cuda():
        capability = current_platform.get_device_capability()
        assert capability is not None
        return min_capability <= capability.to_int() <= max_capability
    return False
# (m, n, k) factor triples for the GEMM test in this file: m is used as-is,
# while the n and k factors are multiplied by ALLSPARK_AMPERE_N_ALIGN and
# ALLSPARK_AMPERE_K_ALIGN respectively.
MNK_FACTORS = [
    (1, 4, 8),
    (13, 17, 67),
    (26, 37, 13),
    (48, 16, 24),
    (67, 13, 88),
    (257, 13, 11),
    (658, 13, 11),
    (1033, 9, 17),
]

# dtypes used for the random input and weight tensors.
DTYPES = [torch.float16, torch.bfloat16]
# Values of the has_zp flag passed to quantize_weights and the AllSpark ops.
HAS_ZP_OPTS = [False, True]
def compute_max_diff(output, output_ref):
    """Return mean(|output - output_ref|) divided by mean(|output_ref|).

    Note that, despite the name, this is a relative *mean* absolute
    difference rather than a maximum.
    """
    abs_error = torch.abs(output - output_ref)
    ref_magnitude = torch.abs(output_ref)
    return torch.mean(abs_error) / torch.mean(ref_magnitude)
def rand_data(shape, dtype=torch.float16):
    """Return a ``torch.randn`` tensor of ``shape`` and ``dtype`` on "cuda"."""
    sample = torch.randn(shape, dtype=dtype, device="cuda")
    return sample
@pytest.mark.skipif(
    not is_gptq_allspark_supported(80, 89),
    reason="AllSpark Ampere kernel is not supported on this GPU type.")
@pytest.mark.parametrize("mnk_factors", MNK_FACTORS)
@pytest.mark.parametrize("group_size", [-1])
@pytest.mark.parametrize("has_zp", HAS_ZP_OPTS)
@pytest.mark.parametrize("dtype", DTYPES)
def test_gptq_allspark_gemm_ampere(mnk_factors, group_size, has_zp, dtype):
    """Check the AllSpark w8a16 GEMM against a dense matmul reference.

    Random input and weight tensors are generated, the weight is quantized
    and repacked, both custom ops are run through ``opcheck``, and the GEMM
    result must stay within a relative mean error of 0.04 of
    ``input @ w_ref``.
    """
    # m is taken directly; n and k are scaled to the kernel's alignment.
    m, n_factor, k_factor = mnk_factors
    n = n_factor * ALLSPARK_AMPERE_N_ALIGN
    k = k_factor * ALLSPARK_AMPERE_K_ALIGN

    activations = rand_data((m, k), dtype=dtype)
    dense_weight = rand_data((k, n), dtype=dtype)

    # Quantize (and apply act_order if provided)
    w_ref, qw, s, zp = quantize_weights(dense_weight,
                                        scalar_types.uint8b128, group_size,
                                        has_zp)

    qw = qw.to(torch.uint8)
    if has_zp:
        zp = zp.to(dtype)

    device_props = torch.cuda.get_device_properties(qw.device.index)
    sm_count = device_props.multi_processor_count
    sm_version = device_props.major * 10 + device_props.minor

    # n rounded up to the next multiple of 32
    n_32align = (n + 32 - 1) // 32 * 32

    qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(
        qw, s, zp, has_zp)
    repack_args = (qw, s, zp, has_zp, qw_reorder, s_reorder, zp_reorder, k,
                   n, n_32align)
    opcheck(torch.ops._C.rearrange_kn_weight_as_n32k16_order, repack_args)

    # The same positional arguments feed both the opcheck and the real call.
    gemm_args = (activations, qw_reorder, s_reorder, zp_reorder, n,
                 group_size, sm_count, sm_version,
                 ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, has_zp, True)
    opcheck(torch.ops._C.allspark_w8a16_gemm,
            gemm_args,
            test_utils=DEFAULT_OPCHECK_TEST_UTILS)
    output = ops.allspark_w8a16_gemm(*gemm_args)

    output_ref = torch.matmul(activations, w_ref)
    torch.cuda.synchronize()
    max_diff = compute_max_diff(output, output_ref)

    assert max_diff < 0.04
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment