Commit 4d3a2c28 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.5' into v0.6.5-dev

parents 92ec5d8e 2d1b9baa
...@@ -5,14 +5,18 @@ import openai ...@@ -5,14 +5,18 @@ import openai
import pytest import pytest
import os import os
import pytest_asyncio import pytest_asyncio
import requests
from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer, models_path_prefix from ...utils import RemoteOpenAIServer, models_path_prefix
EMBEDDING_MODEL_NAME = os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct") MODEL_NAME = os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def embedding_server(): def server():
args = [ args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
"--dtype", "--dtype",
...@@ -20,31 +24,29 @@ def embedding_server(): ...@@ -20,31 +24,29 @@ def embedding_server():
"--enforce-eager", "--enforce-eager",
"--max-model-len", "--max-model-len",
"8192", "8192",
"--chat-template",
DUMMY_CHAT_TEMPLATE,
] ]
with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server
@pytest_asyncio.fixture @pytest_asyncio.fixture
async def embedding_client(embedding_server): async def client(server):
async with embedding_server.get_async_client() as async_client: async with server.get_async_client() as async_client:
yield async_client yield async_client
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize("model_name", [MODEL_NAME])
"model_name", async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
[EMBEDDING_MODEL_NAME],
)
async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
model_name: str):
input_texts = [ input_texts = [
"The chef prepared a delicious meal.", "The chef prepared a delicious meal.",
] ]
# test single embedding # test single embedding
embeddings = await embedding_client.embeddings.create( embeddings = await client.embeddings.create(
model=model_name, model=model_name,
input=input_texts, input=input_texts,
encoding_format="float", encoding_format="float",
...@@ -58,7 +60,7 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI, ...@@ -58,7 +60,7 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
# test using token IDs # test using token IDs
input_tokens = [1, 1, 1, 1, 1] input_tokens = [1, 1, 1, 1, 1]
embeddings = await embedding_client.embeddings.create( embeddings = await client.embeddings.create(
model=model_name, model=model_name,
input=input_tokens, input=input_tokens,
encoding_format="float", encoding_format="float",
...@@ -72,18 +74,14 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI, ...@@ -72,18 +74,14 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize("model_name", [MODEL_NAME])
"model_name", async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
[EMBEDDING_MODEL_NAME],
)
async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
model_name: str):
# test List[str] # test List[str]
input_texts = [ input_texts = [
"The cat sat on the mat.", "A feline was resting on a rug.", "The cat sat on the mat.", "A feline was resting on a rug.",
"Stars twinkle brightly in the night sky." "Stars twinkle brightly in the night sky."
] ]
embeddings = await embedding_client.embeddings.create( embeddings = await client.embeddings.create(
model=model_name, model=model_name,
input=input_texts, input=input_texts,
encoding_format="float", encoding_format="float",
...@@ -91,11 +89,14 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, ...@@ -91,11 +89,14 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
assert embeddings.id is not None assert embeddings.id is not None
assert len(embeddings.data) == 3 assert len(embeddings.data) == 3
assert len(embeddings.data[0].embedding) == 4096 assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 32
assert embeddings.usage.total_tokens == 32
# test List[List[int]] # test List[List[int]]
input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24], input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
[25, 32, 64, 77]] [25, 32, 64, 77]]
embeddings = await embedding_client.embeddings.create( embeddings = await client.embeddings.create(
model=model_name, model=model_name,
input=input_tokens, input=input_tokens,
encoding_format="float", encoding_format="float",
...@@ -109,22 +110,70 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI, ...@@ -109,22 +110,70 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize( @pytest.mark.parametrize("model_name", [MODEL_NAME])
"model_name", async def test_conversation_embedding(server: RemoteOpenAIServer,
[EMBEDDING_MODEL_NAME], client: openai.AsyncOpenAI,
) model_name: str):
async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, messages = [{
"role": "user",
"content": "The cat sat on the mat.",
}, {
"role": "assistant",
"content": "A feline was resting on a rug.",
}, {
"role": "user",
"content": "Stars twinkle brightly in the night sky.",
}]
chat_response = requests.post(server.url_for("v1/embeddings"),
json={
"model": model_name,
"messages": messages,
"encoding_format": "float",
})
chat_response.raise_for_status()
chat_embeddings = chat_response.json()
tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
prompt = tokenizer.apply_chat_template(
messages,
chat_template=DUMMY_CHAT_TEMPLATE,
add_generation_prompt=True,
continue_final_message=False,
tokenize=False,
)
completion_response = await client.embeddings.create(
model=model_name,
input=prompt,
encoding_format="float",
# To be consistent with chat
extra_body={"add_special_tokens": False},
)
completion_embeddings = completion_response.model_dump(mode="json")
assert chat_embeddings.pop("id") is not None
assert completion_embeddings.pop("id") is not None
assert chat_embeddings.pop("created") <= completion_embeddings.pop(
"created")
assert chat_embeddings == completion_embeddings
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
model_name: str): model_name: str):
input_texts = [ input_texts = [
"Hello my name is", "Hello my name is",
"The best thing about vLLM is that it supports many different models" "The best thing about vLLM is that it supports many different models"
] ]
responses_float = await embedding_client.embeddings.create( responses_float = await client.embeddings.create(input=input_texts,
input=input_texts, model=model_name, encoding_format="float") model=model_name,
encoding_format="float")
responses_base64 = await embedding_client.embeddings.create( responses_base64 = await client.embeddings.create(input=input_texts,
input=input_texts, model=model_name, encoding_format="base64") model=model_name,
encoding_format="base64")
decoded_responses_base64_data = [] decoded_responses_base64_data = []
for data in responses_base64.data: for data in responses_base64.data:
...@@ -138,10 +187,65 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI, ...@@ -138,10 +187,65 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
1] 1]
# Default response is float32 decoded from base64 by OpenAI Client # Default response is float32 decoded from base64 by OpenAI Client
responses_default = await embedding_client.embeddings.create( responses_default = await client.embeddings.create(input=input_texts,
input=input_texts, model=model_name) model=model_name)
assert responses_float.data[0].embedding == responses_default.data[ assert responses_float.data[0].embedding == responses_default.data[
0].embedding 0].embedding
assert responses_float.data[1].embedding == responses_default.data[ assert responses_float.data[1].embedding == responses_default.data[
1].embedding 1].embedding
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
                                           model_name: str):
    """Verify ``truncate_prompt_tokens`` for both text and token-ID input.

    Each request asks for truncation to 10 tokens and the reported usage
    must reflect exactly that many prompt tokens.
    """

    def check_truncated(response) -> None:
        # Shared expectations for a single, truncated embedding response.
        assert response.id is not None
        assert len(response.data) == 1
        assert len(response.data[0].embedding) == 4096
        assert response.usage.completion_tokens == 0
        assert response.usage.prompt_tokens == 10
        assert response.usage.total_tokens == 10

    # Case 1: a single text prompt.
    text_prompt = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]
    text_response = await client.embeddings.create(
        model=model_name,
        input=text_prompt,
        extra_body={"truncate_prompt_tokens": 10},
    )
    check_truncated(text_response)

    # Case 2: a pre-tokenized prompt (21 token IDs, longer than the limit).
    token_prompt = [
        1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
        9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
    ]
    token_response = await client.embeddings.create(
        model=model_name,
        input=token_prompt,
        extra_body={"truncate_prompt_tokens": 10},
    )
    check_truncated(token_response)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
                                                   model_name: str):
    """A ``truncate_prompt_tokens`` above the model length must be rejected.

    The request asks for truncation to 8193 tokens, one more than the
    ``--max-model-len`` of 8192 passed to the server in this module, and is
    expected to fail with ``openai.BadRequestError``.
    """
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    with pytest.raises(openai.BadRequestError) as exc_info:
        await client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193})

    # The call above raises, so nothing placed after it inside the ``with``
    # block would ever execute. Inspect the captured exception instead to
    # make sure the rejection is for the expected reason.
    assert ("truncate_prompt_tokens value is greater than max_model_len"
            in str(exc_info.value))
...@@ -71,19 +71,21 @@ EXPECTED_VALUES = { ...@@ -71,19 +71,21 @@ EXPECTED_VALUES = {
[("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST), [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)], ("_count", _NUM_REQUESTS)],
"vllm:request_params_n": [("_count", _NUM_REQUESTS)], "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
"vllm:request_params_best_of": [("_count", _NUM_REQUESTS)], "vllm:request_params_max_tokens":
[("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:prompt_tokens": [("_total", "vllm:prompt_tokens": [("_total",
_NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
"vllm:generation_tokens": "vllm:generation_tokens": [
[("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)], ("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
],
"vllm:request_success": [("_total", _NUM_REQUESTS)], "vllm:request_success": [("_total", _NUM_REQUESTS)],
} }
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_metrics_counts(client: openai.AsyncOpenAI): async def test_metrics_counts(server: RemoteOpenAIServer,
base_url = str(client.base_url)[:-3].strip("/") client: openai.AsyncClient):
for _ in range(_NUM_REQUESTS): for _ in range(_NUM_REQUESTS):
# sending a request triggers the metrics to be logged. # sending a request triggers the metrics to be logged.
await client.completions.create( await client.completions.create(
...@@ -91,7 +93,7 @@ async def test_metrics_counts(client: openai.AsyncOpenAI): ...@@ -91,7 +93,7 @@ async def test_metrics_counts(client: openai.AsyncOpenAI):
prompt=_TOKENIZED_PROMPT, prompt=_TOKENIZED_PROMPT,
max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST) max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST)
response = requests.get(base_url + "/metrics") response = requests.get(server.url_for("metrics"))
print(response.text) print(response.text)
assert response.status_code == HTTPStatus.OK assert response.status_code == HTTPStatus.OK
...@@ -152,9 +154,9 @@ EXPECTED_METRICS = [ ...@@ -152,9 +154,9 @@ EXPECTED_METRICS = [
"vllm:request_params_n_sum", "vllm:request_params_n_sum",
"vllm:request_params_n_bucket", "vllm:request_params_n_bucket",
"vllm:request_params_n_count", "vllm:request_params_n_count",
"vllm:request_params_best_of_sum", "vllm:request_params_max_tokens_sum",
"vllm:request_params_best_of_bucket", "vllm:request_params_max_tokens_bucket",
"vllm:request_params_best_of_count", "vllm:request_params_max_tokens_count",
"vllm:num_preemptions_total", "vllm:num_preemptions_total",
"vllm:prompt_tokens_total", "vllm:prompt_tokens_total",
"vllm:generation_tokens_total", "vllm:generation_tokens_total",
...@@ -175,16 +177,15 @@ EXPECTED_METRICS = [ ...@@ -175,16 +177,15 @@ EXPECTED_METRICS = [
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_metrics_exist(client: openai.AsyncOpenAI): async def test_metrics_exist(server: RemoteOpenAIServer,
base_url = str(client.base_url)[:-3].strip("/") client: openai.AsyncClient):
# sending a request triggers the metrics to be logged. # sending a request triggers the metrics to be logged.
await client.completions.create(model=MODEL_NAME, await client.completions.create(model=MODEL_NAME,
prompt="Hello, my name is", prompt="Hello, my name is",
max_tokens=5, max_tokens=5,
temperature=0.0) temperature=0.0)
response = requests.get(base_url + "/metrics") response = requests.get(server.url_for("metrics"))
assert response.status_code == HTTPStatus.OK assert response.status_code == HTTPStatus.OK
for metric in EXPECTED_METRICS: for metric in EXPECTED_METRICS:
......
...@@ -21,3 +21,38 @@ async def test_empty_prompt(): ...@@ -21,3 +21,38 @@ async def test_empty_prompt():
prompt="", prompt="",
max_tokens=5, max_tokens=5,
temperature=0.0) temperature=0.0)
@pytest.mark.asyncio
async def test_out_of_vocab_token_ids():
    """A prompt containing an out-of-vocabulary token ID must be rejected.

    Starts a dedicated server, sends a completion request whose prompt is
    the single token ID 999999 and expects ``openai.BadRequestError`` with a
    message mentioning "out of vocabulary".
    """
    model_name = "gpt2"
    server_args = ["--enforce-eager"]
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        # Use the client as an async context manager so its HTTP resources
        # are released when the test finishes.
        async with remote_server.get_async_client() as client:
            with pytest.raises(openai.BadRequestError,
                               match=re.compile('.*out of vocabulary.*')):
                await client.completions.create(model=model_name,
                                                prompt=[999999],
                                                max_tokens=5,
                                                temperature=0.0)
@pytest.mark.asyncio
async def test_reject_multistep_with_guided_decoding():
    """Guided decoding combined with multi-step scheduling must be rejected.

    The server is started with ``--num-scheduler-steps 8`` and the request
    asks for a ``json_object`` response format; the expected outcome is an
    ``openai.BadRequestError`` whose message matches
    "Guided decoding ... multi-step decoding".
    """
    model_name = "gpt2"
    server_args = ["--enforce-eager", "--num-scheduler-steps", "8"]
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        # Use the client as an async context manager so its HTTP resources
        # are released when the test finishes.
        async with remote_server.get_async_client() as client:
            with pytest.raises(openai.BadRequestError,
                               match=re.compile(
                                   '.*Guided decoding .* multi-step decoding.*')):
                await client.completions.create(
                    model=model_name,
                    prompt="Hello",
                    max_tokens=5,
                    temperature=0.0,
                    extra_body={"response_format": {
                        "type": "json_object"
                    }})
import contextlib
import os
from typing import Any, List, NamedTuple
import openai # use the official client for correctness check
import pytest
from ...utils import RemoteOpenAIServer, models_path_prefix
# any model with a chat template should work here
MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
API_KEY = "abc-123"
ERROR_API_KEY = "abc"
ROOT_PATH = "llm"
@pytest.fixture(scope="module")
def server():
    """Start one server per module behind a root path and an API key."""
    # The child process inherits the current environment plus the API key.
    server_env = os.environ.copy()
    server_env["VLLM_API_KEY"] = API_KEY

    cli_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype", "float16",
        "--enforce-eager",
        "--max-model-len", "4080",
        # use --root-path=/llm for testing
        "--root-path", "/" + ROOT_PATH,
        "--chat-template", DUMMY_CHAT_TEMPLATE,
    ]

    with RemoteOpenAIServer(MODEL_NAME, cli_args,
                            env_dict=server_env) as running_server:
        yield running_server
class TestCase(NamedTuple):
    """One parametrized scenario for the root-path / API-key test."""

    # The class name matches pytest's default ``Test*`` collection pattern;
    # opt out explicitly so pytest does not try (and warn about failing) to
    # collect this data holder as a test class. Unannotated, so it is a plain
    # class attribute and not a tuple field.
    __test__ = False

    # Model name sent in the request.
    model_name: str
    # URL path segments that are joined to form the client's base URL.
    base_url: List[str]
    # API key the client authenticates with.
    api_key: str
    # Exception type the request is expected to raise, or None on success.
    expected_error: Any
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "test_case",
    [
        # http://localhost:8000/v1 with a wrong key
        TestCase(model_name=MODEL_NAME,
                 base_url=["v1"],
                 api_key=ERROR_API_KEY,
                 expected_error=openai.AuthenticationError),
        # http://localhost:8000/llm/v1 with a wrong key
        TestCase(model_name=MODEL_NAME,
                 base_url=[ROOT_PATH, "v1"],
                 api_key=ERROR_API_KEY,
                 expected_error=openai.AuthenticationError),
        # http://localhost:8000/v1 with the correct key
        TestCase(model_name=MODEL_NAME,
                 base_url=["v1"],
                 api_key=API_KEY,
                 expected_error=None),
        # http://localhost:8000/llm/v1 with the correct key
        TestCase(model_name=MODEL_NAME,
                 base_url=[ROOT_PATH, "v1"],
                 api_key=API_KEY,
                 expected_error=None),
    ],
)
async def test_chat_session_root_path_with_api_key(server: RemoteOpenAIServer,
                                                   test_case: TestCase):
    """Chat through both URL prefixes, with a valid and an invalid API key.

    When the case names an expected error, the whole exchange must raise it;
    otherwise the completion is checked for a single, non-empty assistant
    reply that finished with "stop".
    """
    saying: str = "Here is a common saying about apple. An apple a day, keeps"

    if test_case.expected_error is None:
        expectation = contextlib.nullcontext()
    else:
        expectation = pytest.raises(test_case.expected_error)

    conversation = [
        {"role": "user", "content": "tell me a common saying"},
        {"role": "assistant", "content": saying},
    ]
    # Ask the server to continue the assistant's unfinished message.
    continuation_options = {
        "continue_final_message": True,
        "add_generation_prompt": False
    }

    with expectation:
        client = openai.AsyncOpenAI(
            api_key=test_case.api_key,
            base_url=server.url_for(*test_case.base_url),
            max_retries=0)
        chat_completion = await client.chat.completions.create(
            model=test_case.model_name,
            messages=conversation,
            extra_body=continuation_options)

        assert chat_completion.id is not None
        assert len(chat_completion.choices) == 1

        only_choice = chat_completion.choices[0]
        assert only_choice.finish_reason == "stop"

        reply = only_choice.message
        assert len(reply.content) > 0
        assert reply.role == "assistant"
import os
import pytest
import requests
from vllm.entrypoints.openai.protocol import ScoreResponse
from ...utils import RemoteOpenAIServer, models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3")
@pytest.fixture(scope="module")
def server():
    """Start one scoring-model server shared by every test in the module."""
    with RemoteOpenAIServer(MODEL_NAME, ["--enforce-eager"]) as running_server:
        yield running_server
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_text_1_str_text_2_list(server: RemoteOpenAIServer,
                                      model_name: str):
    """Score one query string against a list of two candidate passages."""
    payload = {
        "model": model_name,
        "text_1": "What is the capital of France?",
        "text_2": [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris.",
        ],
    }

    http_response = requests.post(server.url_for("score"), json=payload)
    http_response.raise_for_status()
    result = ScoreResponse.model_validate(http_response.json())

    assert result.id is not None
    assert result.data is not None
    assert len(result.data) == 2
    # The unrelated passage scores low, the matching passage scores high.
    unrelated, matching = result.data[0], result.data[1]
    assert unrelated.score <= 0.01
    assert matching.score >= 0.9
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_text_1_list_text_2_list(server: RemoteOpenAIServer,
                                       model_name: str):
    """Score two query strings against two passages given as parallel lists."""
    payload = {
        "model": model_name,
        "text_1": [
            "What is the capital of the United States?",
            "What is the capital of France?",
        ],
        "text_2": [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris.",
        ],
    }

    http_response = requests.post(server.url_for("score"), json=payload)
    http_response.raise_for_status()
    result = ScoreResponse.model_validate(http_response.json())

    assert result.id is not None
    assert result.data is not None
    assert len(result.data) == 2
    # First entry is expected to score low, second entry high.
    first, second = result.data[0], result.data[1]
    assert first.score <= 0.01
    assert second.score >= 0.9
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_text_1_str_text_2_str(server: RemoteOpenAIServer,
                                     model_name: str):
    """Score a single query string against a single matching passage."""
    payload = {
        "model": model_name,
        "text_1": "What is the capital of France?",
        "text_2": "The capital of France is Paris.",
    }

    http_response = requests.post(server.url_for("score"), json=payload)
    http_response.raise_for_status()
    result = ScoreResponse.model_validate(http_response.json())

    assert result.id is not None
    assert result.data is not None
    assert len(result.data) == 1
    assert result.data[0].score >= 0.9
...@@ -16,15 +16,22 @@ CHAT_TEMPLATE = "Dummy chat template for testing {}" ...@@ -16,15 +16,22 @@ CHAT_TEMPLATE = "Dummy chat template for testing {}"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)] BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
@dataclass
class MockHFConfig:
    """Minimal stand-in for a HF config object; only carries ``model_type``."""

    model_type: str = "any"
@dataclass @dataclass
class MockModelConfig: class MockModelConfig:
task = "generate"
tokenizer = MODEL_NAME tokenizer = MODEL_NAME
trust_remote_code = False trust_remote_code = False
tokenizer_mode = "auto" tokenizer_mode = "auto"
max_model_len = 100 max_model_len = 100
tokenizer_revision = None tokenizer_revision = None
embedding_mode = False
multimodal_config = MultiModalConfig() multimodal_config = MultiModalConfig()
hf_config = MockHFConfig()
logits_processor_pattern = None
@dataclass @dataclass
...@@ -43,6 +50,7 @@ async def _async_serving_chat_init(): ...@@ -43,6 +50,7 @@ async def _async_serving_chat_init():
BASE_MODEL_PATHS, BASE_MODEL_PATHS,
response_role="assistant", response_role="assistant",
chat_template=CHAT_TEMPLATE, chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
lora_modules=None, lora_modules=None,
prompt_adapters=None, prompt_adapters=None,
request_logger=None) request_logger=None)
...@@ -64,6 +72,7 @@ def test_serving_chat_should_set_correct_max_tokens(): ...@@ -64,6 +72,7 @@ def test_serving_chat_should_set_correct_max_tokens():
BASE_MODEL_PATHS, BASE_MODEL_PATHS,
response_role="assistant", response_role="assistant",
chat_template=CHAT_TEMPLATE, chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
lora_modules=None, lora_modules=None,
prompt_adapters=None, prompt_adapters=None,
request_logger=None) request_logger=None)
......
...@@ -10,6 +10,8 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, ...@@ -10,6 +10,8 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse,
LoadLoraAdapterRequest, LoadLoraAdapterRequest,
UnloadLoraAdapterRequest) UnloadLoraAdapterRequest)
from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
from vllm.lora.request import LoRARequest
from ...utils import models_path_prefix from ...utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b") MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b")
...@@ -35,6 +37,16 @@ async def _async_serving_engine_init(): ...@@ -35,6 +37,16 @@ async def _async_serving_engine_init():
return serving_engine return serving_engine
@pytest.mark.asyncio
async def test_serving_model_name():
    """``_get_model_name`` returns the base model or the LoRA adapter name."""
    engine = await _async_serving_engine_init()

    # Without a LoRA request the base model name is reported.
    assert engine._get_model_name(None) == MODEL_NAME

    # With a LoRA request the adapter's own name is reported.
    lora_request = LoRARequest(
        lora_name="adapter",
        lora_path="/path/to/adapter2",
        lora_int_id=1,
    )
    assert engine._get_model_name(lora_request) == lora_request.lora_name
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_load_lora_adapter_success(): async def test_load_lora_adapter_success():
serving_engine = await _async_serving_engine_init() serving_engine = await _async_serving_engine_init()
......
...@@ -6,7 +6,7 @@ import pytest ...@@ -6,7 +6,7 @@ import pytest
from ...utils import RemoteOpenAIServer, models_path_prefix from ...utils import RemoteOpenAIServer, models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta") MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
@pytest.mark.asyncio @pytest.mark.asyncio
......
import openai # use the official client for correctness check
import pytest import pytest
import os import os
import pytest_asyncio import pytest_asyncio
...@@ -57,9 +56,11 @@ async def client(server): ...@@ -57,9 +56,11 @@ async def client(server):
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
indirect=["tokenizer_name"], indirect=["tokenizer_name"],
) )
async def test_tokenize_completions(client: openai.AsyncOpenAI, async def test_tokenize_completions(
model_name: str, tokenizer_name: str): server: RemoteOpenAIServer,
base_url = str(client.base_url)[:-3].strip("/") model_name: str,
tokenizer_name: str,
):
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
tokenizer_mode="fast") tokenizer_mode="fast")
...@@ -67,7 +68,7 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI, ...@@ -67,7 +68,7 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI,
prompt = "vllm1 This is a test prompt." prompt = "vllm1 This is a test prompt."
tokens = tokenizer.encode(prompt, add_special_tokens=add_special) tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
response = requests.post(base_url + "/tokenize", response = requests.post(server.url_for("tokenize"),
json={ json={
"add_special_tokens": add_special, "add_special_tokens": add_special,
"model": model_name, "model": model_name,
...@@ -88,9 +89,11 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI, ...@@ -88,9 +89,11 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI,
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
indirect=["tokenizer_name"], indirect=["tokenizer_name"],
) )
async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, async def test_tokenize_chat(
tokenizer_name: str): server: RemoteOpenAIServer,
base_url = str(client.base_url)[:-3].strip("/") model_name: str,
tokenizer_name: str,
):
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
tokenizer_mode="fast") tokenizer_mode="fast")
...@@ -106,28 +109,40 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, ...@@ -106,28 +109,40 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
"role": "user", "role": "user",
"content": "Can I ask a question? vllm1" "content": "Can I ask a question? vllm1"
}] }]
for continue_final in [False, True]:
prompt = tokenizer.apply_chat_template( if add_generation and continue_final:
add_generation_prompt=add_generation, continue
conversation=conversation, if continue_final:
tokenize=False) conversation.append({
tokens = tokenizer.encode(prompt, add_special_tokens=add_special) "role": "assistant",
"content": "Sure,"
response = requests.post(base_url + "/tokenize", })
json={
"add_generation_prompt": prompt = tokenizer.apply_chat_template(
add_generation, add_generation_prompt=add_generation,
"add_special_tokens": add_special, continue_final_message=continue_final,
"messages": conversation, conversation=conversation,
"model": model_name tokenize=False)
}) tokens = tokenizer.encode(prompt,
response.raise_for_status() add_special_tokens=add_special)
assert response.json() == { response = requests.post(server.url_for("tokenize"),
"tokens": tokens, json={
"count": len(tokens), "add_generation_prompt":
"max_model_len": 8192 add_generation,
} "continue_final_message":
continue_final,
"add_special_tokens": add_special,
"messages": conversation,
"model": model_name
})
response.raise_for_status()
assert response.json() == {
"tokens": tokens,
"count": len(tokens),
"max_model_len": 8192
}
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -136,17 +151,18 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str, ...@@ -136,17 +151,18 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
indirect=["tokenizer_name"], indirect=["tokenizer_name"],
) )
async def test_detokenize(client: openai.AsyncOpenAI, model_name: str, async def test_detokenize(
tokenizer_name: str): server: RemoteOpenAIServer,
base_url = str(client.base_url)[:-3].strip("/") model_name: str,
tokenizer_name: str,
):
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
tokenizer_mode="fast") tokenizer_mode="fast")
prompt = "This is a test prompt. vllm1" prompt = "This is a test prompt. vllm1"
tokens = tokenizer.encode(prompt, add_special_tokens=False) tokens = tokenizer.encode(prompt, add_special_tokens=False)
print(f"CALLING {base_url} FOR {model_name}") response = requests.post(server.url_for("detokenize"),
response = requests.post(base_url + "/detokenize",
json={ json={
"model": model_name, "model": model_name,
"tokens": tokens "tokens": tokens
......
from typing import Dict, List
import os
import openai
import pytest
import pytest_asyncio
from vllm.multimodal.utils import encode_video_base64, fetch_video
from ...utils import RemoteOpenAIServer, models_path_prefix, urls_port
MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
# Upper bound on videos per prompt; passed to the server below as
# ``--limit-mm-per-prompt video=<MAXIMUM_VIDEOS>``.
MAXIMUM_VIDEOS = 4

# Public sample-video URLs, kept for reference only. The active list below
# requests the same file names from localhost instead.
# TEST_VIDEO_URLS = [
#     "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4",
#     "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ElephantsDream.mp4",
#     "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerBlazes.mp4",
#     "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4",
# ]

# Videos fetched from a local HTTP endpoint on ``urls_port``.
# NOTE(review): presumably a local mirror of the sample videos must be
# serving on that port before these tests run - confirm against the CI setup.
TEST_VIDEO_URLS = [
    f"http://localhost:{urls_port}/BigBuckBunny.mp4",
    f"http://localhost:{urls_port}/ElephantsDream.mp4",
    f"http://localhost:{urls_port}/ForBiggerBlazes.mp4",
    f"http://localhost:{urls_port}/ForBiggerFun.mp4",
]
@pytest.fixture(scope="module")
def server():
    """Start one video-capable chat server shared by the whole module."""
    cli_args = [
        "--task", "generate",
        "--dtype", "bfloat16",
        "--max-model-len", "32768",
        "--max-num-seqs", "2",
        "--enforce-eager",
        "--trust-remote-code",
        "--limit-mm-per-prompt", f"video={MAXIMUM_VIDEOS}",
    ]

    with RemoteOpenAIServer(MODEL_NAME, cli_args) as running_server:
        yield running_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async OpenAI client for ``server``, closed after each test."""
    async with server.get_async_client() as openai_client:
        yield openai_client
@pytest.fixture(scope="session")
def base64_encoded_video() -> Dict[str, str]:
    """Map every test video URL to its base64 encoding, built once per session."""
    encoded_by_url: Dict[str, str] = {}
    for url in TEST_VIDEO_URLS:
        encoded_by_url[url] = encode_video_base64(fetch_video(url))
    return encoded_by_url
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video(client: openai.AsyncOpenAI,
                                         model_name: str, video_url: str):
    """Run a two-turn chat about a video referenced by URL."""
    video_part = {"type": "video_url", "video_url": {"url": video_url}}
    question_part = {"type": "text", "text": "What's in this video?"}
    messages = [{"role": "user", "content": [video_part, question_part]}]

    # test single completion
    first_turn = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
    )
    assert len(first_turn.choices) == 1

    choice = first_turn.choices[0]
    assert choice.finish_reason == "length"
    assert first_turn.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=6299, total_tokens=6309)

    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"

    # test multi-turn dialogue
    messages.append({"role": "assistant", "content": message.content})
    messages.append({"role": "user", "content": "express your result in json"})
    second_turn = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    reply = second_turn.choices[0].message
    # NOTE(review): ``len(...) >= 0`` is always true, so this effectively only
    # checks that the content is not None.
    assert reply.content is not None and len(reply.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_beamsearch(client: openai.AsyncOpenAI,
                                                    model_name: str,
                                                    video_url: str):
    """Beam search with ``n=2`` must return two different completions."""
    video_part = {"type": "video_url", "video_url": {"url": video_url}}
    question_part = {"type": "text", "text": "What's in this video?"}
    messages = [{"role": "user", "content": [video_part, question_part]}]

    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        n=2,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
        extra_body={"use_beam_search": True},
    )

    assert len(chat_completion.choices) == 2
    first_text = chat_completion.choices[0].message.content
    second_text = chat_completion.choices[1].message.content
    assert first_text != second_text
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded(
        client: openai.AsyncOpenAI, model_name: str, video_url: str,
        base64_encoded_video: Dict[str, str]):
    """Run a two-turn chat where the video is sent inline as a data URL.

    ``base64_encoded_video`` is indexed by ``video_url`` and the looked-up
    value is embedded after ``base64,`` in the data URL. The first request
    asks for at most 10 completion tokens with logprobs and checks the
    single choice, finish reason, reported usage and assistant message; a
    follow-up user turn then exercises multi-turn dialogue.
    """
    messages = [{
        "role":
        "user",
        "content": [
            {
                "type": "video_url",
                "video_url": {
                    "url":
                    f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
                }
            },
            {
                "type": "text",
                "text": "What's in this video?"
            },
        ],
    }]

    # test single completion
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5)
    assert len(chat_completion.choices) == 1

    choice = chat_completion.choices[0]
    # The test expects the 10-token cap to be what ends generation.
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=6299, total_tokens=6309)

    # `choice` is already choices[0]; a second lookup would be a dead store.
    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"
    messages.append({"role": "assistant", "content": message.content})

    # test multi-turn dialogue
    messages.append({"role": "user", "content": "express your result in json"})
    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = chat_completion.choices[0].message
    # NOTE(review): `len(...) >= 0` is always true, so only the None check
    # is effective here -- confirm whether `> 0` was intended.
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded_beamsearch(
        client: openai.AsyncOpenAI, model_name: str, video_url: str,
        base64_encoded_video: Dict[str, str]):
    """Ask for two beam-search candidates on an inline (data URL) video.

    The video payload is looked up in ``base64_encoded_video`` by
    ``video_url``. Checks that exactly two choices come back and that their
    message contents are not equal.
    """
    encoded_payload = base64_encoded_video[video_url]
    video_part = {
        "type": "video_url",
        "video_url": {
            "url": f"data:video/jpeg;base64,{encoded_payload}"
        },
    }
    text_part = {"type": "text", "text": "What's in this video?"}
    request_messages = [{"role": "user", "content": [video_part, text_part]}]

    response = await client.chat.completions.create(
        model=model_name,
        messages=request_messages,
        n=2,
        max_completion_tokens=10,
        # `use_beam_search` is passed through `extra_body` rather than as a
        # regular keyword argument of `create`.
        extra_body={"use_beam_search": True},
    )

    candidates = response.choices
    assert len(candidates) == 2
    first, second = candidates
    assert first.message.content != second.message.content
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_chat_streaming_video(client: openai.AsyncOpenAI,
                                    model_name: str, video_url: str):
    """Compare a streamed chat completion with its non-streamed twin.

    The same video prompt is sent twice with ``temperature=0.0``: once as a
    regular request and once with ``stream=True``. The concatenated streamed
    deltas must equal the non-streamed content, and exactly one chunk may
    carry a finish reason, which must match the non-streamed one.
    """
    video_part = {"type": "video_url", "video_url": {"url": video_url}}
    text_part = {"type": "text", "text": "What's in this video?"}
    messages = [{"role": "user", "content": [video_part, text_part]}]

    # Arguments shared by the non-streaming and the streaming request.
    common_kwargs = dict(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        temperature=0.0,
    )

    # test single completion
    chat_completion = await client.chat.completions.create(**common_kwargs)
    reference_choice = chat_completion.choices[0]
    output = reference_choice.message.content
    stop_reason = reference_choice.finish_reason

    # test streaming
    stream = await client.chat.completions.create(**common_kwargs,
                                                  stream=True)
    pieces: List[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        streamed_choice = chunk.choices[0]
        delta = streamed_choice.delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            pieces.append(delta.content)
        if streamed_choice.finish_reason is not None:
            finish_reason_count += 1

    # finish reason should only return in last block
    assert finish_reason_count == 1
    # `streamed_choice` and `delta` still refer to the final chunk here.
    assert streamed_choice.finish_reason == stop_reason
    assert delta.content
    assert "".join(pieces) == output
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "video_urls",
    [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))])
async def test_multi_video_input(client: openai.AsyncOpenAI, model_name: str,
                                 video_urls: List[str]):
    """Send several videos in one user message.

    When the number of videos is within ``MAXIMUM_VIDEOS`` the request is
    expected to produce a message. Otherwise the chat request must raise
    ``openai.BadRequestError`` and a subsequent plain completion request
    must still be answered.
    """
    user_content = [{
        "type": "video_url",
        "video_url": {
            "url": url
        }
    } for url in video_urls]
    user_content.append({"type": "text", "text": "What's in this video?"})
    messages = [{"role": "user", "content": user_content}]

    if len(video_urls) <= MAXIMUM_VIDEOS:
        chat_completion = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
        reply = chat_completion.choices[0].message
        assert reply.content is not None and len(reply.content) >= 0
        return

    with pytest.raises(openai.BadRequestError):  # test multi-video input
        await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )

    # the server should still work afterwards
    completion = await client.completions.create(
        model=model_name,
        prompt=[0, 0, 0, 0, 0],
        max_tokens=5,
        temperature=0.0,
    )
    completion_text = completion.choices[0].text
    assert completion_text is not None and len(completion_text) >= 0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment