"vscode:/vscode.git/clone" did not exist on "df3c0291a3d6aceb0e1393ab0bdbd16dec9f2081"
Commit 4d3a2c28 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.5' into v0.6.5-dev

parents 92ec5d8e 2d1b9baa
......@@ -5,14 +5,18 @@ import openai
import pytest
import os
import pytest_asyncio
import requests
from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer, models_path_prefix
EMBEDDING_MODEL_NAME = os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")
MODEL_NAME = os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
@pytest.fixture(scope="module")
def embedding_server():
def server():
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
......@@ -20,31 +24,29 @@ def embedding_server():
"--enforce-eager",
"--max-model-len",
"8192",
"--chat-template",
DUMMY_CHAT_TEMPLATE,
]
with RemoteOpenAIServer(EMBEDDING_MODEL_NAME, args) as remote_server:
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def embedding_client(embedding_server):
async with embedding_server.get_async_client() as async_client:
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[EMBEDDING_MODEL_NAME],
)
async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
model_name: str):
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
input_texts = [
"The chef prepared a delicious meal.",
]
# test single embedding
embeddings = await embedding_client.embeddings.create(
embeddings = await client.embeddings.create(
model=model_name,
input=input_texts,
encoding_format="float",
......@@ -58,7 +60,7 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
# test using token IDs
input_tokens = [1, 1, 1, 1, 1]
embeddings = await embedding_client.embeddings.create(
embeddings = await client.embeddings.create(
model=model_name,
input=input_tokens,
encoding_format="float",
......@@ -72,18 +74,14 @@ async def test_single_embedding(embedding_client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[EMBEDDING_MODEL_NAME],
)
async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
model_name: str):
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
# test List[str]
input_texts = [
"The cat sat on the mat.", "A feline was resting on a rug.",
"Stars twinkle brightly in the night sky."
]
embeddings = await embedding_client.embeddings.create(
embeddings = await client.embeddings.create(
model=model_name,
input=input_texts,
encoding_format="float",
......@@ -91,11 +89,14 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
assert embeddings.id is not None
assert len(embeddings.data) == 3
assert len(embeddings.data[0].embedding) == 4096
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 32
assert embeddings.usage.total_tokens == 32
# test List[List[int]]
input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
[25, 32, 64, 77]]
embeddings = await embedding_client.embeddings.create(
embeddings = await client.embeddings.create(
model=model_name,
input=input_tokens,
encoding_format="float",
......@@ -109,22 +110,70 @@ async def test_batch_embedding(embedding_client: openai.AsyncOpenAI,
@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
[EMBEDDING_MODEL_NAME],
)
async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_conversation_embedding(server: RemoteOpenAIServer,
                                      client: openai.AsyncOpenAI,
                                      model_name: str):
    """Compare a chat-style embedding request with a plain-prompt request.

    The conversation is posted as ``messages`` to ``v1/embeddings``; the same
    conversation is then rendered locally with ``DUMMY_CHAT_TEMPLATE`` and sent
    as a plain ``input`` string. Apart from ``id`` and ``created``, both JSON
    responses are expected to be equal.
    """
    turns = [
        ("user", "The cat sat on the mat."),
        ("assistant", "A feline was resting on a rug."),
        ("user", "Stars twinkle brightly in the night sky."),
    ]
    messages = [{"role": role, "content": content} for role, content in turns]

    # Chat-style request, sent as raw JSON via `requests`.
    chat_payload = {
        "model": model_name,
        "messages": messages,
        "encoding_format": "float",
    }
    chat_response = requests.post(server.url_for("v1/embeddings"),
                                  json=chat_payload)
    chat_response.raise_for_status()
    chat_embeddings = chat_response.json()

    # Render the same conversation to text on the client side.
    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
    prompt = tokenizer.apply_chat_template(
        messages,
        chat_template=DUMMY_CHAT_TEMPLATE,
        add_generation_prompt=True,
        continue_final_message=False,
        tokenize=False,
    )

    completion_response = await client.embeddings.create(
        model=model_name,
        input=prompt,
        encoding_format="float",
        # To be consistent with chat
        extra_body={"add_special_tokens": False},
    )
    completion_embeddings = completion_response.model_dump(mode="json")

    # `id` and `created` differ per request, so pop them before comparing.
    assert chat_embeddings.pop("id") is not None
    assert completion_embeddings.pop("id") is not None
    assert chat_embeddings.pop("created") <= completion_embeddings.pop(
        "created")
    assert chat_embeddings == completion_embeddings
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_base64_embedding(client: openai.AsyncOpenAI,
model_name: str):
input_texts = [
"Hello my name is",
"The best thing about vLLM is that it supports many different models"
]
responses_float = await embedding_client.embeddings.create(
input=input_texts, model=model_name, encoding_format="float")
responses_float = await client.embeddings.create(input=input_texts,
model=model_name,
encoding_format="float")
responses_base64 = await embedding_client.embeddings.create(
input=input_texts, model=model_name, encoding_format="base64")
responses_base64 = await client.embeddings.create(input=input_texts,
model=model_name,
encoding_format="base64")
decoded_responses_base64_data = []
for data in responses_base64.data:
......@@ -138,10 +187,65 @@ async def test_batch_base64_embedding(embedding_client: openai.AsyncOpenAI,
1]
# Default response is float32 decoded from base64 by OpenAI Client
responses_default = await embedding_client.embeddings.create(
input=input_texts, model=model_name)
responses_default = await client.embeddings.create(input=input_texts,
model=model_name)
assert responses_float.data[0].embedding == responses_default.data[
0].embedding
assert responses_float.data[1].embedding == responses_default.data[
1].embedding
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
                                           model_name: str):
    """Request embeddings with ``truncate_prompt_tokens=10``.

    The same checks run twice: first for a text prompt, then for a prompt
    given as a list of 21 token IDs. In both cases the reported prompt and
    total token counts must be exactly 10.
    """
    text_prompt = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]
    token_prompt = [
        1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
        9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
    ]

    # Text first, token IDs second: same order as two separate requests.
    for prompt_input in (text_prompt, token_prompt):
        embeddings = await client.embeddings.create(
            model=model_name,
            input=prompt_input,
            extra_body={"truncate_prompt_tokens": 10})

        assert embeddings.id is not None
        assert len(embeddings.data) == 1
        assert len(embeddings.data[0].embedding) == 4096
        assert embeddings.usage.completion_tokens == 0
        assert embeddings.usage.prompt_tokens == 10
        assert embeddings.usage.total_tokens == 10
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
                                                   model_name: str):
    """A ``truncate_prompt_tokens`` of 8193 must be rejected.

    The request is expected to fail with ``openai.BadRequestError``.
    """
    input_texts = [
        "Como o Brasil pode fomentar o desenvolvimento de modelos de IA?",
    ]

    # Only the raising call belongs inside `pytest.raises`. The previous
    # version also asserted on `embeddings.object` / `embeddings.message`
    # after the call; those statements could never execute because the call
    # raises first (and `embeddings` is never bound), so they were removed.
    # NOTE(review): to also pin the error text, capture the exception with
    # `pytest.raises(...) as exc_info` and check `str(exc_info.value)` once
    # the exact server message is confirmed.
    with pytest.raises(openai.BadRequestError):
        await client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193})
......@@ -71,19 +71,21 @@ EXPECTED_VALUES = {
[("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:request_params_n": [("_count", _NUM_REQUESTS)],
"vllm:request_params_best_of": [("_count", _NUM_REQUESTS)],
"vllm:request_params_max_tokens":
[("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
("_count", _NUM_REQUESTS)],
"vllm:prompt_tokens": [("_total",
_NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
"vllm:generation_tokens":
[("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
"vllm:generation_tokens": [
("_total", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)
],
"vllm:request_success": [("_total", _NUM_REQUESTS)],
}
@pytest.mark.asyncio
async def test_metrics_counts(client: openai.AsyncOpenAI):
base_url = str(client.base_url)[:-3].strip("/")
async def test_metrics_counts(server: RemoteOpenAIServer,
client: openai.AsyncClient):
for _ in range(_NUM_REQUESTS):
# sending a request triggers the metrics to be logged.
await client.completions.create(
......@@ -91,7 +93,7 @@ async def test_metrics_counts(client: openai.AsyncOpenAI):
prompt=_TOKENIZED_PROMPT,
max_tokens=_NUM_GENERATION_TOKENS_PER_REQUEST)
response = requests.get(base_url + "/metrics")
response = requests.get(server.url_for("metrics"))
print(response.text)
assert response.status_code == HTTPStatus.OK
......@@ -152,9 +154,9 @@ EXPECTED_METRICS = [
"vllm:request_params_n_sum",
"vllm:request_params_n_bucket",
"vllm:request_params_n_count",
"vllm:request_params_best_of_sum",
"vllm:request_params_best_of_bucket",
"vllm:request_params_best_of_count",
"vllm:request_params_max_tokens_sum",
"vllm:request_params_max_tokens_bucket",
"vllm:request_params_max_tokens_count",
"vllm:num_preemptions_total",
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
......@@ -175,16 +177,15 @@ EXPECTED_METRICS = [
@pytest.mark.asyncio
async def test_metrics_exist(client: openai.AsyncOpenAI):
base_url = str(client.base_url)[:-3].strip("/")
async def test_metrics_exist(server: RemoteOpenAIServer,
client: openai.AsyncClient):
# sending a request triggers the metrics to be logged.
await client.completions.create(model=MODEL_NAME,
prompt="Hello, my name is",
max_tokens=5,
temperature=0.0)
response = requests.get(base_url + "/metrics")
response = requests.get(server.url_for("metrics"))
assert response.status_code == HTTPStatus.OK
for metric in EXPECTED_METRICS:
......
......@@ -21,3 +21,38 @@ async def test_empty_prompt():
prompt="",
max_tokens=5,
temperature=0.0)
@pytest.mark.asyncio
async def test_out_of_vocab_token_ids():
    """A prompt containing token ID 999999 must raise a bad-request error.

    Starts a dedicated ``gpt2`` server and expects a completion request to
    fail with ``openai.BadRequestError`` whose message matches
    ``out of vocabulary``.
    """
    model_name = "gpt2"
    server_args = ["--enforce-eager"]
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        # Use the client as an async context manager so its HTTP resources
        # are released before the server is shut down.
        async with remote_server.get_async_client() as client:
            with pytest.raises(openai.BadRequestError,
                               match=re.compile('.*out of vocabulary.*')):
                await client.completions.create(model=model_name,
                                                prompt=[999999],
                                                max_tokens=5,
                                                temperature=0.0)
@pytest.mark.asyncio
async def test_reject_multistep_with_guided_decoding():
    """Guided decoding must be rejected when multi-step scheduling is on.

    Starts a ``gpt2`` server with ``--num-scheduler-steps 8`` and expects a
    completion request carrying ``response_format={"type": "json_object"}``
    to fail with ``openai.BadRequestError``.
    """
    model_name = "gpt2"
    server_args = ["--enforce-eager", "--num-scheduler-steps", "8"]
    with RemoteOpenAIServer(model_name, server_args) as remote_server:
        # Use the client as an async context manager so its HTTP resources
        # are released before the server is shut down.
        async with remote_server.get_async_client() as client:
            with pytest.raises(openai.BadRequestError,
                               match=re.compile(
                                   '.*Guided decoding .* multi-step decoding.*')):
                await client.completions.create(
                    model=model_name,
                    prompt="Hello",
                    max_tokens=5,
                    temperature=0.0,
                    extra_body={"response_format": {
                        "type": "json_object"
                    }})
import contextlib
import os
from typing import Any, List, NamedTuple
import openai # use the official client for correctness check
import pytest
from ...utils import RemoteOpenAIServer, models_path_prefix
# # any model with a chat template should work here
MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
API_KEY = "abc-123"
ERROR_API_KEY = "abc"
ROOT_PATH = "llm"
@pytest.fixture(scope="module")
def server():
    """Yield a module-scoped server started with ``--root-path /llm``.

    The server process receives a copy of the current environment with
    ``VLLM_API_KEY`` set to ``API_KEY``.
    """
    server_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "float16",
        "--enforce-eager",
        "--max-model-len",
        "4080",
        # use --root-path=/llm for testing
        "--root-path",
        f"/{ROOT_PATH}",
        "--chat-template",
        DUMMY_CHAT_TEMPLATE,
    ]
    # Current environment plus the API key override.
    child_env = {**os.environ, "VLLM_API_KEY": API_KEY}
    with RemoteOpenAIServer(MODEL_NAME, server_args,
                            env_dict=child_env) as remote_server:
        yield remote_server
class TestCase(NamedTuple):
    """One parametrized scenario for the root-path / API-key test."""

    # The name starts with "Test", so pytest would try to collect this tuple
    # type as a test class and emit a collection warning; opt out explicitly.
    __test__ = False

    # Model name sent in the chat completion request.
    model_name: str
    # Path segments passed to ``server.url_for`` to build the client base URL.
    base_url: List[str]
    # API key the client authenticates with.
    api_key: str
    # Exception type expected from the request, or None if it should succeed.
    expected_error: Any
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "test_case",
    [
        # Wrong key first, then the valid key; for each key the plain
        # http://localhost:8000/v1 URL, then http://localhost:8000/llm/v1.
        TestCase(model_name=MODEL_NAME,
                 base_url=url_parts,
                 api_key=key,
                 expected_error=error)
        for key, error in (
            (ERROR_API_KEY, openai.AuthenticationError),
            (API_KEY, None),
        )
        for url_parts in (["v1"], [ROOT_PATH, "v1"])
    ],
)
async def test_chat_session_root_path_with_api_key(server: RemoteOpenAIServer,
                                                   test_case: TestCase):
    """Run one chat completion against the given base URL and API key.

    When ``test_case.expected_error`` is set, the request must raise that
    exception type; otherwise the response must hold a single non-empty
    assistant choice that finished with ``"stop"``.
    """
    saying: str = "Here is a common saying about apple. An apple a day, keeps"
    conversation = [
        {
            "role": "user",
            "content": "tell me a common saying"
        },
        {
            "role": "assistant",
            "content": saying
        },
    ]

    # A no-op context for the success cases, `pytest.raises` for failures.
    if test_case.expected_error is None:
        expectation = contextlib.nullcontext()
    else:
        expectation = pytest.raises(test_case.expected_error)

    with expectation:
        client = openai.AsyncOpenAI(
            api_key=test_case.api_key,
            base_url=server.url_for(*test_case.base_url),
            max_retries=0)
        chat_completion = await client.chat.completions.create(
            model=test_case.model_name,
            messages=conversation,
            extra_body={
                "continue_final_message": True,
                "add_generation_prompt": False
            })

        # These run only when the request above did not raise.
        assert chat_completion.id is not None
        assert len(chat_completion.choices) == 1
        choice = chat_completion.choices[0]
        assert choice.finish_reason == "stop"
        message = choice.message
        assert len(message.content) > 0
        assert message.role == "assistant"
import os
import pytest
import requests
from vllm.entrypoints.openai.protocol import ScoreResponse
from ...utils import RemoteOpenAIServer, models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3")
@pytest.fixture(scope="module")
def server():
    """Yield a module-scoped server for ``MODEL_NAME`` in eager mode."""
    with RemoteOpenAIServer(MODEL_NAME, ["--enforce-eager"]) as remote_server:
        yield remote_server
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_text_1_str_text_2_list(server: RemoteOpenAIServer,
                                      model_name: str):
    """Score one query string against a list of two candidate texts.

    Two scores are expected: at most 0.01 for the Brazil sentence and at
    least 0.9 for the France sentence.
    """
    payload = {
        "model": model_name,
        "text_1": "What is the capital of France?",
        "text_2": [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris.",
        ],
    }
    score_response = requests.post(server.url_for("score"), json=payload)
    score_response.raise_for_status()
    score = ScoreResponse.model_validate(score_response.json())

    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 2
    assert score.data[0].score <= 0.01
    assert score.data[1].score >= 0.9
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_text_1_list_text_2_list(server: RemoteOpenAIServer,
                                       model_name: str):
    """Score two queries against two candidate texts given as lists.

    Two scores are expected: at most 0.01 for the first entry and at least
    0.9 for the second.
    """
    payload = {
        "model": model_name,
        "text_1": [
            "What is the capital of the United States?",
            "What is the capital of France?",
        ],
        "text_2": [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris.",
        ],
    }
    score_response = requests.post(server.url_for("score"), json=payload)
    score_response.raise_for_status()
    score = ScoreResponse.model_validate(score_response.json())

    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 2
    assert score.data[0].score <= 0.01
    assert score.data[1].score >= 0.9
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_text_1_str_text_2_str(server: RemoteOpenAIServer,
                                     model_name: str):
    """Score a single query string against a single candidate string.

    Exactly one score is expected, and it must be at least 0.9.
    """
    payload = {
        "model": model_name,
        "text_1": "What is the capital of France?",
        "text_2": "The capital of France is Paris.",
    }
    score_response = requests.post(server.url_for("score"), json=payload)
    score_response.raise_for_status()
    score = ScoreResponse.model_validate(score_response.json())

    assert score.id is not None
    assert score.data is not None
    assert len(score.data) == 1
    assert score.data[0].score >= 0.9
......@@ -16,15 +16,22 @@ CHAT_TEMPLATE = "Dummy chat template for testing {}"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
@dataclass
class MockHFConfig:
    """Stand-in config object exposing only ``model_type``."""

    # Single dataclass field; defaults to the placeholder value "any".
    model_type: str = "any"
@dataclass
class MockModelConfig:
    """Stand-in model config carrying fixed values for the serving tests.

    None of the attributes below is annotated, so ``@dataclass`` leaves them
    as plain class attributes (shared by all instances) instead of turning
    them into constructor fields.
    """

    task = "generate"
    # Tokenizer settings; the tokenizer is named after the model under test.
    tokenizer = MODEL_NAME
    trust_remote_code = False
    tokenizer_mode = "auto"
    max_model_len = 100
    tokenizer_revision = None
    embedding_mode = False
    # Nested config objects, created once when the class body is executed.
    multimodal_config = MultiModalConfig()
    hf_config = MockHFConfig()
    logits_processor_pattern = None
@dataclass
......@@ -43,6 +50,7 @@ async def _async_serving_chat_init():
BASE_MODEL_PATHS,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
lora_modules=None,
prompt_adapters=None,
request_logger=None)
......@@ -64,6 +72,7 @@ def test_serving_chat_should_set_correct_max_tokens():
BASE_MODEL_PATHS,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
lora_modules=None,
prompt_adapters=None,
request_logger=None)
......
......@@ -10,6 +10,8 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse,
LoadLoraAdapterRequest,
UnloadLoraAdapterRequest)
from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing
from vllm.lora.request import LoRARequest
from ...utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b")
......@@ -35,6 +37,16 @@ async def _async_serving_engine_init():
return serving_engine
@pytest.mark.asyncio
async def test_serving_model_name():
    """Check what ``_get_model_name`` returns with and without a LoRA request.

    With ``None`` it must return ``MODEL_NAME``; with a ``LoRARequest`` it
    must return that request's ``lora_name``.
    """
    engine = await _async_serving_engine_init()

    # No LoRA request.
    assert engine._get_model_name(None) == MODEL_NAME

    # With a LoRA request.
    lora_request = LoRARequest(lora_name="adapter",
                               lora_path="/path/to/adapter2",
                               lora_int_id=1)
    assert engine._get_model_name(lora_request) == lora_request.lora_name
@pytest.mark.asyncio
async def test_load_lora_adapter_success():
serving_engine = await _async_serving_engine_init()
......
......@@ -6,7 +6,7 @@ import pytest
from ...utils import RemoteOpenAIServer, models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
@pytest.mark.asyncio
......
import openai # use the official client for correctness check
import pytest
import os
import pytest_asyncio
......@@ -57,9 +56,11 @@ async def client(server):
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
indirect=["tokenizer_name"],
)
async def test_tokenize_completions(client: openai.AsyncOpenAI,
model_name: str, tokenizer_name: str):
base_url = str(client.base_url)[:-3].strip("/")
async def test_tokenize_completions(
server: RemoteOpenAIServer,
model_name: str,
tokenizer_name: str,
):
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
tokenizer_mode="fast")
......@@ -67,7 +68,7 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI,
prompt = "vllm1 This is a test prompt."
tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
response = requests.post(base_url + "/tokenize",
response = requests.post(server.url_for("tokenize"),
json={
"add_special_tokens": add_special,
"model": model_name,
......@@ -88,9 +89,11 @@ async def test_tokenize_completions(client: openai.AsyncOpenAI,
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
indirect=["tokenizer_name"],
)
async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
tokenizer_name: str):
base_url = str(client.base_url)[:-3].strip("/")
async def test_tokenize_chat(
server: RemoteOpenAIServer,
model_name: str,
tokenizer_name: str,
):
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
tokenizer_mode="fast")
......@@ -106,28 +109,40 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
"role": "user",
"content": "Can I ask a question? vllm1"
}]
prompt = tokenizer.apply_chat_template(
add_generation_prompt=add_generation,
conversation=conversation,
tokenize=False)
tokens = tokenizer.encode(prompt, add_special_tokens=add_special)
response = requests.post(base_url + "/tokenize",
json={
"add_generation_prompt":
add_generation,
"add_special_tokens": add_special,
"messages": conversation,
"model": model_name
})
response.raise_for_status()
assert response.json() == {
"tokens": tokens,
"count": len(tokens),
"max_model_len": 8192
}
for continue_final in [False, True]:
if add_generation and continue_final:
continue
if continue_final:
conversation.append({
"role": "assistant",
"content": "Sure,"
})
prompt = tokenizer.apply_chat_template(
add_generation_prompt=add_generation,
continue_final_message=continue_final,
conversation=conversation,
tokenize=False)
tokens = tokenizer.encode(prompt,
add_special_tokens=add_special)
response = requests.post(server.url_for("tokenize"),
json={
"add_generation_prompt":
add_generation,
"continue_final_message":
continue_final,
"add_special_tokens": add_special,
"messages": conversation,
"model": model_name
})
response.raise_for_status()
assert response.json() == {
"tokens": tokens,
"count": len(tokens),
"max_model_len": 8192
}
@pytest.mark.asyncio
......@@ -136,17 +151,18 @@ async def test_tokenize_chat(client: openai.AsyncOpenAI, model_name: str,
[(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
indirect=["tokenizer_name"],
)
async def test_detokenize(client: openai.AsyncOpenAI, model_name: str,
tokenizer_name: str):
base_url = str(client.base_url)[:-3].strip("/")
async def test_detokenize(
server: RemoteOpenAIServer,
model_name: str,
tokenizer_name: str,
):
tokenizer = get_tokenizer(tokenizer_name=tokenizer_name,
tokenizer_mode="fast")
prompt = "This is a test prompt. vllm1"
tokens = tokenizer.encode(prompt, add_special_tokens=False)
print(f"CALLING {base_url} FOR {model_name}")
response = requests.post(base_url + "/detokenize",
response = requests.post(server.url_for("detokenize"),
json={
"model": model_name,
"tokens": tokens
......
from typing import Dict, List
import os
import openai
import pytest
import pytest_asyncio
from vllm.multimodal.utils import encode_video_base64, fetch_video
from ...utils import RemoteOpenAIServer, models_path_prefix, urls_port
MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
MAXIMUM_VIDEOS = 4
# TEST_VIDEO_URLS = [
# "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4",
# "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ElephantsDream.mp4",
# "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerBlazes.mp4",
# "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4",
# ]
TEST_VIDEO_URLS = [
f"http://localhost:{urls_port}/BigBuckBunny.mp4",
f"http://localhost:{urls_port}/ElephantsDream.mp4",
f"http://localhost:{urls_port}/ForBiggerBlazes.mp4",
f"http://localhost:{urls_port}/ForBiggerFun.mp4",
]
@pytest.fixture(scope="module")
def server():
    """Yield a module-scoped server for the video chat tests.

    ``--limit-mm-per-prompt`` is set to ``video=MAXIMUM_VIDEOS``.
    """
    # Flag/value pairs are flattened into the argument list below; flags
    # without a value use an empty tuple tail.
    options = [
        ("--task", "generate"),
        ("--dtype", "bfloat16"),
        ("--max-model-len", "32768"),
        ("--max-num-seqs", "2"),
        ("--enforce-eager", ),
        ("--trust-remote-code", ),
        ("--limit-mm-per-prompt", f"video={MAXIMUM_VIDEOS}"),
    ]
    cli_args = [token for option in options for token in option]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server):
    """Yield an async client from ``server``; it is closed after each test."""
    async with server.get_async_client() as openai_client:
        yield openai_client
@pytest.fixture(scope="session")
def base64_encoded_video() -> Dict[str, str]:
    """Map each URL in ``TEST_VIDEO_URLS`` to its base64-encoded video.

    Session-scoped, so every video is fetched and encoded once per test run.
    """
    encoded: Dict[str, str] = {}
    for url in TEST_VIDEO_URLS:
        encoded[url] = encode_video_base64(fetch_video(url))
    return encoded
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video(client: openai.AsyncOpenAI,
                                         model_name: str, video_url: str):
    """Ask about one video passed by URL, then send a follow-up turn.

    The first reply must stop on the 10-token limit with the expected usage
    counts. The follow-up turn only checks that some content came back.
    """
    video_part = {"type": "video_url", "video_url": {"url": video_url}}
    text_part = {"type": "text", "text": "What's in this video?"}
    messages = [{"role": "user", "content": [video_part, text_part]}]

    # test single completion
    first_reply = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5)
    assert len(first_reply.choices) == 1

    choice = first_reply.choices[0]
    assert choice.finish_reason == "length"
    assert first_reply.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=6299, total_tokens=6309)

    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"

    # test multi-turn dialogue
    messages.extend([
        {
            "role": "assistant",
            "content": message.content
        },
        {
            "role": "user",
            "content": "express your result in json"
        },
    ])
    second_reply = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = second_reply.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_beamsearch(client: openai.AsyncOpenAI,
                                                    model_name: str,
                                                    video_url: str):
    """Request two beam-search candidates for one video passed by URL.

    The response must hold two choices whose contents differ.
    """
    video_part = {"type": "video_url", "video_url": {"url": video_url}}
    text_part = {"type": "text", "text": "What's in this video?"}
    messages = [{"role": "user", "content": [video_part, text_part]}]

    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        n=2,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5,
        extra_body={"use_beam_search": True})

    choices = chat_completion.choices
    assert len(choices) == 2
    assert choices[0].message.content != choices[1].message.content
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded(
        client: openai.AsyncOpenAI, model_name: str, video_url: str,
        base64_encoded_video: Dict[str, str]):
    """Ask about one video passed inline as a base64 data URL.

    The first reply must stop on the 10-token limit with the expected usage
    counts. The follow-up turn only checks that some content came back.
    """
    data_url = f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
    video_part = {"type": "video_url", "video_url": {"url": data_url}}
    text_part = {"type": "text", "text": "What's in this video?"}
    messages = [{"role": "user", "content": [video_part, text_part]}]

    # test single completion
    first_reply = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
        top_logprobs=5)
    assert len(first_reply.choices) == 1

    choice = first_reply.choices[0]
    assert choice.finish_reason == "length"
    assert first_reply.usage == openai.types.CompletionUsage(
        completion_tokens=10, prompt_tokens=6299, total_tokens=6309)

    message = choice.message
    assert message.content is not None and len(message.content) >= 10
    assert message.role == "assistant"

    # test multi-turn dialogue
    messages.extend([
        {
            "role": "assistant",
            "content": message.content
        },
        {
            "role": "user",
            "content": "express your result in json"
        },
    ])
    second_reply = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
    )
    message = second_reply.choices[0].message
    assert message.content is not None and len(message.content) >= 0
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_single_chat_session_video_base64encoded_beamsearch(
        client: openai.AsyncOpenAI, model_name: str, video_url: str,
        base64_encoded_video: Dict[str, str]):
    """Request two beam-search candidates for one base64-encoded video.

    The response must hold two choices whose contents differ.
    """
    data_url = f"data:video/jpeg;base64,{base64_encoded_video[video_url]}"
    video_part = {"type": "video_url", "video_url": {"url": data_url}}
    text_part = {"type": "text", "text": "What's in this video?"}
    messages = [{"role": "user", "content": [video_part, text_part]}]

    chat_completion = await client.chat.completions.create(
        model=model_name,
        messages=messages,
        n=2,
        max_completion_tokens=10,
        extra_body={"use_beam_search": True})

    choices = chat_completion.choices
    assert len(choices) == 2
    assert choices[0].message.content != choices[1].message.content
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
async def test_chat_streaming_video(client: openai.AsyncOpenAI,
                                    model_name: str, video_url: str):
    """Check that streaming yields the same text as a non-streaming request.

    Both requests use ``temperature=0.0``. The streamed deltas, joined
    together, must equal the non-streamed message content, and exactly one
    chunk may carry a finish reason.
    """
    video_part = {"type": "video_url", "video_url": {"url": video_url}}
    text_part = {"type": "text", "text": "What's in this video?"}
    messages = [{"role": "user", "content": [video_part, text_part]}]

    # Arguments shared by the plain and the streaming request.
    request_args = {
        "model": model_name,
        "messages": messages,
        "max_completion_tokens": 10,
        "temperature": 0.0,
    }

    # test single completion
    chat_completion = await client.chat.completions.create(**request_args)
    reference_choice = chat_completion.choices[0]
    output = reference_choice.message.content
    stop_reason = reference_choice.finish_reason

    # test streaming
    stream = await client.chat.completions.create(**request_args, stream=True)
    chunks: List[str] = []
    finish_reason_count = 0
    async for chunk in stream:
        streamed_choice = chunk.choices[0]
        delta = streamed_choice.delta
        if delta.role:
            assert delta.role == "assistant"
        if delta.content:
            chunks.append(delta.content)
        if streamed_choice.finish_reason is not None:
            finish_reason_count += 1

    # finish reason should only return in last block
    assert finish_reason_count == 1
    # `chunk` and `delta` still refer to the final streamed chunk here.
    assert chunk.choices[0].finish_reason == stop_reason
    assert delta.content
    assert "".join(chunks) == output
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize(
    "video_urls",
    [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))])
async def test_multi_video_input(client: openai.AsyncOpenAI, model_name: str,
                                 video_urls: List[str]):
    """Send several videos in a single user message.

    If more than ``MAXIMUM_VIDEOS`` URLs are given, the request must fail
    with ``openai.BadRequestError`` and a plain completion must still work
    afterwards. Otherwise the chat request must return content.
    """
    content = [{
        "type": "video_url",
        "video_url": {
            "url": video_url
        }
    } for video_url in video_urls]
    content.append({"type": "text", "text": "What's in this video?"})
    messages = [{"role": "user", "content": content}]

    # NOTE(review): with four test URLs the parametrization yields 2 and 3
    # videos, while MAXIMUM_VIDEOS is 4, so the over-limit branch below is
    # not reached as parametrized.
    if len(video_urls) > MAXIMUM_VIDEOS:
        with pytest.raises(openai.BadRequestError):  # test multi-video input
            await client.chat.completions.create(
                model=model_name,
                messages=messages,
                max_completion_tokens=10,
                temperature=0.0,
            )

        # the server should still work afterwards
        completion = await client.completions.create(
            model=model_name,
            prompt=[0, 0, 0, 0, 0],
            max_tokens=5,
            temperature=0.0,
        )
        completion = completion.choices[0].text
        assert completion is not None and len(completion) >= 0
    else:
        chat_completion = await client.chat.completions.create(
            model=model_name,
            messages=messages,
            max_completion_tokens=10,
            temperature=0.0,
        )
        message = chat_completion.choices[0].message
        assert message.content is not None and len(message.content) >= 0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment