Commit 469e903b authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.2' into v0.8.2-dev

parents 389ebcf7 25f560a6
......@@ -5,7 +5,6 @@ import os
from vllm import LLM
from ...utils import models_path_prefix
from vllm.config import LoadFormat
@pytest.fixture(autouse=True)
......@@ -17,17 +16,13 @@ def v1(run_with_both_engines):
def test_empty_prompt():
llm = LLM(model=os.path.join(models_path_prefix, "gpt2"),
load_format=LoadFormat.RUNAI_STREAMER,
enforce_eager=True)
llm = LLM(model=os.path.join(models_path_prefix, "openai-community/gpt2"), enforce_eager=True)
with pytest.raises(ValueError, match='Prompt cannot be empty'):
llm.generate([""])
@pytest.mark.skip_v1
def test_out_of_vocab_token():
llm = LLM(model=os.path.join(models_path_prefix, "gpt2"),
load_format=LoadFormat.RUNAI_STREAMER,
enforce_eager=True)
llm = LLM(model=os.path.join(models_path_prefix, "openai-community/gpt2"), enforce_eager=True)
with pytest.raises(ValueError, match='out of vocabulary'):
llm.generate({"prompt_token_ids": [999999]})
......@@ -56,32 +56,37 @@ def cache_models():
@pytest.mark.skip_global_cleanup
@pytest.mark.usefixtures("cache_models")
def test_offline_mode(monkeypatch):
def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
# Set HF to offline mode and ensure we can still construct an LLM
try:
monkeypatch.setenv("HF_HUB_OFFLINE", "1")
monkeypatch.setenv("VLLM_NO_USAGE_STATS", "1")
def disable_connect(*args, **kwargs):
raise RuntimeError("No http calls allowed")
monkeypatch.setattr(urllib3.connection.HTTPConnection, "connect",
disable_connect)
monkeypatch.setattr(urllib3.connection.HTTPSConnection, "connect",
disable_connect)
# Need to re-import huggingface_hub and friends to setup offline mode
_re_import_modules()
# Cached model files should be used in offline mode
for model_config in MODEL_CONFIGS:
LLM(**model_config)
finally:
# Reset the environment after the test
# NB: Assuming tests are run in online mode
monkeypatch.delenv("HF_HUB_OFFLINE")
monkeypatch.delenv("VLLM_NO_USAGE_STATS")
_re_import_modules()
pass
with monkeypatch.context() as m:
try:
m.setenv("HF_HUB_OFFLINE", "1")
m.setenv("VLLM_NO_USAGE_STATS", "1")
def disable_connect(*args, **kwargs):
raise RuntimeError("No http calls allowed")
m.setattr(
urllib3.connection.HTTPConnection,
"connect",
disable_connect,
)
m.setattr(
urllib3.connection.HTTPSConnection,
"connect",
disable_connect,
)
# Need to re-import huggingface_hub
# and friends to setup offline mode
_re_import_modules()
# Cached model files should be used in offline mode
for model_config in MODEL_CONFIGS:
LLM(**model_config)
finally:
# Reset the environment after the test
# NB: Assuming tests are run in online mode
_re_import_modules()
def _re_import_modules():
......
......@@ -21,7 +21,7 @@ NUM_CONCURRENT = 500
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
RTOL = 0.03
EXPECTED_VALUE = 0.58
EXPECTED_VALUE = 0.54
DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"]
MORE_ARGS_LIST = [
[], # Default
......@@ -71,7 +71,7 @@ def run_test(more_args):
@pytest.mark.skipif(not current_platform.is_cuda()
and not current_platform.is_tpu(),
reason="V1 currently only supported on CUDA and TPU")
def test_lm_eval_accuracy_v1_engine(monkeypatch):
def test_lm_eval_accuracy_v1_engine(monkeypatch: pytest.MonkeyPatch):
"""Run with the V1 Engine."""
with monkeypatch.context() as m:
......@@ -86,7 +86,8 @@ def test_lm_eval_accuracy_v1_engine(monkeypatch):
@pytest.mark.parametrize("more_args", MORE_ARGS_LIST)
def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args):
def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch,
more_args):
"""Run with the V0 Engine."""
with monkeypatch.context() as m:
......
......@@ -10,7 +10,6 @@ import asyncio
import io
import time
from statistics import mean, median
from typing import List
import librosa
import pytest
......@@ -67,7 +66,7 @@ async def process_dataset(model, client, data, concurrent_request):
audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"]
_ = await bound_transcribe(model, sem, client, (audio, sr), "")
tasks: List[asyncio.Task] = []
tasks: list[asyncio.Task] = []
for sample in data:
audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
task = asyncio.create_task(
......
# SPDX-License-Identifier: Apache-2.0
from typing import List
import pytest
from transformers import AutoTokenizer
......@@ -180,7 +178,7 @@ def test_reasoning(
):
output = tokenizer.tokenize(param_dict["output"])
# decode everything to tokens
output_tokens: List[str] = [
output_tokens: list[str] = [
tokenizer.convert_tokens_to_string([token]) for token in output
]
parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
......
# SPDX-License-Identifier: Apache-2.0
from typing import List, Optional, Tuple, Union
from typing import Optional, Union
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaMessage)
......@@ -33,10 +33,10 @@ class StreamingReasoningReconstructor:
def run_reasoning_extraction(
reasoning_parser: ReasoningParser,
model_output: List[str],
model_output: list[str],
request: Union[ChatCompletionRequest, None] = None,
streaming: bool = False,
) -> Tuple[Optional[str], Optional[str]]:
) -> tuple[Optional[str], Optional[str]]:
if streaming:
reconstructor = run_reasoning_extraction_streaming(
reasoning_parser,
......@@ -55,9 +55,9 @@ def run_reasoning_extraction(
def run_reasoning_extraction_nonstreaming(
reasoning_parser: ReasoningParser,
model_output: List[str],
model_output: list[str],
request: Union[ChatCompletionRequest, None] = None,
) -> Tuple[Optional[str], Optional[str]]:
) -> tuple[Optional[str], Optional[str]]:
request = request or ChatCompletionRequest(messages=[], model="test-model")
return reasoning_parser.extract_reasoning_content(
model_output=''.join(model_output), request=request)
......@@ -65,13 +65,13 @@ def run_reasoning_extraction_nonstreaming(
def run_reasoning_extraction_streaming(
reasoning_parser: ReasoningParser,
model_deltas: List[str],
model_deltas: list[str],
request: Union[ChatCompletionRequest, None] = None,
) -> StreamingReasoningReconstructor:
request = request or ChatCompletionRequest(messages=[], model="test-model")
reconstructor = StreamingReasoningReconstructor()
previous_text = ""
previous_tokens: List[int] = []
previous_tokens: list[int] = []
for delta in model_deltas:
token_delta = [
reasoning_parser.vocab.get(token)
......
# SPDX-License-Identifier: Apache-2.0
from typing import Dict, List
import openai
import pytest
import os
......@@ -20,8 +18,6 @@ TEST_AUDIO_URLS = [
@pytest.fixture(scope="module")
def server():
args = [
"--dtype",
"bfloat16",
"--max-model-len",
"2048",
"--max-num-seqs",
......@@ -41,7 +37,7 @@ async def client(server):
@pytest.fixture(scope="session")
def base64_encoded_audio() -> Dict[str, str]:
def base64_encoded_audio() -> dict[str, str]:
return {
audio_url: encode_audio_base64(*fetch_audio(audio_url))
for audio_url in TEST_AUDIO_URLS
......@@ -83,7 +79,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=201, total_tokens=211)
completion_tokens=10, prompt_tokens=202, total_tokens=212)
message = choice.message
message = chat_completion.choices[0].message
......@@ -107,7 +103,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_single_chat_session_audio_base64encoded(
client: openai.AsyncOpenAI, model_name: str, audio_url: str,
base64_encoded_audio: Dict[str, str]):
base64_encoded_audio: dict[str, str]):
messages = [{
"role":
......@@ -140,7 +136,7 @@ async def test_single_chat_session_audio_base64encoded(
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=201, total_tokens=211)
completion_tokens=10, prompt_tokens=202, total_tokens=212)
message = choice.message
message = chat_completion.choices[0].message
......@@ -165,7 +161,7 @@ async def test_single_chat_session_audio_base64encoded(
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_single_chat_session_input_audio(
client: openai.AsyncOpenAI, model_name: str, audio_url: str,
base64_encoded_audio: Dict[str, str]):
base64_encoded_audio: dict[str, str]):
messages = [{
"role":
"user",
......@@ -196,7 +192,7 @@ async def test_single_chat_session_input_audio(
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=201, total_tokens=211)
completion_tokens=10, prompt_tokens=202, total_tokens=212)
message = choice.message
message = chat_completion.choices[0].message
......@@ -255,7 +251,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
temperature=0.0,
stream=True,
)
chunks: List[str] = []
chunks: list[str] = []
finish_reason_count = 0
async for chunk in stream:
delta = chunk.choices[0].delta
......@@ -277,7 +273,7 @@ async def test_chat_streaming_audio(client: openai.AsyncOpenAI,
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
model_name: str, audio_url: str,
base64_encoded_audio: Dict[str,
base64_encoded_audio: dict[str,
str]):
messages = [{
"role":
......@@ -315,7 +311,7 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
temperature=0.0,
stream=True,
)
chunks: List[str] = []
chunks: list[str] = []
finish_reason_count = 0
async for chunk in stream:
delta = chunk.choices[0].delta
......@@ -337,7 +333,7 @@ async def test_chat_streaming_input_audio(client: openai.AsyncOpenAI,
@pytest.mark.parametrize("audio_url", TEST_AUDIO_URLS)
async def test_multi_audio_input(client: openai.AsyncOpenAI, model_name: str,
audio_url: str,
base64_encoded_audio: Dict[str, str]):
base64_encoded_audio: dict[str, str]):
messages = [{
"role":
......
......@@ -2,7 +2,6 @@
import asyncio
from http import HTTPStatus
from typing import List
import openai
import pytest
......@@ -18,7 +17,7 @@ MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
@pytest.fixture(scope='module')
def server_args(request: pytest.FixtureRequest) -> List[str]:
def server_args(request: pytest.FixtureRequest) -> list[str]:
""" Provide extra arguments to the server via indirect parametrization
Usage:
......@@ -173,3 +172,51 @@ async def test_request_wrong_content_type(server: RemoteOpenAIServer):
extra_headers={
"Content-Type": "application/x-www-form-urlencoded"
})
@pytest.mark.parametrize(
"server_args",
[
pytest.param(["--enable-server-load-tracking"],
id="enable-server-load-tracking")
],
indirect=True,
)
@pytest.mark.asyncio
async def test_server_load(server: RemoteOpenAIServer):
# Check initial server load
response = requests.get(server.url_for("load"))
assert response.status_code == HTTPStatus.OK
assert response.json().get("server_load") == 0
def make_long_completion_request():
return requests.post(
server.url_for("v1/completions"),
headers={"Content-Type": "application/json"},
json={
"prompt": "Give me a long story",
"max_tokens": 1000,
"temperature": 0,
},
)
# Start the completion request in a background thread.
completion_future = asyncio.create_task(
asyncio.to_thread(make_long_completion_request))
# Give a short delay to ensure the request has started.
await asyncio.sleep(0.1)
# Check server load while the completion request is running.
response = requests.get(server.url_for("load"))
assert response.status_code == HTTPStatus.OK
assert response.json().get("server_load") == 1
# Wait for the completion request to finish.
await completion_future
await asyncio.sleep(0.1)
# Check server load after the completion request has finished.
response = requests.get(server.url_for("load"))
assert response.status_code == HTTPStatus.OK
assert response.json().get("server_load") == 0
......@@ -3,13 +3,14 @@
# imports for guided decoding tests
import json
import re
from typing import Dict, List, Optional
from typing import Optional
import jsonschema
import openai # use the official client for correctness check
import pytest
import os
import pytest_asyncio
import requests
import torch
from openai import BadRequestError
......@@ -190,7 +191,7 @@ async def test_too_many_chat_logprobs(client: openai.AsyncOpenAI,
async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI,
model_name: str,
prompt_logprobs: Optional[int]):
params: Dict = {
params: dict = {
"messages": [{
"role": "system",
"content": "You are a helpful assistant."
......@@ -232,7 +233,7 @@ async def test_prompt_logprobs_chat(client: openai.AsyncOpenAI,
)
async def test_more_than_one_prompt_logprobs_chat(client: openai.AsyncOpenAI,
model_name: str):
params: Dict = {
params: dict = {
"messages": [{
"role": "system",
"content": "You are a helpful assistant."
......@@ -343,7 +344,7 @@ async def test_chat_streaming(client: openai.AsyncOpenAI, model_name: str):
temperature=0.0,
stream=True,
)
chunks: List[str] = []
chunks: list[str] = []
finish_reason_count = 0
async for chunk in stream:
delta = chunk.choices[0].delta
......@@ -1001,3 +1002,34 @@ async def test_long_seed(client: openai.AsyncOpenAI):
assert ("greater_than_equal" in exc_info.value.message
or "less_than_equal" in exc_info.value.message)
@pytest.mark.asyncio
async def test_http_chat_wo_model_name(server: RemoteOpenAIServer):
url = f"http://localhost:{server.port}/v1/chat/completions"
headers = {
"Content-Type": "application/json",
}
data = {
# model_name is avoided here.
"messages": [{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "what is 1+1?"
}],
"max_tokens":
5
}
response = requests.post(url, headers=headers, json=data)
response_data = response.json()
print(response_data)
choice = response_data.get("choices")[0]
message = choice.get("message")
assert message is not None
content = message.get("content")
assert content is not None
assert len(content) > 0
......@@ -11,7 +11,6 @@ from ...utils import RemoteOpenAIServer, models_path_prefix
# # any model with a chat template should work here
MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
@pytest.fixture(scope="module")
......@@ -23,8 +22,6 @@ def server():
"--enforce-eager",
"--max-model-len",
"4080",
"--chat-template",
DUMMY_CHAT_TEMPLATE,
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
......
......@@ -108,8 +108,10 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
# Call the function and get the result
result = apply_hf_chat_template(
tokenizer,
trust_remote_code=True,
conversation=mock_request.messages,
chat_template=mock_request.chat_template or template_content,
tools=None,
add_generation_prompt=mock_request.add_generation_prompt,
continue_final_message=mock_request.continue_final_message,
)
......
# SPDX-License-Identifier: Apache-2.0
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
# a reasoning and tool calling model
MODEL_NAME = "Qwen/QwQ-32B"
@pytest.fixture(scope="module")
def server(): # noqa: F811
args = [
"--max-model-len", "8192", "--enforce-eager", "--enable-reasoning",
"--reasoning-parser", "deepseek_r1", "--enable-auto-tool-choice",
"--tool-call-parser", "hermes"
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def client(server):
async with server.get_async_client() as async_client:
yield async_client
TOOLS = [{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type":
"string",
"description":
"The city to find the weather for, e.g. 'San Francisco'"
},
"state": {
"type":
"string",
"description":
"the two-letter abbreviation for the state that the city is"
" in, e.g. 'CA' which would mean 'California'"
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"]
}
},
"required": ["city", "state", "unit"]
}
}
}]
MESSAGES = [{
"role": "user",
"content": "Hi! How are you doing today?"
}, {
"role": "assistant",
"content": "I'm doing well! How can I help you?"
}, {
"role":
"user",
"content":
"Can you tell me what the temperate will be in Dallas, in fahrenheit?"
}]
FUNC_NAME = "get_current_weather"
FUNC_ARGS = """{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}"""
def extract_reasoning_and_calls(chunks: list):
reasoning_content = ""
tool_call_idx = -1
arguments = []
function_names = []
for chunk in chunks:
if chunk.choices[0].delta.tool_calls:
tool_call = chunk.choices[0].delta.tool_calls[0]
if tool_call.index != tool_call_idx:
tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
arguments.append("")
function_names.append("")
if tool_call.function:
if tool_call.function.name:
function_names[tool_call_idx] = tool_call.function.name
if tool_call.function.arguments:
arguments[tool_call_idx] += tool_call.function.arguments
else:
if hasattr(chunk.choices[0].delta, "reasoning_content"):
reasoning_content += chunk.choices[0].delta.reasoning_content
return reasoning_content, arguments, function_names
# test streaming
@pytest.mark.asyncio
async def test_chat_streaming_of_tool_and_reasoning(
client: openai.AsyncOpenAI):
stream = await client.chat.completions.create(
model=MODEL_NAME,
messages=MESSAGES,
tools=TOOLS,
temperature=0.0,
stream=True,
)
chunks = []
async for chunk in stream:
chunks.append(chunk)
reasoning_content, arguments, function_names = extract_reasoning_and_calls(
chunks)
assert len(reasoning_content) > 0
assert len(function_names) > 0 and function_names[0] == FUNC_NAME
assert len(arguments) > 0 and arguments[0] == FUNC_ARGS
# test full generate
@pytest.mark.asyncio
async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
tool_calls = await client.chat.completions.create(
model=MODEL_NAME,
messages=MESSAGES,
tools=TOOLS,
temperature=0.0,
stream=False,
)
assert len(tool_calls.choices[0].message.reasoning_content) > 0
assert tool_calls.choices[0].message.tool_calls[0].function.name \
== FUNC_NAME
assert tool_calls.choices[0].message.tool_calls[0].function.arguments \
== FUNC_ARGS
......@@ -26,7 +26,7 @@ def serve_parser():
return make_arg_parser(parser)
### Tests for Lora module parsing
### Tests for LoRA module parsing
def test_valid_key_value_format(serve_parser):
# Test old format: name=path
args = serve_parser.parse_args([
......
......@@ -5,7 +5,7 @@ import json
import re
import shutil
from tempfile import TemporaryDirectory
from typing import Dict, List, Optional
from typing import Optional
import jsonschema
import openai # use the official client for correctness check
......@@ -290,7 +290,7 @@ async def test_too_many_completion_logprobs(client: openai.AsyncOpenAI,
async def test_prompt_logprobs_completion(client: openai.AsyncOpenAI,
model_name: str,
prompt_logprobs: Optional[int]):
params: Dict = {
params: dict = {
"prompt": ["A robot may not injure another robot", "My name is"],
"model": model_name,
}
......@@ -334,7 +334,7 @@ async def test_completion_streaming(client: openai.AsyncOpenAI,
max_tokens=5,
temperature=0.0,
stream=True)
chunks: List[str] = []
chunks: list[str] = []
finish_reason_count = 0
async for chunk in stream:
chunks.append(chunk.choices[0].text)
......@@ -367,7 +367,7 @@ async def test_parallel_streaming(client: openai.AsyncOpenAI, model_name: str):
max_tokens=max_tokens,
n=n,
stream=True)
chunks: List[List[str]] = [[] for i in range(n)]
chunks: list[list[str]] = [[] for i in range(n)]
finish_reason_count = 0
async for chunk in stream:
index = chunk.choices[0].index
......
......@@ -14,7 +14,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer, models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")
MODEL_NAME = os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}""" # noqa: E501
......@@ -28,7 +28,7 @@ def server():
"bfloat16",
"--enforce-eager",
"--max-model-len",
"8192",
"512",
"--chat-template",
DUMMY_CHAT_TEMPLATE,
]
......@@ -61,10 +61,10 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096
assert len(embeddings.data[0].embedding) == 384
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 9
assert embeddings.usage.total_tokens == 9
assert embeddings.usage.prompt_tokens == 11
assert embeddings.usage.total_tokens == 11
# test using token IDs
input_tokens = [1, 1, 1, 1, 1]
......@@ -78,7 +78,7 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096
assert len(embeddings.data[0].embedding) == 384
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 5
assert embeddings.usage.total_tokens == 5
......@@ -87,7 +87,7 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
# test List[str]
# test list[str]
input_texts = [
"The cat sat on the mat.", "A feline was resting on a rug.",
"Stars twinkle brightly in the night sky."
......@@ -102,12 +102,12 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
assert embeddings.id is not None
assert len(embeddings.data) == 3
assert len(embeddings.data[0].embedding) == 4096
assert len(embeddings.data[0].embedding) == 384
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 32
assert embeddings.usage.total_tokens == 32
assert embeddings.usage.prompt_tokens == 33
assert embeddings.usage.total_tokens == 33
# test List[List[int]]
# test list[list[int]]
input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
[25, 32, 64, 77]]
embedding_response = await client.embeddings.create(
......@@ -120,7 +120,7 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
assert embeddings.id is not None
assert len(embeddings.data) == 4
assert len(embeddings.data[0].embedding) == 4096
assert len(embeddings.data[0].embedding) == 384
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 17
assert embeddings.usage.total_tokens == 17
......@@ -235,7 +235,7 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096
assert len(embeddings.data[0].embedding) == 384
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 10
assert embeddings.usage.total_tokens == 10
......@@ -253,7 +253,7 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
assert embeddings.id is not None
assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 4096
assert len(embeddings.data[0].embedding) == 384
assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 10
assert embeddings.usage.total_tokens == 10
......
......@@ -7,7 +7,7 @@ import pytest_asyncio
from ...utils import RemoteOpenAIServer, models_path_prefix
from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP
from vllm.utils import is_hip
from vllm.platforms import current_platform
MODEL_NAME = os.path.join(models_path_prefix, "facebook/bart-base")
......@@ -30,7 +30,7 @@ async def client(server):
yield async_client
@pytest.mark.skipif(is_hip(),
@pytest.mark.skipif(current_platform.is_rocm(),
reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP)
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
......
......@@ -228,9 +228,11 @@ EXPECTED_METRICS_V1 = [
"vllm:gpu_cache_usage_perc",
"vllm:gpu_prefix_cache_queries",
"vllm:gpu_prefix_cache_hits",
"vllm:num_preemptions_total",
"vllm:prompt_tokens_total",
"vllm:generation_tokens_total",
"vllm:iteration_tokens_total",
"vllm:cache_config_info",
"vllm:request_success_total",
"vllm:request_prompt_tokens_sum",
"vllm:request_prompt_tokens_bucket",
......@@ -238,6 +240,12 @@ EXPECTED_METRICS_V1 = [
"vllm:request_generation_tokens_sum",
"vllm:request_generation_tokens_bucket",
"vllm:request_generation_tokens_count",
"vllm:request_params_n_sum",
"vllm:request_params_n_bucket",
"vllm:request_params_n_count",
"vllm:request_params_max_tokens_sum",
"vllm:request_params_max_tokens_bucket",
"vllm:request_params_max_tokens_count",
"vllm:time_to_first_token_seconds_sum",
"vllm:time_to_first_token_seconds_bucket",
"vllm:time_to_first_token_seconds_count",
......@@ -281,7 +289,7 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
def test_metrics_exist_run_batch(use_v1: bool):
if use_v1:
pytest.skip("Skipping test on vllm V1")
input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}""" # noqa: E501
input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/multilingual-e5-small", "input": "You are a helpful assistant."}}""" # noqa: E501
#base_url = "0.0.0.0"
base_url = "localhost"
......@@ -302,7 +310,7 @@ def test_metrics_exist_run_batch(use_v1: bool):
"-o",
output_file.name,
"--model",
os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"),
os.path.join(models_path_prefix, "intfloat/multilingual-e5-small"),
"--enable-metrics",
"--url",
base_url,
......
......@@ -84,7 +84,7 @@ async def test_single_pooling(server: RemoteOpenAIServer, model_name: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
# test List[str]
# test list[str]
input_texts = [
"The cat sat on the mat.", "A feline was resting on a rug.",
"Stars twinkle brightly in the night sky."
......@@ -107,7 +107,7 @@ async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
assert poolings.usage.prompt_tokens == 25
assert poolings.usage.total_tokens == 25
# test List[List[int]]
# test list[list[int]]
input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
[25, 32, 64, 77]]
response = requests.post(
......
......@@ -8,17 +8,17 @@ from vllm.entrypoints.openai.protocol import RerankResponse
from ...utils import RemoteOpenAIServer
MODEL_NAME = "BAAI/bge-reranker-base"
DTYPE = "bfloat16"
@pytest.fixture(scope="module")
def server():
args = ["--enforce-eager", "--max-model-len", "100"]
args = ["--enforce-eager", "--max-model-len", "100", "--dtype", DTYPE]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_rerank_texts(server: RemoteOpenAIServer, model_name: str):
query = "What is the capital of France?"
......@@ -42,7 +42,6 @@ def test_rerank_texts(server: RemoteOpenAIServer, model_name: str):
assert rerank.results[1].relevance_score <= 0.01
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_top_n(server: RemoteOpenAIServer, model_name: str):
query = "What is the capital of France?"
......@@ -68,7 +67,6 @@ def test_top_n(server: RemoteOpenAIServer, model_name: str):
assert rerank.results[1].relevance_score <= 0.01
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str):
......
......@@ -17,18 +17,28 @@ from .test_completion import MODEL_NAME
@pytest.fixture(scope="module")
def server_with_return_tokens_as_token_ids_flag(
default_server_args): # noqa: F811
args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]
with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server:
yield remote_server
def server_fixture(request, default_server_args): # noqa: F811
use_server_flag = request.param
if use_server_flag:
args_with_flag = default_server_args + ["--return-tokens-as-token-ids"]
with RemoteOpenAIServer(MODEL_NAME, args_with_flag) as remote_server:
yield (remote_server, True)
else:
with RemoteOpenAIServer(MODEL_NAME,
default_server_args) as remote_server:
yield (remote_server, False)
@pytest.mark.asyncio
@pytest.mark.parametrize("server_fixture", [True, False], indirect=True)
async def test_completion_return_tokens_as_token_ids_completion(
server_with_return_tokens_as_token_ids_flag):
async with server_with_return_tokens_as_token_ids_flag.get_async_client(
) as client:
server_fixture):
server, use_server_flag = server_fixture
request_args = {}
if not use_server_flag:
request_args["return_tokens_as_token_ids"] = True
async with server.get_async_client() as client:
completion = await client.completions.create(
model=MODEL_NAME,
......@@ -39,7 +49,8 @@ async def test_completion_return_tokens_as_token_ids_completion(
echo=True,
temperature=0,
max_tokens=10,
logprobs=1)
logprobs=1,
extra_body=request_args)
text = completion.choices[0].text
token_strs = completion.choices[0].logprobs.tokens
......@@ -60,10 +71,14 @@ async def test_completion_return_tokens_as_token_ids_completion(
@pytest.mark.asyncio
async def test_chat_return_tokens_as_token_ids_completion(
server_with_return_tokens_as_token_ids_flag):
async with server_with_return_tokens_as_token_ids_flag.get_async_client(
) as client:
@pytest.mark.parametrize("server_fixture", [True, False], indirect=True)
async def test_chat_return_tokens_as_token_ids_completion(server_fixture):
server, use_server_flag = server_fixture
request_args = {}
if not use_server_flag:
request_args["return_tokens_as_token_ids"] = True
async with server.get_async_client() as client:
response = await client.chat.completions.create(
model=MODEL_NAME,
# Include Unicode characters to test for dividing a single
......@@ -78,7 +93,8 @@ async def test_chat_return_tokens_as_token_ids_completion(
}],
temperature=0,
max_tokens=8,
logprobs=True)
logprobs=True,
extra_body=request_args)
text = response.choices[0].message.content
tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment