Commit afd0da21 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.7.1' into v0.7.1-dev

parents 1a11f127 4f4d427a
# unit test for `examples/offline_inference/torchrun_example.py`
import random
import torch.distributed as dist
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import get_world_group
# Prompts generated on every rank.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Each rank draws its own `gpu_memory_utilization` and `swap_space`, so the
# checks further down can verify that all ranks still agree on the same
# kv cache configuration.
_gpu_memory_utilization = random.uniform(0.7, 0.9)
_swap_space = random.randint(1, 4)
llm = LLM(
    model="facebook/opt-125m",
    tensor_parallel_size=2,
    distributed_executor_backend="external_launcher",
    gpu_memory_utilization=_gpu_memory_utilization,
    swap_space=_swap_space,
)
outputs = llm.generate(prompts, sampling_params)

# Rank of this process within the world CPU group; rank 0 is the broadcast
# source for the consistency checks.
cpu_group = get_world_group().cpu_group
torch_rank = dist.get_rank(group=cpu_group)
def test_consistent_across_ranks(obj):
    """Assert that this rank's `obj` equals the value held by rank 0.

    Rank 0 broadcasts its `obj` over `cpu_group`; every other rank receives
    that value and compares it against its own local `obj`.
    """
    is_source = torch_rank == 0
    # Non-source ranks pass a placeholder slot that the broadcast fills in.
    payload = [obj] if is_source else [None]
    dist.broadcast_object_list(payload, src=0, group=cpu_group)
    if not is_source:
        assert payload[0] == obj
# The kv cache block counts must match on every rank.
_cache_config = llm.llm_engine.vllm_config.cache_config
test_consistent_across_ranks(_cache_config.num_cpu_blocks)
test_consistent_across_ranks(_cache_config.num_gpu_blocks)

# all ranks should have the same outputs
for output in outputs:
    prompt, generated_text = output.prompt, output.outputs[0].text
    test_consistent_across_ranks(prompt)
    test_consistent_across_ranks(generated_text)
    print(f"Rank {torch_rank}, Prompt: {prompt!r}, "
          f"Generated text: {generated_text!r}")
import asyncio import asyncio
import os import os
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import pytest import pytest
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine from vllm.engine.llm_engine import LLMEngine
from vllm.executor.gpu_executor import GPUExecutor, GPUExecutorAsync from vllm.executor.uniproc_executor import UniProcExecutor
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
import os import os
from ..utils import models_path_prefix from ..utils import models_path_prefix
...@@ -16,21 +17,20 @@ class Mock: ...@@ -16,21 +17,20 @@ class Mock:
... ...
class CustomGPUExecutor(GPUExecutor): class CustomUniExecutor(UniProcExecutor):
def execute_model(self, *args, **kwargs): def collective_rpc(self,
method: Union[str, Callable],
timeout: Optional[float] = None,
args: Tuple = (),
kwargs: Optional[Dict] = None) -> List[Any]:
# Drop marker to show that this was ran # Drop marker to show that this was ran
with open(".marker", "w"): with open(".marker", "w"):
... ...
return super().execute_model(*args, **kwargs) return super().collective_rpc(method, timeout, args, kwargs)
class CustomGPUExecutorAsync(GPUExecutorAsync): CustomUniExecutorAsync = CustomUniExecutor
async def execute_model_async(self, *args, **kwargs):
with open(".marker", "w"):
...
return await super().execute_model_async(*args, **kwargs)
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
...@@ -43,10 +43,6 @@ def test_custom_executor_type_checking(model): ...@@ -43,10 +43,6 @@ def test_custom_executor_type_checking(model):
engine_args = AsyncEngineArgs(model=model, engine_args = AsyncEngineArgs(model=model,
distributed_executor_backend=Mock) distributed_executor_backend=Mock)
AsyncLLMEngine.from_engine_args(engine_args) AsyncLLMEngine.from_engine_args(engine_args)
with pytest.raises(TypeError):
engine_args = AsyncEngineArgs(
model=model, distributed_executor_backend=CustomGPUExecutor)
AsyncLLMEngine.from_engine_args(engine_args)
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
...@@ -57,7 +53,9 @@ def test_custom_executor(model, tmp_path): ...@@ -57,7 +53,9 @@ def test_custom_executor(model, tmp_path):
assert not os.path.exists(".marker") assert not os.path.exists(".marker")
engine_args = EngineArgs( engine_args = EngineArgs(
model=model, distributed_executor_backend=CustomGPUExecutor) model=model,
distributed_executor_backend=CustomUniExecutor,
)
engine = LLMEngine.from_engine_args(engine_args) engine = LLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1) sampling_params = SamplingParams(max_tokens=1)
...@@ -77,7 +75,7 @@ def test_custom_executor_async(model, tmp_path): ...@@ -77,7 +75,7 @@ def test_custom_executor_async(model, tmp_path):
assert not os.path.exists(".marker") assert not os.path.exists(".marker")
engine_args = AsyncEngineArgs( engine_args = AsyncEngineArgs(
model=model, distributed_executor_backend=CustomGPUExecutorAsync) model=model, distributed_executor_backend=CustomUniExecutorAsync)
engine = AsyncLLMEngine.from_engine_args(engine_args) engine = AsyncLLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1) sampling_params = SamplingParams(max_tokens=1)
......
...@@ -6,16 +6,15 @@ from typing import Any, List, Tuple ...@@ -6,16 +6,15 @@ from typing import Any, List, Tuple
import pytest import pytest
from vllm.config import VllmConfig
from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
ResultHandler, WorkerMonitor) ResultHandler, WorkerMonitor)
from vllm.worker.worker_base import WorkerWrapperBase
class DummyWorker: class DummyWorkerWrapper(WorkerWrapperBase):
"""Dummy version of vllm.worker.worker.Worker""" """Dummy version of vllm.worker.worker.Worker"""
def __init__(self, rank: int):
self.rank = rank
def worker_method(self, worker_input: Any) -> Tuple[int, Any]: def worker_method(self, worker_input: Any) -> Tuple[int, Any]:
sleep(0.05) sleep(0.05)
...@@ -23,14 +22,15 @@ class DummyWorker: ...@@ -23,14 +22,15 @@ class DummyWorker:
# simulate error case # simulate error case
raise worker_input raise worker_input
return self.rank, input return self.rpc_rank, input
def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]: def _start_workers() -> Tuple[List[ProcessWorkerWrapper], WorkerMonitor]:
result_handler = ResultHandler() result_handler = ResultHandler()
vllm_config = VllmConfig()
workers = [ workers = [
ProcessWorkerWrapper(result_handler, partial(DummyWorker, rank=rank)) ProcessWorkerWrapper(result_handler, DummyWorkerWrapper, vllm_config,
for rank in range(8) rank) for rank in range(8)
] ]
worker_monitor = WorkerMonitor(workers, result_handler) worker_monitor = WorkerMonitor(workers, result_handler)
......
import pytest
from vllm import LLM
from ...utils import fork_new_process_for_each_test
@pytest.mark.parametrize("tp_size", [1, 2])
@pytest.mark.parametrize("backend", ["mp", "ray"])
@fork_new_process_for_each_test
def test_collective_rpc(tp_size, backend):
    """Check that `LLM.collective_rpc` returns one result per worker.

    The RPC target is given once by method name and once as a function
    object; both must yield `list(range(tp_size))`.
    """
    single_worker = tp_size == 1
    if single_worker and backend == "ray":
        pytest.skip("Skip duplicate test case")
    # With a single worker no distributed executor backend is requested.
    executor_backend = None if single_worker else backend

    # intentionally define the method and class in the test function,
    # to test if they can be serialized and sent to the workers
    def echo_rank(self):
        return self.rank

    from vllm.worker.worker import Worker

    class MyWorker(Worker):

        def echo_rank(self):
            return self.rank

    llm = LLM(
        model="meta-llama/Llama-3.2-1B-Instruct",
        enforce_eager=True,
        load_format="dummy",
        tensor_parallel_size=tp_size,
        distributed_executor_backend=executor_backend,
        worker_cls=MyWorker,
    )
    expected_ranks = list(range(tp_size))
    for method in ("echo_rank", echo_rank):
        assert llm.collective_rpc(method) == expected_ranks
...@@ -107,3 +107,10 @@ def test_multiple_pooling_params(llm: LLM): ...@@ -107,3 +107,10 @@ def test_multiple_pooling_params(llm: LLM):
# pooling_params is None, default params should be applied # pooling_params is None, default params should be applied
outputs = llm.encode(PROMPTS, pooling_params=None) outputs = llm.encode(PROMPTS, pooling_params=None)
assert len(PROMPTS) == len(outputs) assert len(PROMPTS) == len(outputs)
@pytest.mark.skip_global_cleanup
def test_right_side_truncation(llm: LLM):
    """Embeddings models should truncate the end of the prompt."""
    truncation_side = llm.get_tokenizer().truncation_side
    assert truncation_side == "right"
from typing import List
import pytest
from transformers import AutoTokenizer
from tests.entrypoints.openai.reasoning_parsers.utils import (
run_reasoning_extraction)
from vllm.entrypoints.openai.reasoning_parsers import (ReasoningParser,
ReasoningParserManager)
# Name under which the parser is looked up, and the two marker tokens that are
# added to the tokenizer in the test.
parser_name = "deepseek_r1"
start_token = "<think>"
end_token = "</think>"

# Each scenario maps a raw model "output" to the "reasoning_content" and
# "content" the parser is expected to extract from it.

# Reasoning section followed by regular content.
SIMPLE_REASONING = dict(
    output="<think>This is a reasoning section</think>This is the rest",
    reasoning_content="This is a reasoning section",
    content="This is the rest",
)

# Reasoning section only, nothing after the end token.
COMPLETE_REASONING = dict(
    output="<think>This is a reasoning section</think>",
    reasoning_content="This is a reasoning section",
    content=None,
)

# No marker tokens at all: everything is regular content.
NO_REASONING = dict(
    output="This is a reasoning section",
    reasoning_content=None,
    content="This is a reasoning section",
)

# Newlines inside both the reasoning section and the content.
MULTIPLE_LINES = dict(
    output="<think>This\nThat</think>This is the rest\nThat",
    reasoning_content="This\nThat",
    content="This is the rest\nThat",
)

# Empty reasoning section. The non-streaming expectation is an empty string,
# while the streaming expectation below is None.
SHORTEST_REASONING_NO_STREAMING = dict(
    output="<think></think>This is the rest",
    reasoning_content="",
    content="This is the rest",
)
SHORTEST_REASONING = dict(
    output="<think></think>This is the rest",
    reasoning_content=None,
    content="This is the rest",
)
# (streaming, scenario) pairs for `test_reasoning`.
#
# Every case has its own id, and only the streaming variants carry the
# `_streaming` suffix. Previously each streaming/non-streaming pair shared one
# id, so the non-streaming runs were reported as `*_streaming` and a single
# case could not be selected by name.
TEST_CASES = [
    pytest.param(
        False,
        SIMPLE_REASONING,
        id="simple",
    ),
    pytest.param(
        True,
        SIMPLE_REASONING,
        id="simple_streaming",
    ),
    pytest.param(
        False,
        COMPLETE_REASONING,
        id="complete",
    ),
    pytest.param(
        True,
        COMPLETE_REASONING,
        id="complete_streaming",
    ),
    pytest.param(
        False,
        NO_REASONING,
        id="no_reasoning",
    ),
    pytest.param(
        True,
        NO_REASONING,
        id="no_reasoning_streaming",
    ),
    pytest.param(
        False,
        MULTIPLE_LINES,
        id="multiple_lines",
    ),
    pytest.param(
        True,
        MULTIPLE_LINES,
        id="multiple_lines_streaming",
    ),
    pytest.param(
        True,
        SHORTEST_REASONING,
        id="shortest_streaming",
    ),
    pytest.param(
        False,
        SHORTEST_REASONING_NO_STREAMING,
        id="shortest",
    ),
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
):
    """Run the reasoning parser on one scenario and compare both outputs.

    `param_dict["output"]` is tokenized, handed to the parser (streamed or in
    one piece depending on `streaming`), and the extracted reasoning and
    content are compared with the scenario's expected values.
    """
    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
    tokenizer.add_tokens([start_token, end_token])

    # Convert every token back to its own text fragment; these fragments are
    # what the extraction helper receives.
    output_tokens: List[str] = []
    for token in tokenizer.tokenize(param_dict["output"]):
        output_tokens.append(tokenizer.convert_tokens_to_string([token]))

    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(tokenizer)

    reasoning, content = run_reasoning_extraction(
        parser,
        output_tokens,
        streaming=streaming,
    )

    assert reasoning == param_dict["reasoning_content"]
    assert content == param_dict["content"]
from typing import List, Optional, Tuple, Union
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
DeltaMessage)
from vllm.entrypoints.openai.reasoning_parsers import ReasoningParser
class StreamingReasoningReconstructor:
    """Accumulate streamed deltas back into full reasoning/content strings.

    Both accumulators start as ``None`` and stay ``None`` until the first
    delta carrying the corresponding field arrives.
    """

    def __init__(self):
        self.reasoning_content = None
        self.other_content = None

    def append_delta(self, delta: "DeltaMessage"):
        """Fold one streamed delta into the accumulated state.

        Args:
            delta: Object exposing ``content`` and ``reasoning_content``;
                at most one of them may be non-``None``.

        Raises:
            AssertionError: If both fields are set on the same delta.
        """
        # content and the reasoning content should not be present
        # at the same time
        assert delta.content is None or delta.reasoning_content is None, (
            "Both content and reasoning content are present in the "
            "delta message")
        if delta.content is not None:
            if self.other_content is None:
                self.other_content = delta.content
            else:
                self.other_content += delta.content
        elif delta.reasoning_content is not None:
            if self.reasoning_content is None:
                self.reasoning_content = delta.reasoning_content
            else:
                self.reasoning_content += delta.reasoning_content
        # A delta with neither field set carries no text and is ignored.
        # Without the explicit check above, such a delta arriving after some
        # reasoning text had accumulated would attempt `str += None`.
def run_reasoning_extraction(
    reasoning_parser: ReasoningParser,
    model_output: List[str],
    request: Union[ChatCompletionRequest, None] = None,
    streaming: bool = False,
) -> Tuple[Optional[str], Optional[str]]:
    """Extract `(reasoning, content)` from `model_output`.

    Dispatches to the streaming or non-streaming helper depending on
    `streaming`. In streaming mode an empty accumulated content string is
    reported as `None`.
    """
    if not streaming:
        reasoning, content = run_reasoning_extraction_nonstreaming(
            reasoning_parser, model_output, request)
        return reasoning, content

    reconstructor = run_reasoning_extraction_streaming(
        reasoning_parser,
        model_output,
        request,
    )
    streamed_content = reconstructor.other_content or None
    return reconstructor.reasoning_content, streamed_content
def run_reasoning_extraction_nonstreaming(
    reasoning_parser: ReasoningParser,
    model_output: List[str],
    request: Union[ChatCompletionRequest, None] = None,
) -> Tuple[Optional[str], Optional[str]]:
    """Join the output fragments and run the parser once on the full text.

    A placeholder `ChatCompletionRequest` is built when no request is given.
    """
    if not request:
        request = ChatCompletionRequest(messages=[], model="test-model")
    full_text = ''.join(model_output)
    return reasoning_parser.extract_reasoning_content(model_output=full_text,
                                                      request=request)
def run_reasoning_extraction_streaming(
    reasoning_parser: ReasoningParser,
    model_deltas: List[str],
    request: Union[ChatCompletionRequest, None] = None,
) -> StreamingReasoningReconstructor:
    """Feed the deltas to the parser one by one and collect what it emits.

    For every delta the parser receives the text and token ids seen so far,
    the same state including the delta, and the delta itself. Each non-`None`
    message it returns is appended to the returned reconstructor.
    """
    request = request or ChatCompletionRequest(messages=[], model="test-model")
    reconstructor = StreamingReasoningReconstructor()

    text_so_far = ""
    ids_so_far: List[int] = []
    for delta in model_deltas:
        # Map the delta text to token ids, dropping pieces that are not in
        # the parser's vocabulary.
        delta_ids = []
        for piece in reasoning_parser.model_tokenizer.tokenize(delta):
            if piece in reasoning_parser.vocab:
                delta_ids.append(reasoning_parser.vocab.get(piece))

        text_with_delta = text_so_far + delta
        ids_with_delta = ids_so_far + delta_ids

        message = reasoning_parser.extract_reasoning_content_streaming(
            text_so_far,
            text_with_delta,
            delta,
            ids_so_far,
            ids_with_delta,
            delta_ids,
        )
        if message is not None:
            reconstructor.append_delta(message)

        text_so_far, ids_so_far = text_with_delta, ids_with_delta

    return reconstructor
...@@ -4,7 +4,7 @@ import pytest ...@@ -4,7 +4,7 @@ import pytest
from vllm.entrypoints.openai.cli_args import (make_arg_parser, from vllm.entrypoints.openai.cli_args import (make_arg_parser,
validate_parsed_serve_args) validate_parsed_serve_args)
from vllm.entrypoints.openai.serving_engine import LoRAModulePath from vllm.entrypoints.openai.serving_models import LoRAModulePath
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
from ...utils import VLLM_PATH from ...utils import VLLM_PATH
...@@ -116,6 +116,35 @@ def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser): ...@@ -116,6 +116,35 @@ def test_enable_auto_choice_passes_with_tool_call_parser(serve_parser):
validate_parsed_serve_args(args) validate_parsed_serve_args(args)
def test_enable_auto_choice_fails_with_enable_reasoning(serve_parser):
    """Ensure validation fails if reasoning is enabled with auto tool choice"""
    cli_flags = ["--enable-auto-tool-choice", "--enable-reasoning"]
    args = serve_parser.parse_args(args=cli_flags)
    with pytest.raises(TypeError):
        validate_parsed_serve_args(args)
def test_enable_reasoning_passes_with_reasoning_parser(serve_parser):
    """Ensure validation passes if reasoning is enabled
    with a reasoning parser"""
    cli_flags = ["--enable-reasoning", "--reasoning-parser", "deepseek_r1"]
    args = serve_parser.parse_args(args=cli_flags)
    # Must not raise.
    validate_parsed_serve_args(args)
def test_enable_reasoning_fails_without_reasoning_parser(serve_parser):
    """Ensure validation fails if reasoning is enabled
    without a reasoning parser"""
    cli_flags = ["--enable-reasoning"]
    args = serve_parser.parse_args(args=cli_flags)
    with pytest.raises(TypeError):
        validate_parsed_serve_args(args)
def test_chat_template_validation_for_happy_paths(serve_parser): def test_chat_template_validation_for_happy_paths(serve_parser):
"""Ensure validation passes if the chat template exists""" """Ensure validation passes if the chat template exists"""
args = serve_parser.parse_args( args = serve_parser.parse_args(
......
...@@ -29,6 +29,8 @@ PA_NAME = os.path.join(models_path_prefix, "swapnilbp/llama_tweet_ptune") ...@@ -29,6 +29,8 @@ PA_NAME = os.path.join(models_path_prefix, "swapnilbp/llama_tweet_ptune")
# need to change to match the prompt adapter # need to change to match the prompt adapter
PA_NUM_VIRTUAL_TOKENS = 8 PA_NUM_VIRTUAL_TOKENS = 8
# Backend names that the guided-decoding completion tests in this file are
# parametrized over.
GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def zephyr_lora_files(): def zephyr_lora_files():
...@@ -638,8 +640,7 @@ async def test_allowed_token_ids(client: openai.AsyncOpenAI): ...@@ -638,8 +640,7 @@ async def test_allowed_token_ids(client: openai.AsyncOpenAI):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
["outlines", "lm-format-enforcer"])
async def test_guided_json_completion(client: openai.AsyncOpenAI, async def test_guided_json_completion(client: openai.AsyncOpenAI,
guided_decoding_backend: str, guided_decoding_backend: str,
sample_json_schema): sample_json_schema):
...@@ -661,8 +662,7 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI, ...@@ -661,8 +662,7 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
["outlines", "lm-format-enforcer"])
async def test_guided_regex_completion(client: openai.AsyncOpenAI, async def test_guided_regex_completion(client: openai.AsyncOpenAI,
guided_decoding_backend: str, guided_decoding_backend: str,
sample_regex): sample_regex):
...@@ -683,8 +683,7 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI, ...@@ -683,8 +683,7 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
["outlines", "lm-format-enforcer"])
async def test_guided_choice_completion(client: openai.AsyncOpenAI, async def test_guided_choice_completion(client: openai.AsyncOpenAI,
guided_decoding_backend: str, guided_decoding_backend: str,
sample_guided_choice): sample_guided_choice):
...@@ -764,8 +763,7 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI, ...@@ -764,8 +763,7 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("guided_decoding_backend", @pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
["outlines", "lm-format-enforcer"])
async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, async def test_guided_decoding_type_error(client: openai.AsyncOpenAI,
guided_decoding_backend: str, guided_decoding_backend: str,
sample_json_schema, sample_regex): sample_json_schema, sample_regex):
......
import asyncio
import json
import shutil
from contextlib import suppress
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora"

# (test name, adapter_config.json override, expected error fragment) triples:
# each override is applied to a copy of the adapter config, and loading that
# copy is expected to be rejected with a message matching the fragment.
BADREQUEST_CASES = [
    (
        "test_rank",
        dict(r=1024),
        "is greater than max_lora_rank",
    ),
    (
        "test_bias",
        dict(bias="all"),
        "Adapter bias cannot be used without bias_enabled",
    ),
    (
        "test_dora",
        dict(use_dora=True),
        "does not yet support DoRA",
    ),
    (
        "test_modules_to_save",
        dict(modules_to_save=["lm_head"]),
        "only supports modules_to_save being None",
    ),
]
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Fetch the `LORA_NAME` adapter once per module via `snapshot_download`
    and return whatever that call returns (used as the adapter path)."""
    lora_snapshot = snapshot_download(repo_id=LORA_NAME)
    return lora_snapshot
@pytest.fixture(scope="module")
def server_with_lora_modules_json(zephyr_lora_files):
    """Start a server with two LoRA modules registered in JSON format.

    Both modules point at the same adapter files and differ only by name.
    Runtime LoRA updating is switched on through the environment.
    """
    # Define the json format LoRA module configurations
    lora_modules = [{
        "name": module_name,
        "path": zephyr_lora_files,
        "base_model_name": MODEL_NAME
    } for module_name in ("zephyr-lora", "zephyr-lora2")]

    args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "8192",
        "--enforce-eager",
        # lora config below
        "--enable-lora",
        "--lora-modules",
        *map(json.dumps, lora_modules),
        "--max-lora-rank",
        "64",
        "--max-cpu-loras",
        "2",
        "--max-num-seqs",
        "64",
    ]

    # Enable the /v1/load_lora_adapter endpoint
    envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"}

    with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server:
        yield remote_server
@pytest_asyncio.fixture
async def client(server_with_lora_modules_json):
    """Yield an async client obtained from the LoRA-enabled server fixture."""
    server = server_with_lora_modules_json
    async with server.get_async_client() as async_client:
        yield async_client
@pytest.mark.asyncio
async def test_static_lora_lineage(client: openai.AsyncOpenAI,
                                   zephyr_lora_files):
    """Check root/parent/id of the base model and the statically
    registered LoRA modules in the model listing."""
    listing = await client.models.list()
    entries = listing.data
    served_model, lora_models = entries[0], entries[1:]

    # The first entry is the base model: its own root, no parent.
    assert served_model.id == MODEL_NAME
    assert served_model.root == MODEL_NAME
    assert served_model.parent is None

    # Every LoRA entry is rooted at the adapter files, with the base model
    # as its parent.
    assert all(entry.root == zephyr_lora_files for entry in lora_models)
    assert all(entry.parent == MODEL_NAME for entry in lora_models)

    first_lora, second_lora = lora_models[0], lora_models[1]
    assert first_lora.id == "zephyr-lora"
    assert second_lora.id == "zephyr-lora2"
@pytest.mark.asyncio
async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI,
                                    zephyr_lora_files):
    """Load an adapter at runtime and check its entry in the model listing."""
    load_request = {
        "lora_name": "zephyr-lora-3",
        "lora_path": zephyr_lora_files
    }
    response = await client.post("load_lora_adapter",
                                 cast_to=str,
                                 body=load_request)
    # Ensure adapter loads before querying /models
    assert "success" in response

    listing = await client.models.list()
    # The test expects the adapter that was just loaded to be listed last.
    dynamic_lora_model = listing.data[-1]
    assert dynamic_lora_model.root == zephyr_lora_files
    assert dynamic_lora_model.parent == MODEL_NAME
    assert dynamic_lora_model.id == "zephyr-lora-3"
@pytest.mark.asyncio
async def test_dynamic_lora_not_found(client: openai.AsyncOpenAI):
    """Loading an adapter from a bogus path must raise `NotFoundError`."""
    load_request = {"lora_name": "notfound", "lora_path": "/not/an/adapter"}
    with pytest.raises(openai.NotFoundError):
        await client.post("load_lora_adapter",
                          cast_to=str,
                          body=load_request)
@pytest.mark.asyncio
async def test_dynamic_lora_invalid_files(client: openai.AsyncOpenAI,
                                          tmp_path):
    """An adapter directory whose config is not JSON must raise
    `BadRequestError` on load."""
    invalid_files = tmp_path / "invalid_files"
    invalid_files.mkdir()
    broken_config = invalid_files / "adapter_config.json"
    broken_config.write_text("this is not json")

    load_request = {
        "lora_name": "invalid-json",
        "lora_path": str(invalid_files)
    }
    with pytest.raises(openai.BadRequestError):
        await client.post("load_lora_adapter",
                          cast_to=str,
                          body=load_request)
@pytest.mark.asyncio
@pytest.mark.parametrize("test_name,config_change,expected_error",
                         BADREQUEST_CASES)
async def test_dynamic_lora_badrequests(client: openai.AsyncOpenAI, tmp_path,
                                        zephyr_lora_files, test_name: str,
                                        config_change: dict,
                                        expected_error: str):
    """Copy the adapter, apply `config_change` to its config, and check
    that loading it raises `BadRequestError` matching `expected_error`."""
    # Work on a private copy of the adapter files
    test_dir = tmp_path / test_name
    shutil.copytree(zephyr_lora_files, test_dir)

    # Rewrite the configuration with the override applied
    config_path = test_dir / "adapter_config.json"
    adapter_config = json.loads(config_path.read_text())
    adapter_config.update(config_change)
    config_path.write_text(json.dumps(adapter_config))

    # Loading the modified adapter must be rejected
    load_request = {"lora_name": test_name, "lora_path": str(test_dir)}
    with pytest.raises(openai.BadRequestError, match=expected_error):
        await client.post("load_lora_adapter",
                          cast_to=str,
                          body=load_request)
@pytest.mark.asyncio
async def test_multiple_lora_adapters(client: openai.AsyncOpenAI, tmp_path,
                                      zephyr_lora_files):
    """Validate that many loras can be dynamically registered and inferenced
    with concurrently"""

    # This test file configures the server with --max-cpu-loras=2 and this test
    # will concurrently load 10 adapters, so it should flex the LRU cache
    async def load_and_run_adapter(adapter_name: str):
        await client.post("load_lora_adapter",
                          cast_to=str,
                          body={
                              "lora_name": adapter_name,
                              "lora_path": str(zephyr_lora_files)
                          })
        for _ in range(3):
            await client.completions.create(
                model=adapter_name,
                prompt=["Hello there", "Foo bar bazz buzz"],
                max_tokens=5,
            )

    lora_tasks = [
        asyncio.create_task(load_and_run_adapter(f"adapter_{i}"))
        for i in range(10)
    ]

    # `asyncio.wait` hands back Task objects, so checking those with
    # `isinstance(..., Exception)` can never fail and a crashed task would go
    # unnoticed. `gather(..., return_exceptions=True)` yields each task's
    # result or the exception it raised, which makes the check meaningful.
    results = await asyncio.gather(*lora_tasks, return_exceptions=True)

    for r in results:
        assert not isinstance(r, BaseException), f"Got exception {r}"
@pytest.mark.asyncio
async def test_loading_invalid_adapters_does_not_break_others(
        client: openai.AsyncOpenAI, tmp_path, zephyr_lora_files):
    """Failed adapter loads must not disturb requests to a working adapter.

    While a background task keeps sending completion requests to
    "zephyr-lora", a series of loads that are expected to fail is issued.
    Afterwards every background result must be a non-exception, and a fresh
    valid adapter must still load and serve a request.
    """
    invalid_files = tmp_path / "invalid_files"
    invalid_files.mkdir()
    (invalid_files / "adapter_config.json").write_text("this is not json")

    stop_good_requests_event = asyncio.Event()

    async def run_good_requests(client):
        # Run completions requests until event set, recording each response
        # or the exception raised in its place
        outcomes = []
        while not stop_good_requests_event.is_set():
            try:
                outcomes.append(await client.completions.create(
                    model="zephyr-lora",
                    prompt=["Hello there", "Foo bar bazz buzz"],
                    max_tokens=5,
                ))
            except Exception as e:
                outcomes.append(e)
        return outcomes

    async def load_adapter(lora_name, lora_path):
        await client.post("load_lora_adapter",
                          cast_to=str,
                          body={
                              "lora_name": lora_name,
                              "lora_path": lora_path
                          })

    # Create task to run good requests
    good_task = asyncio.create_task(run_good_requests(client))

    # Run a bunch of bad adapter loads
    for _ in range(25):
        with suppress(openai.NotFoundError):
            await load_adapter("notfound", "/not/an/adapter")
    for _ in range(25):
        with suppress(openai.BadRequestError):
            await load_adapter("invalid", str(invalid_files))

    # Ensure all the running requests with lora adapters succeeded
    stop_good_requests_event.set()
    for r in await good_task:
        assert not isinstance(r, Exception), f"Got exception {r}"

    # Ensure we can load another adapter and run it
    await load_adapter("valid", zephyr_lora_files)
    await client.completions.create(
        model="valid",
        prompt=["Hello there", "Foo bar bazz buzz"],
        max_tokens=5,
    )
...@@ -17,6 +17,24 @@ from ...utils import RemoteOpenAIServer, models_path_prefix ...@@ -17,6 +17,24 @@ from ...utils import RemoteOpenAIServer, models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0") MODEL_NAME = os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
@pytest.fixture(scope="module", params=[True, False])
def use_v1(request):
    """Module-scoped variant of run_with_both_engines.

    Use this fixture to run a test with both v0 and v1, and also to
    conditionalize the test logic, e.g.::

        def test_metrics_exist(use_v1, server, client):
            ...
            expected = EXPECTED_V1_METRICS if use_v1 else EXPECTED_METRICS
            for metric in expected:
                assert metric in response.text

    @skip_v1 wouldn't work here because this is a module-level fixture -
    per-function decorators would have no effect.
    """
    v1_enabled = request.param
    return v1_enabled
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def default_server_args(): def default_server_args():
return [ return [
...@@ -37,10 +55,12 @@ def default_server_args(): ...@@ -37,10 +55,12 @@ def default_server_args():
"--enable-chunked-prefill", "--enable-chunked-prefill",
"--disable-frontend-multiprocessing", "--disable-frontend-multiprocessing",
]) ])
def server(default_server_args, request): def server(use_v1, default_server_args, request):
if request.param: if request.param:
default_server_args.append(request.param) default_server_args.append(request.param)
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: env_dict = dict(VLLM_USE_V1='1' if use_v1 else '0')
with RemoteOpenAIServer(MODEL_NAME, default_server_args,
env_dict=env_dict) as remote_server:
yield remote_server yield remote_server
...@@ -85,7 +105,7 @@ EXPECTED_VALUES = { ...@@ -85,7 +105,7 @@ EXPECTED_VALUES = {
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_metrics_counts(server: RemoteOpenAIServer, async def test_metrics_counts(server: RemoteOpenAIServer,
client: openai.AsyncClient): client: openai.AsyncClient, use_v1: bool):
for _ in range(_NUM_REQUESTS): for _ in range(_NUM_REQUESTS):
# sending a request triggers the metrics to be logged. # sending a request triggers the metrics to be logged.
await client.completions.create( await client.completions.create(
...@@ -99,6 +119,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer, ...@@ -99,6 +119,9 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
# Loop over all expected metric_families # Loop over all expected metric_families
for metric_family, suffix_values_list in EXPECTED_VALUES.items(): for metric_family, suffix_values_list in EXPECTED_VALUES.items():
if use_v1 and metric_family not in EXPECTED_METRICS_V1:
continue
found_metric = False found_metric = False
# Check to see if the metric_family is found in the prom endpoint. # Check to see if the metric_family is found in the prom endpoint.
...@@ -175,10 +198,30 @@ EXPECTED_METRICS = [ ...@@ -175,10 +198,30 @@ EXPECTED_METRICS = [
"swap_space_bytes", "swap_space_bytes",
] ]
# Metric names checked when the tests run with `use_v1` set: each one must
# appear in the /metrics response text, and only metric families listed here
# are checked in the per-family value test.
EXPECTED_METRICS_V1 = [
    "vllm:num_requests_running",
    "vllm:num_requests_waiting",
    "vllm:gpu_cache_usage_perc",
    "vllm:prompt_tokens_total",
    "vllm:generation_tokens_total",
    "vllm:request_prompt_tokens_sum",
    "vllm:request_prompt_tokens_bucket",
    "vllm:request_prompt_tokens_count",
    "vllm:request_generation_tokens_sum",
    "vllm:request_generation_tokens_bucket",
    "vllm:request_generation_tokens_count",
    "vllm:time_to_first_token_seconds_sum",
    "vllm:time_to_first_token_seconds_bucket",
    "vllm:time_to_first_token_seconds_count",
    "vllm:time_per_output_token_seconds_sum",
    "vllm:time_per_output_token_seconds_bucket",
    "vllm:time_per_output_token_seconds_count",
]
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_metrics_exist(server: RemoteOpenAIServer, async def test_metrics_exist(server: RemoteOpenAIServer,
client: openai.AsyncClient): client: openai.AsyncClient, use_v1: bool):
# sending a request triggers the metrics to be logged. # sending a request triggers the metrics to be logged.
await client.completions.create(model=MODEL_NAME, await client.completions.create(model=MODEL_NAME,
prompt="Hello, my name is", prompt="Hello, my name is",
...@@ -188,11 +231,13 @@ async def test_metrics_exist(server: RemoteOpenAIServer, ...@@ -188,11 +231,13 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
response = requests.get(server.url_for("metrics")) response = requests.get(server.url_for("metrics"))
assert response.status_code == HTTPStatus.OK assert response.status_code == HTTPStatus.OK
for metric in EXPECTED_METRICS: for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
assert metric in response.text assert metric in response.text
def test_metrics_exist_run_batch(): def test_metrics_exist_run_batch(use_v1: bool):
if use_v1:
pytest.skip("Skipping test on vllm V1")
input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}""" # noqa: E501 input_batch = """{"custom_id": "request-0", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}""" # noqa: E501
#base_url = "0.0.0.0" #base_url = "0.0.0.0"
......
import pytest
import requests
from vllm.entrypoints.openai.protocol import RerankResponse
from ...utils import RemoteOpenAIServer
# Model launched by the `server` fixture and named in every rerank request.
MODEL_NAME = "BAAI/bge-reranker-base"
@pytest.fixture(scope="module")
def server():
    """Start one server for `MODEL_NAME` per module and yield its handle."""
    cli_args = ["--enforce-eager", "--max-model-len", "100"]
    with RemoteOpenAIServer(MODEL_NAME, cli_args) as remote_server:
        yield remote_server
# NOTE: no `@pytest.mark.asyncio` here - this is a plain synchronous test that
# uses `requests`, so the asyncio marker had no effect and only triggers a
# pytest-asyncio warning.
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_rerank_texts(server: RemoteOpenAIServer, model_name: str):
    """Rerank two documents and check that the relevant one scores highest.

    The response must validate as a `RerankResponse` with two results: the
    first scored at least 0.9 and the second at most 0.01.
    """
    query = "What is the capital of France?"
    documents = [
        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
    ]

    rerank_response = requests.post(server.url_for("rerank"),
                                    json={
                                        "model": model_name,
                                        "query": query,
                                        "documents": documents,
                                    })
    rerank_response.raise_for_status()
    rerank = RerankResponse.model_validate(rerank_response.json())

    assert rerank.id is not None
    assert rerank.results is not None
    assert len(rerank.results) == 2
    assert rerank.results[0].relevance_score >= 0.9
    assert rerank.results[1].relevance_score <= 0.01
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_top_n(server: RemoteOpenAIServer, model_name: str):
    """Check that ``top_n`` limits the number of rerank results.

    Sends three documents with ``top_n`` set to 2 and asserts that only two
    results are returned, the first scoring at least 0.9 and the second at
    most 0.01.

    The test is synchronous (it uses ``requests``), so it deliberately does
    not carry the ``asyncio`` mark.
    """
    query = "What is the capital of France?"
    documents = [
        "The capital of Brazil is Brasilia.",
        "The capital of France is Paris.", "Cross-encoder models are neat"
    ]

    rerank_response = requests.post(server.url_for("rerank"),
                                    json={
                                        "model": model_name,
                                        "query": query,
                                        "documents": documents,
                                        "top_n": 2
                                    })
    # Fail fast with the HTTP error instead of a confusing validation error.
    rerank_response.raise_for_status()
    rerank = RerankResponse.model_validate(rerank_response.json())

    assert rerank.id is not None
    assert rerank.results is not None
    # Three documents were submitted, but only the requested two come back.
    assert len(rerank.results) == 2
    assert rerank.results[0].relevance_score >= 0.9
    assert rerank.results[1].relevance_score <= 0.01
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str):
    """Check that an over-long rerank query is rejected with HTTP 400.

    The query sentence is repeated 100 times before being posted to the
    ``rerank`` endpoint; the response must have status 400 and mention that
    the input length should be reduced.

    The test is synchronous (it uses ``requests``), so it deliberately does
    not carry the ``asyncio`` mark.
    """
    query = "What is the capital of France?" * 100
    documents = [
        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
    ]

    rerank_response = requests.post(server.url_for("rerank"),
                                    json={
                                        "model": model_name,
                                        "query": query,
                                        "documents": documents
                                    })
    assert rerank_response.status_code == 400
    # Assert just a small fragment of the response
    assert ("Please reduce the length of the input."
            in rerank_response.text)
\ No newline at end of file
import json
import subprocess import subprocess
import sys import sys
import os import os
...@@ -39,6 +40,9 @@ INPUT_EMBEDDING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": " ...@@ -39,6 +40,9 @@ INPUT_EMBEDDING_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "
{"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "Hello world!"}} {"custom_id": "request-3", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "Hello world!"}}
{"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}""" {"custom_id": "request-4", "method": "POST", "url": "/v1/embeddings", "body": {"model": "NonExistModel", "input": "Hello world!"}}"""
INPUT_SCORE_BATCH = """{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}"""
def test_empty_file(): def test_empty_file():
with tempfile.NamedTemporaryFile( with tempfile.NamedTemporaryFile(
...@@ -120,3 +124,36 @@ def test_embeddings(): ...@@ -120,3 +124,36 @@ def test_embeddings():
# Ensure that the output format conforms to the openai api. # Ensure that the output format conforms to the openai api.
# Validation should throw if the schema is wrong. # Validation should throw if the schema is wrong.
BatchRequestOutput.model_validate_json(line) BatchRequestOutput.model_validate_json(line)
def test_score():
    """Run the batch entrypoint on score requests and validate its output.

    Writes ``INPUT_SCORE_BATCH`` to a temporary input file, runs
    ``vllm.entrypoints.openai.run_batch`` as a subprocess with a temporary
    output file, and then checks that the process exited with code 0 and
    that every output line validates as a ``BatchRequestOutput`` whose
    ``error`` field is ``None``.
    """
    with tempfile.NamedTemporaryFile(
            "w") as input_file, tempfile.NamedTemporaryFile(
                "r") as output_file:
        input_file.write(INPUT_SCORE_BATCH)
        # Make sure the child process sees the full input on disk.
        input_file.flush()

        # subprocess.run waits for the child to exit and releases its
        # resources, so no separate communicate()/wait() pair is needed.
        # check=False: the return code is asserted explicitly below so the
        # failure message includes the whole process description.
        proc = subprocess.run([
            sys.executable,
            "-m",
            "vllm.entrypoints.openai.run_batch",
            "-i",
            input_file.name,
            "-o",
            output_file.name,
            "--model",
            "BAAI/bge-reranker-v2-m3",
        ],
                              check=False)
        assert proc.returncode == 0, f"{proc=}"

        contents = output_file.read()
        for line in contents.strip().split("\n"):
            # Ensure that the output format conforms to the openai api.
            # Validation should throw if the schema is wrong.
            BatchRequestOutput.model_validate_json(line)

            # Ensure that there is no error in the response.
            line_dict = json.loads(line)
            assert isinstance(line_dict, dict)
            assert line_dict["error"] is None
...@@ -11,9 +11,7 @@ MODEL_NAME = os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3") ...@@ -11,9 +11,7 @@ MODEL_NAME = os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3")
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def server(): def server():
args = [ args = ["--enforce-eager", "--max-model-len", "100"]
"--enforce-eager",
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server yield remote_server
...@@ -21,8 +19,7 @@ def server(): ...@@ -21,8 +19,7 @@ def server():
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_text_1_str_text_2_list(server: RemoteOpenAIServer, def test_text_1_str_text_2_list(server: RemoteOpenAIServer, model_name: str):
model_name: str):
text_1 = "What is the capital of France?" text_1 = "What is the capital of France?"
text_2 = [ text_2 = [
"The capital of Brazil is Brasilia.", "The capital of France is Paris." "The capital of Brazil is Brasilia.", "The capital of France is Paris."
...@@ -46,8 +43,7 @@ async def test_text_1_str_text_2_list(server: RemoteOpenAIServer, ...@@ -46,8 +43,7 @@ async def test_text_1_str_text_2_list(server: RemoteOpenAIServer,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_text_1_list_text_2_list(server: RemoteOpenAIServer, def test_text_1_list_text_2_list(server: RemoteOpenAIServer, model_name: str):
model_name: str):
text_1 = [ text_1 = [
"What is the capital of the United States?", "What is the capital of the United States?",
"What is the capital of France?" "What is the capital of France?"
...@@ -74,8 +70,7 @@ async def test_text_1_list_text_2_list(server: RemoteOpenAIServer, ...@@ -74,8 +70,7 @@ async def test_text_1_list_text_2_list(server: RemoteOpenAIServer,
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_text_1_str_text_2_str(server: RemoteOpenAIServer, def test_text_1_str_text_2_str(server: RemoteOpenAIServer, model_name: str):
model_name: str):
text_1 = "What is the capital of France?" text_1 = "What is the capital of France?"
text_2 = "The capital of France is Paris." text_2 = "The capital of France is Paris."
...@@ -92,3 +87,36 @@ async def test_text_1_str_text_2_str(server: RemoteOpenAIServer, ...@@ -92,3 +87,36 @@ async def test_text_1_str_text_2_str(server: RemoteOpenAIServer,
assert score.data is not None assert score.data is not None
assert len(score.data) == 1 assert len(score.data) == 1
assert score.data[0].score >= 0.9 assert score.data[0].score >= 0.9
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_score_max_model_len(server: RemoteOpenAIServer, model_name: str):
    """Check that over-long score inputs and truncation sizes are rejected.

    Two requests are sent to the ``score`` endpoint with ``text_1`` repeated
    20 times:

    1. Without truncation: expects HTTP 400 and a message asking to reduce
       the input length.
    2. With ``truncate_prompt_tokens`` set to 101: expects HTTP 400 and a
       message asking for a smaller truncation size.

    The test is synchronous (it uses ``requests``), so it deliberately does
    not carry the ``asyncio`` mark.
    """
    text_1 = "What is the capital of France?" * 20
    text_2 = [
        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
    ]

    score_response = requests.post(server.url_for("score"),
                                   json={
                                       "model": model_name,
                                       "text_1": text_1,
                                       "text_2": text_2,
                                   })
    assert score_response.status_code == 400
    # Assert just a small fragment of the response
    assert ("Please reduce the length of the input."
            in score_response.text)

    # Test truncation
    # NOTE(review): 101 is presumably chosen to be one more than the
    # server's --max-model-len of 100 - confirm against the fixture.
    score_response = requests.post(server.url_for("score"),
                                   json={
                                       "model": model_name,
                                       "text_1": text_1,
                                       "text_2": text_2,
                                       "truncate_prompt_tokens": 101
                                   })
    assert score_response.status_code == 400
    assert ("Please, select a smaller truncation size."
            in score_response.text)
...@@ -8,7 +8,8 @@ from vllm.config import MultiModalConfig ...@@ -8,7 +8,8 @@ from vllm.config import MultiModalConfig
from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.engine.multiprocessing.client import MQLLMEngineClient
from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_engine import BaseModelPath from vllm.entrypoints.openai.serving_models import (BaseModelPath,
OpenAIServingModels)
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import models_path_prefix from ...utils import models_path_prefix
...@@ -34,6 +35,8 @@ class MockModelConfig: ...@@ -34,6 +35,8 @@ class MockModelConfig:
hf_config = MockHFConfig() hf_config = MockHFConfig()
logits_processor_pattern = None logits_processor_pattern = None
diff_sampling_param: Optional[dict] = None diff_sampling_param: Optional[dict] = None
allowed_local_media_path: str = ""
encoder_config = None
def get_diff_sampling_param(self): def get_diff_sampling_param(self):
return self.diff_sampling_param or {} return self.diff_sampling_param or {}
...@@ -50,14 +53,13 @@ async def _async_serving_chat_init(): ...@@ -50,14 +53,13 @@ async def _async_serving_chat_init():
engine = MockEngine() engine = MockEngine()
model_config = await engine.get_model_config() model_config = await engine.get_model_config()
models = OpenAIServingModels(engine, model_config, BASE_MODEL_PATHS)
serving_completion = OpenAIServingChat(engine, serving_completion = OpenAIServingChat(engine,
model_config, model_config,
BASE_MODEL_PATHS, models,
response_role="assistant", response_role="assistant",
chat_template=CHAT_TEMPLATE, chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto", chat_template_content_format="auto",
lora_modules=None,
prompt_adapters=None,
request_logger=None) request_logger=None)
return serving_completion return serving_completion
...@@ -72,14 +74,15 @@ def test_serving_chat_should_set_correct_max_tokens(): ...@@ -72,14 +74,15 @@ def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False mock_engine.errored = False
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=MockModelConfig())
serving_chat = OpenAIServingChat(mock_engine, serving_chat = OpenAIServingChat(mock_engine,
MockModelConfig(), MockModelConfig(),
BASE_MODEL_PATHS, models,
response_role="assistant", response_role="assistant",
chat_template=CHAT_TEMPLATE, chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto", chat_template_content_format="auto",
lora_modules=None,
prompt_adapters=None,
request_logger=None) request_logger=None)
req = ChatCompletionRequest( req = ChatCompletionRequest(
model=MODEL_NAME, model=MODEL_NAME,
...@@ -101,6 +104,116 @@ def test_serving_chat_should_set_correct_max_tokens(): ...@@ -101,6 +104,116 @@ def test_serving_chat_should_set_correct_max_tokens():
assert mock_engine.generate.call_args.args[1].max_tokens == 10 assert mock_engine.generate.call_args.args[1].max_tokens == 10
# Setting server's max_tokens in the generation_config.json
# lower than context_window - prompt_tokens
mock_model_config = MockModelConfig()
mock_model_config.diff_sampling_param = {
"max_tokens": 10 # Setting server-side max_tokens limit
}
# Reinitialize the engine with new settings
mock_engine = MagicMock(spec=MQLLMEngineClient)
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
# Initialize the serving chat
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=mock_model_config)
serving_chat = OpenAIServingChat(mock_engine,
mock_model_config,
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None)
# Test Case 1: No max_tokens specified in request
req = ChatCompletionRequest(
model=MODEL_NAME,
messages=[{
"role": "user",
"content": "what is 1+1?"
}],
guided_decoding_backend="outlines",
)
with suppress(Exception):
asyncio.run(serving_chat.create_chat_completion(req))
assert mock_engine.generate.call_args.args[1].max_tokens == 10
# Test Case 2: Request's max_tokens set higher than server accepts
req.max_tokens = 15
with suppress(Exception):
asyncio.run(serving_chat.create_chat_completion(req))
assert mock_engine.generate.call_args.args[1].max_tokens == 10
# Test Case 3: Request's max_tokens set lower than server accepts
req.max_tokens = 5
with suppress(Exception):
asyncio.run(serving_chat.create_chat_completion(req))
assert mock_engine.generate.call_args.args[1].max_tokens == 5
# Setting server's max_tokens in the generation_config.json
# higher than context_window - prompt_tokens
mock_model_config = MockModelConfig()
mock_model_config.diff_sampling_param = {
"max_tokens": 200 # Setting server-side max_tokens limit
}
# Reinitialize the engine with new settings
mock_engine = MagicMock(spec=MQLLMEngineClient)
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
# Initialize the serving chat
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=mock_model_config)
serving_chat = OpenAIServingChat(mock_engine,
mock_model_config,
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None)
# Test case 1: No max_tokens specified, defaults to context_window
req = ChatCompletionRequest(
model=MODEL_NAME,
messages=[{
"role": "user",
"content": "what is 1+1?"
}],
guided_decoding_backend="outlines",
)
with suppress(Exception):
asyncio.run(serving_chat.create_chat_completion(req))
assert mock_engine.generate.call_args.args[1].max_tokens == 93
# Test Case 2: Request's max_tokens set higher than server accepts
req.max_tokens = 100
with suppress(Exception):
asyncio.run(serving_chat.create_chat_completion(req))
assert mock_engine.generate.call_args.args[1].max_tokens == 93
# Test Case 3: Request's max_tokens set lower than server accepts
req.max_tokens = 5
with suppress(Exception):
asyncio.run(serving_chat.create_chat_completion(req))
assert mock_engine.generate.call_args.args[1].max_tokens == 5
def test_serving_chat_could_load_correct_generation_config(): def test_serving_chat_could_load_correct_generation_config():
...@@ -115,14 +228,15 @@ def test_serving_chat_could_load_correct_generation_config(): ...@@ -115,14 +228,15 @@ def test_serving_chat_could_load_correct_generation_config():
mock_engine.errored = False mock_engine.errored = False
# Initialize the serving chat # Initialize the serving chat
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=mock_model_config)
serving_chat = OpenAIServingChat(mock_engine, serving_chat = OpenAIServingChat(mock_engine,
mock_model_config, mock_model_config,
BASE_MODEL_PATHS, models,
response_role="assistant", response_role="assistant",
chat_template=CHAT_TEMPLATE, chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto", chat_template_content_format="auto",
lora_modules=None,
prompt_adapters=None,
request_logger=None) request_logger=None)
req = ChatCompletionRequest( req = ChatCompletionRequest(
model=MODEL_NAME, model=MODEL_NAME,
......
...@@ -9,8 +9,8 @@ from vllm.engine.protocol import EngineClient ...@@ -9,8 +9,8 @@ from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.protocol import (ErrorResponse, from vllm.entrypoints.openai.protocol import (ErrorResponse,
LoadLoraAdapterRequest, LoadLoraAdapterRequest,
UnloadLoraAdapterRequest) UnloadLoraAdapterRequest)
from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing from vllm.entrypoints.openai.serving_models import (BaseModelPath,
OpenAIServingModels)
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from ...utils import models_path_prefix from ...utils import models_path_prefix
...@@ -22,47 +22,48 @@ LORA_UNLOADING_SUCCESS_MESSAGE = ( ...@@ -22,47 +22,48 @@ LORA_UNLOADING_SUCCESS_MESSAGE = (
"Success: LoRA adapter '{lora_name}' removed successfully.") "Success: LoRA adapter '{lora_name}' removed successfully.")
async def _async_serving_engine_init(): async def _async_serving_models_init() -> OpenAIServingModels:
mock_engine_client = MagicMock(spec=EngineClient)
mock_model_config = MagicMock(spec=ModelConfig) mock_model_config = MagicMock(spec=ModelConfig)
mock_engine_client = MagicMock(spec=EngineClient)
# Set the max_model_len attribute to avoid missing attribute # Set the max_model_len attribute to avoid missing attribute
mock_model_config.max_model_len = 2048 mock_model_config.max_model_len = 2048
serving_engine = OpenAIServing(mock_engine_client, serving_models = OpenAIServingModels(engine_client=mock_engine_client,
mock_model_config, base_model_paths=BASE_MODEL_PATHS,
BASE_MODEL_PATHS, model_config=mock_model_config,
lora_modules=None, lora_modules=None,
prompt_adapters=None, prompt_adapters=None)
request_logger=None) await serving_models.init_static_loras()
return serving_engine
return serving_models
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_serving_model_name(): async def test_serving_model_name():
serving_engine = await _async_serving_engine_init() serving_models = await _async_serving_models_init()
assert serving_engine._get_model_name(None) == MODEL_NAME assert serving_models.model_name(None) == MODEL_NAME
request = LoRARequest(lora_name="adapter", request = LoRARequest(lora_name="adapter",
lora_path="/path/to/adapter2", lora_path="/path/to/adapter2",
lora_int_id=1) lora_int_id=1)
assert serving_engine._get_model_name(request) == request.lora_name assert serving_models.model_name(request) == request.lora_name
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_load_lora_adapter_success(): async def test_load_lora_adapter_success():
serving_engine = await _async_serving_engine_init() serving_models = await _async_serving_models_init()
request = LoadLoraAdapterRequest(lora_name="adapter", request = LoadLoraAdapterRequest(lora_name="adapter",
lora_path="/path/to/adapter2") lora_path="/path/to/adapter2")
response = await serving_engine.load_lora_adapter(request) response = await serving_models.load_lora_adapter(request)
assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter') assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter')
assert len(serving_engine.lora_requests) == 1 assert len(serving_models.lora_requests) == 1
assert serving_engine.lora_requests[0].lora_name == "adapter" assert serving_models.lora_requests[0].lora_name == "adapter"
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_load_lora_adapter_missing_fields(): async def test_load_lora_adapter_missing_fields():
serving_engine = await _async_serving_engine_init() serving_models = await _async_serving_models_init()
request = LoadLoraAdapterRequest(lora_name="", lora_path="") request = LoadLoraAdapterRequest(lora_name="", lora_path="")
response = await serving_engine.load_lora_adapter(request) response = await serving_models.load_lora_adapter(request)
assert isinstance(response, ErrorResponse) assert isinstance(response, ErrorResponse)
assert response.type == "InvalidUserInput" assert response.type == "InvalidUserInput"
assert response.code == HTTPStatus.BAD_REQUEST assert response.code == HTTPStatus.BAD_REQUEST
...@@ -70,43 +71,43 @@ async def test_load_lora_adapter_missing_fields(): ...@@ -70,43 +71,43 @@ async def test_load_lora_adapter_missing_fields():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_load_lora_adapter_duplicate(): async def test_load_lora_adapter_duplicate():
serving_engine = await _async_serving_engine_init() serving_models = await _async_serving_models_init()
request = LoadLoraAdapterRequest(lora_name="adapter1", request = LoadLoraAdapterRequest(lora_name="adapter1",
lora_path="/path/to/adapter1") lora_path="/path/to/adapter1")
response = await serving_engine.load_lora_adapter(request) response = await serving_models.load_lora_adapter(request)
assert response == LORA_LOADING_SUCCESS_MESSAGE.format( assert response == LORA_LOADING_SUCCESS_MESSAGE.format(
lora_name='adapter1') lora_name='adapter1')
assert len(serving_engine.lora_requests) == 1 assert len(serving_models.lora_requests) == 1
request = LoadLoraAdapterRequest(lora_name="adapter1", request = LoadLoraAdapterRequest(lora_name="adapter1",
lora_path="/path/to/adapter1") lora_path="/path/to/adapter1")
response = await serving_engine.load_lora_adapter(request) response = await serving_models.load_lora_adapter(request)
assert isinstance(response, ErrorResponse) assert isinstance(response, ErrorResponse)
assert response.type == "InvalidUserInput" assert response.type == "InvalidUserInput"
assert response.code == HTTPStatus.BAD_REQUEST assert response.code == HTTPStatus.BAD_REQUEST
assert len(serving_engine.lora_requests) == 1 assert len(serving_models.lora_requests) == 1
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_unload_lora_adapter_success(): async def test_unload_lora_adapter_success():
serving_engine = await _async_serving_engine_init() serving_models = await _async_serving_models_init()
request = LoadLoraAdapterRequest(lora_name="adapter1", request = LoadLoraAdapterRequest(lora_name="adapter1",
lora_path="/path/to/adapter1") lora_path="/path/to/adapter1")
response = await serving_engine.load_lora_adapter(request) response = await serving_models.load_lora_adapter(request)
assert len(serving_engine.lora_requests) == 1 assert len(serving_models.lora_requests) == 1
request = UnloadLoraAdapterRequest(lora_name="adapter1") request = UnloadLoraAdapterRequest(lora_name="adapter1")
response = await serving_engine.unload_lora_adapter(request) response = await serving_models.unload_lora_adapter(request)
assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format( assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format(
lora_name='adapter1') lora_name='adapter1')
assert len(serving_engine.lora_requests) == 0 assert len(serving_models.lora_requests) == 0
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_unload_lora_adapter_missing_fields(): async def test_unload_lora_adapter_missing_fields():
serving_engine = await _async_serving_engine_init() serving_models = await _async_serving_models_init()
request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None) request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None)
response = await serving_engine.unload_lora_adapter(request) response = await serving_models.unload_lora_adapter(request)
assert isinstance(response, ErrorResponse) assert isinstance(response, ErrorResponse)
assert response.type == "InvalidUserInput" assert response.type == "InvalidUserInput"
assert response.code == HTTPStatus.BAD_REQUEST assert response.code == HTTPStatus.BAD_REQUEST
...@@ -114,9 +115,9 @@ async def test_unload_lora_adapter_missing_fields(): ...@@ -114,9 +115,9 @@ async def test_unload_lora_adapter_missing_fields():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_unload_lora_adapter_not_found(): async def test_unload_lora_adapter_not_found():
serving_engine = await _async_serving_engine_init() serving_models = await _async_serving_models_init()
request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter") request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter")
response = await serving_engine.unload_lora_adapter(request) response = await serving_models.unload_lora_adapter(request)
assert isinstance(response, ErrorResponse) assert isinstance(response, ErrorResponse)
assert response.type == "InvalidUserInput" assert response.type == "NotFoundError"
assert response.code == HTTPStatus.BAD_REQUEST assert response.code == HTTPStatus.NOT_FOUND
import json
import os
import openai import openai
import pytest import pytest
...@@ -10,16 +7,7 @@ MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B") ...@@ -10,16 +7,7 @@ MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_shutdown_on_engine_failure(tmp_path): async def test_shutdown_on_engine_failure():
# Use a bad adapter to crash the engine
# (This test will fail when that bug is fixed)
adapter_path = tmp_path / "bad_adapter"
os.mkdir(adapter_path)
with open(adapter_path / "adapter_model_config.json", "w") as f:
json.dump({"not": "real"}, f)
with open(adapter_path / "adapter_model.safetensors", "wb") as f:
f.write(b"this is fake")
# dtype, max-len etc set so that this can run in CI # dtype, max-len etc set so that this can run in CI
args = [ args = [
"--dtype", "--dtype",
...@@ -29,9 +17,6 @@ async def test_shutdown_on_engine_failure(tmp_path): ...@@ -29,9 +17,6 @@ async def test_shutdown_on_engine_failure(tmp_path):
"--enforce-eager", "--enforce-eager",
"--max-num-seqs", "--max-num-seqs",
"128", "128",
"--enable-lora",
"--lora-modules",
f"bad-adapter={tmp_path / 'bad_adapter'}",
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
...@@ -39,9 +24,13 @@ async def test_shutdown_on_engine_failure(tmp_path): ...@@ -39,9 +24,13 @@ async def test_shutdown_on_engine_failure(tmp_path):
with pytest.raises( with pytest.raises(
(openai.APIConnectionError, openai.InternalServerError)): (openai.APIConnectionError, openai.InternalServerError)):
# This crashes the engine # Asking for lots of prompt logprobs will currently crash the
await client.completions.create(model="bad-adapter", # engine. This may change in the future when that bug is fixed
prompt="Hello, my name is") prompt = "Hello " * 4000
await client.completions.create(
model=MODEL_NAME,
prompt=prompt,
extra_body={"prompt_logprobs": 10})
# Now the server should shut down # Now the server should shut down
return_code = remote_server.proc.wait(timeout=8) return_code = remote_server.proc.wait(timeout=8)
......
...@@ -99,5 +99,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, ...@@ -99,5 +99,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
assert len(embeddings.data) == 1 assert len(embeddings.data) == 1
assert len(embeddings.data[0].embedding) == 3072 assert len(embeddings.data[0].embedding) == 3072
assert embeddings.usage.completion_tokens == 0 assert embeddings.usage.completion_tokens == 0
assert embeddings.usage.prompt_tokens == 765 assert embeddings.usage.prompt_tokens == 764
assert embeddings.usage.total_tokens == 765 assert embeddings.usage.total_tokens == 764
...@@ -3,7 +3,6 @@ from typing import Optional ...@@ -3,7 +3,6 @@ from typing import Optional
import pytest import pytest
import os import os
from PIL import Image
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.config import ModelConfig from vllm.config import ModelConfig
...@@ -93,10 +92,7 @@ def _assert_mm_data_is_image_input( ...@@ -93,10 +92,7 @@ def _assert_mm_data_is_image_input(
image_data = mm_data.get("image") image_data = mm_data.get("image")
assert image_data is not None assert image_data is not None
if image_count == 1: assert isinstance(image_data, list) and len(image_data) == image_count
assert isinstance(image_data, Image.Image)
else:
assert isinstance(image_data, list) and len(image_data) == image_count
def test_parse_chat_messages_single_image( def test_parse_chat_messages_single_image(
...@@ -760,10 +756,12 @@ def test_resolve_content_format_hf_defined(model, expected_format): ...@@ -760,10 +756,12 @@ def test_resolve_content_format_hf_defined(model, expected_format):
("template_chatglm.jinja", "string"), ("template_chatglm.jinja", "string"),
("template_chatglm2.jinja", "string"), ("template_chatglm2.jinja", "string"),
("template_chatml.jinja", "string"), ("template_chatml.jinja", "string"),
("template_deepseek_vl2.jinja", "string"),
("template_falcon_180b.jinja", "string"), ("template_falcon_180b.jinja", "string"),
("template_falcon.jinja", "string"), ("template_falcon.jinja", "string"),
("template_inkbot.jinja", "string"), ("template_inkbot.jinja", "string"),
("template_llava.jinja", "string"), ("template_llava.jinja", "string"),
("template_pixtral_hf.jinja", "openai"),
("template_vlm2vec.jinja", "openai"), ("template_vlm2vec.jinja", "openai"),
("tool_chat_template_granite_20b_fc.jinja", "string"), ("tool_chat_template_granite_20b_fc.jinja", "string"),
("tool_chat_template_hermes.jinja", "string"), ("tool_chat_template_hermes.jinja", "string"),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment