Commit 38d80967 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori

parents 33650733 880c741b
...@@ -201,3 +201,32 @@ table: "table_1" | "table_2" ...@@ -201,3 +201,32 @@ table: "table_1" | "table_2"
condition: column "=" number condition: column "=" number
number: "1" | "2" number: "1" | "2"
""") """)
@pytest.fixture(scope="session")
def zephyr_lora_files():
"""Download zephyr LoRA files once per test session."""
from huggingface_hub import snapshot_download
return snapshot_download(repo_id="typeof/zephyr-7b-beta-lora")
@pytest.fixture(scope="session")
def zephyr_lora_added_tokens_files(zephyr_lora_files):
"""Create zephyr LoRA files with added tokens once per test session."""
import shutil
from tempfile import TemporaryDirectory
from transformers import AutoTokenizer
tmp_dir = TemporaryDirectory()
tmp_model_dir = f"{tmp_dir.name}/zephyr"
shutil.copytree(zephyr_lora_files, tmp_model_dir)
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
# Copy tokenizer to adapter and add some unique tokens
# 32000, 32001, 32002
added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
special_tokens=True)
assert added == 3
tokenizer.save_pretrained(tmp_model_dir)
yield tmp_model_dir
tmp_dir.cleanup()
...@@ -7,7 +7,7 @@ import pytest ...@@ -7,7 +7,7 @@ import pytest
from vllm import LLM from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory from vllm.distributed import cleanup_dist_env_and_memory
from ..openai.test_vision import TEST_IMAGE_URLS from ..openai.test_vision import TEST_IMAGE_ASSETS
@pytest.fixture(scope="function") @pytest.fixture(scope="function")
...@@ -95,7 +95,8 @@ def vision_llm(): ...@@ -95,7 +95,8 @@ def vision_llm():
@pytest.mark.parametrize("image_urls", @pytest.mark.parametrize("image_urls",
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) [[TEST_IMAGE_ASSETS[0], TEST_IMAGE_ASSETS[1]]],
indirect=True)
def test_chat_multi_image(vision_llm, image_urls: list[str]): def test_chat_multi_image(vision_llm, image_urls: list[str]):
messages = [{ messages = [{
"role": "role":
......
...@@ -79,7 +79,7 @@ def test_offline_mode(monkeypatch: pytest.MonkeyPatch): ...@@ -79,7 +79,7 @@ def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
) )
# Need to re-import huggingface_hub # Need to re-import huggingface_hub
# and friends to setup offline mode # and friends to set up offline mode
_re_import_modules() _re_import_modules()
# Cached model files should be used in offline mode # Cached model files should be used in offline mode
for model_config in MODEL_CONFIGS: for model_config in MODEL_CONFIGS:
...@@ -136,7 +136,7 @@ def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch): ...@@ -136,7 +136,7 @@ def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
disable_connect, disable_connect,
) )
# Need to re-import huggingface_hub # Need to re-import huggingface_hub
# and friends to setup offline mode # and friends to set up offline mode
_re_import_modules() _re_import_modules()
engine_args = EngineArgs(model="facebook/opt-125m") engine_args = EngineArgs(model="facebook/opt-125m")
LLM(**dataclasses.asdict(engine_args)) LLM(**dataclasses.asdict(engine_args))
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.assets.audio import AudioAsset
@pytest.fixture
def mary_had_lamb():
path = AudioAsset('mary_had_lamb').get_local_path()
with open(str(path), "rb") as f:
yield f
@pytest.fixture
def winning_call():
path = AudioAsset('winning_call').get_local_path()
with open(str(path), "rb") as f:
yield f
@pytest.fixture
def foscolo():
# Test translation it->en
path = AudioAsset('azacinto_foscolo').get_local_path()
with open(str(path), "rb") as f:
yield f
...@@ -32,7 +32,7 @@ def to_bytes(y, sr): ...@@ -32,7 +32,7 @@ def to_bytes(y, sr):
async def transcribe_audio(client, tokenizer, y, sr): async def transcribe_audio(client, tokenizer, y, sr):
# Send loaded audio directly instead of loading from disk, # Send loaded audio directly instead of loading from disk,
# dont account for that time though # don't account for that time though
with to_bytes(y, sr) as f: with to_bytes(y, sr) as f:
start_time = time.perf_counter() start_time = time.perf_counter()
transcription = await client.audio.transcriptions.create( transcription = await client.audio.transcriptions.create(
......
...@@ -12,11 +12,9 @@ import pytest_asyncio ...@@ -12,11 +12,9 @@ import pytest_asyncio
import regex as re import regex as re
import requests import requests
import torch import torch
from openai import BadRequestError, OpenAI from openai import BadRequestError
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
from .test_completion import zephyr_lora_added_tokens_files # noqa: F401
from .test_completion import zephyr_lora_files # noqa: F401
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
...@@ -970,59 +968,6 @@ async def test_long_seed(client: openai.AsyncOpenAI): ...@@ -970,59 +968,6 @@ async def test_long_seed(client: openai.AsyncOpenAI):
or "less_than_equal" in exc_info.value.message) or "less_than_equal" in exc_info.value.message)
@pytest.mark.asyncio
async def test_http_chat_no_model_name_with_curl(server: RemoteOpenAIServer):
url = f"http://localhost:{server.port}/v1/chat/completions"
headers = {
"Content-Type": "application/json",
}
data = {
# model_name is avoided here.
"messages": [{
"role": "system",
"content": "You are a helpful assistant."
}, {
"role": "user",
"content": "what is 1+1?"
}],
"max_tokens":
5
}
response = requests.post(url, headers=headers, json=data)
response_data = response.json()
print(response_data)
assert response_data.get("model") == MODEL_NAME
choice = response_data.get("choices")[0]
message = choice.get("message")
assert message is not None
content = message.get("content")
assert content is not None
assert len(content) > 0
@pytest.mark.asyncio
async def test_http_chat_no_model_name_with_openai(server: RemoteOpenAIServer):
openai_api_key = "EMPTY"
openai_api_base = f"http://localhost:{server.port}/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
messages = [
{
"role": "user",
"content": "Hello, vLLM!"
},
]
response = client.chat.completions.create(
model="", # empty string
messages=messages,
)
assert response.model == MODEL_NAME
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_invocations(server: RemoteOpenAIServer, async def test_invocations(server: RemoteOpenAIServer,
client: openai.AsyncOpenAI): client: openai.AsyncOpenAI):
......
...@@ -104,7 +104,9 @@ def test_get_gen_prompt(model, template, add_generation_prompt, ...@@ -104,7 +104,9 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
trust_remote_code=model_info.trust_remote_code, trust_remote_code=model_info.trust_remote_code,
revision=model_info.revision, revision=model_info.revision,
hf_overrides=model_info.hf_overrides, hf_overrides=model_info.hf_overrides,
) skip_tokenizer_init=model_info.skip_tokenizer_init,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype)
# Initialize the tokenizer # Initialize the tokenizer
tokenizer = get_tokenizer( tokenizer = get_tokenizer(
......
...@@ -3,8 +3,6 @@ ...@@ -3,8 +3,6 @@
# imports for guided decoding tests # imports for guided decoding tests
import json import json
import os import os
import shutil
from tempfile import TemporaryDirectory
from typing import Optional from typing import Optional
import jsonschema import jsonschema
...@@ -14,9 +12,7 @@ import pytest_asyncio ...@@ -14,9 +12,7 @@ import pytest_asyncio
import regex as re import regex as re
import requests import requests
# downloading lora to test lora requests # downloading lora to test lora requests
from huggingface_hub import snapshot_download
from openai import BadRequestError from openai import BadRequestError
from transformers import AutoTokenizer
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
...@@ -26,32 +22,10 @@ from ...utils import RemoteOpenAIServer ...@@ -26,32 +22,10 @@ from ...utils import RemoteOpenAIServer
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# technically these adapters use a different base model, # technically these adapters use a different base model,
# but we're not testing generation quality here # but we're not testing generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora"
GUIDED_DECODING_BACKENDS = ["outlines", "xgrammar", "guidance"] GUIDED_DECODING_BACKENDS = ["outlines", "xgrammar", "guidance"]
@pytest.fixture(scope="module")
def zephyr_lora_files():
return snapshot_download(repo_id=LORA_NAME)
@pytest.fixture(scope="module")
def zephyr_lora_added_tokens_files(zephyr_lora_files):
tmp_dir = TemporaryDirectory()
tmp_model_dir = f"{tmp_dir.name}/zephyr"
shutil.copytree(zephyr_lora_files, tmp_model_dir)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Copy tokenizer to adapter and add some unique tokens
# 32000, 32001, 32002
added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
special_tokens=True)
assert added == 3
tokenizer.save_pretrained(tmp_model_dir)
yield tmp_model_dir
tmp_dir.cleanup()
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files): def default_server_args(zephyr_lora_files, zephyr_lora_added_tokens_files):
return [ return [
......
...@@ -3,48 +3,23 @@ ...@@ -3,48 +3,23 @@
import base64 import base64
import io import io
import shutil
from tempfile import TemporaryDirectory
import openai # use the official client for correctness check import openai # use the official client for correctness check
import pytest import pytest
import pytest_asyncio import pytest_asyncio
import torch import torch
# downloading lora to test lora requests # downloading lora to test lora requests
from huggingface_hub import snapshot_download
from openai import BadRequestError from openai import BadRequestError
from transformers import AutoConfig, AutoTokenizer from transformers import AutoConfig
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
LORA_NAME = "typeof/zephyr-7b-beta-lora"
CONFIG = AutoConfig.from_pretrained(MODEL_NAME) CONFIG = AutoConfig.from_pretrained(MODEL_NAME)
@pytest.fixture(scope="module")
def zephyr_lora_files():
return snapshot_download(repo_id=LORA_NAME)
@pytest.fixture(scope="module")
def zephyr_lora_added_tokens_files(zephyr_lora_files):
tmp_dir = TemporaryDirectory()
tmp_model_dir = f"{tmp_dir.name}/zephyr"
shutil.copytree(zephyr_lora_files, tmp_model_dir)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Copy tokenizer to adapter and add some unique tokens
# 32000, 32001, 32002
added = tokenizer.add_tokens(["vllm1", "vllm2", "vllm3"],
special_tokens=True)
assert added == 3
tokenizer.save_pretrained(tmp_model_dir)
yield tmp_model_dir
tmp_dir.cleanup()
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def default_server_args( def default_server_args(
zephyr_lora_files, zephyr_lora_files,
......
...@@ -30,6 +30,7 @@ async def client(server): ...@@ -30,6 +30,7 @@ async def client(server):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.skip(reason="bart is not yet supported in V1")
async def test_single_completion(client: openai.AsyncOpenAI, model_name: str): async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
completion = await client.completions.create(model=model_name, completion = await client.completions.create(model=model_name,
prompt="Hello, my name is", prompt="Hello, my name is",
......
...@@ -9,8 +9,6 @@ from contextlib import suppress ...@@ -9,8 +9,6 @@ from contextlib import suppress
import openai # use the official client for correctness check import openai # use the official client for correctness check
import pytest import pytest
import pytest_asyncio import pytest_asyncio
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
...@@ -18,7 +16,6 @@ from ...utils import RemoteOpenAIServer ...@@ -18,7 +16,6 @@ from ...utils import RemoteOpenAIServer
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing # technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here # generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora"
BADREQUEST_CASES = [ BADREQUEST_CASES = [
( (
...@@ -48,11 +45,6 @@ BADREQUEST_CASES = [ ...@@ -48,11 +45,6 @@ BADREQUEST_CASES = [
] ]
@pytest.fixture(scope="module")
def zephyr_lora_files():
return snapshot_download(repo_id=LORA_NAME)
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
def monkeypatch_module(): def monkeypatch_module():
from _pytest.monkeypatch import MonkeyPatch from _pytest.monkeypatch import MonkeyPatch
......
...@@ -250,12 +250,15 @@ EXPECTED_METRICS_V1 = [ ...@@ -250,12 +250,15 @@ EXPECTED_METRICS_V1 = [
"vllm:request_params_max_tokens_sum", "vllm:request_params_max_tokens_sum",
"vllm:request_params_max_tokens_bucket", "vllm:request_params_max_tokens_bucket",
"vllm:request_params_max_tokens_count", "vllm:request_params_max_tokens_count",
"vllm:time_to_first_token_seconds_sum",
"vllm:time_to_first_token_seconds_bucket",
"vllm:time_to_first_token_seconds_count",
"vllm:time_per_output_token_seconds_sum", "vllm:time_per_output_token_seconds_sum",
"vllm:time_per_output_token_seconds_bucket", "vllm:time_per_output_token_seconds_bucket",
"vllm:time_per_output_token_seconds_count", "vllm:time_per_output_token_seconds_count",
"vllm:time_to_first_token_seconds_sum",
"vllm:time_to_first_token_seconds_bucket",
"vllm:time_to_first_token_seconds_count",
"vllm:inter_token_latency_seconds_sum",
"vllm:inter_token_latency_seconds_bucket",
"vllm:inter_token_latency_seconds_count",
"vllm:e2e_request_latency_seconds_sum", "vllm:e2e_request_latency_seconds_sum",
"vllm:e2e_request_latency_seconds_bucket", "vllm:e2e_request_latency_seconds_bucket",
"vllm:e2e_request_latency_seconds_count", "vllm:e2e_request_latency_seconds_count",
...@@ -273,7 +276,11 @@ EXPECTED_METRICS_V1 = [ ...@@ -273,7 +276,11 @@ EXPECTED_METRICS_V1 = [
"vllm:request_decode_time_seconds_count", "vllm:request_decode_time_seconds_count",
] ]
HIDDEN_DEPRECATED_METRICS: list[str] = [] HIDDEN_DEPRECATED_METRICS: list[str] = [
"vllm:time_per_output_token_seconds_sum",
"vllm:time_per_output_token_seconds_bucket",
"vllm:time_per_output_token_seconds_count",
]
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -289,9 +296,10 @@ async def test_metrics_exist(server: RemoteOpenAIServer, ...@@ -289,9 +296,10 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
assert response.status_code == HTTPStatus.OK assert response.status_code == HTTPStatus.OK
for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS): for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
if (not server.show_hidden_metrics if (metric in HIDDEN_DEPRECATED_METRICS
and metric not in HIDDEN_DEPRECATED_METRICS): and not server.show_hidden_metrics):
assert metric in response.text continue
assert metric in response.text
@pytest.mark.asyncio @pytest.mark.asyncio
......
...@@ -4,8 +4,6 @@ ...@@ -4,8 +4,6 @@
import openai # use the official client for correctness check import openai # use the official client for correctness check
import pytest import pytest
import pytest_asyncio import pytest_asyncio
# downloading lora to test lora requests
from huggingface_hub import snapshot_download
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
...@@ -13,12 +11,6 @@ from ...utils import RemoteOpenAIServer ...@@ -13,12 +11,6 @@ from ...utils import RemoteOpenAIServer
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# technically this needs Mistral-7B-v0.1 as base, but we're not testing # technically this needs Mistral-7B-v0.1 as base, but we're not testing
# generation quality here # generation quality here
LORA_NAME = "typeof/zephyr-7b-beta-lora"
@pytest.fixture(scope="module")
def zephyr_lora_files():
return snapshot_download(repo_id=LORA_NAME)
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
......
...@@ -10,7 +10,7 @@ import pytest ...@@ -10,7 +10,7 @@ import pytest
import regex as re import regex as re
import torch import torch
from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.renderer import BaseRenderer
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
...@@ -27,12 +27,16 @@ async def test_empty_prompt(): ...@@ -27,12 +27,16 @@ async def test_empty_prompt():
with RemoteOpenAIServer(model_name, server_args) as remote_server: with RemoteOpenAIServer(model_name, server_args) as remote_server:
client = remote_server.get_async_client() client = remote_server.get_async_client()
with pytest.raises(openai.BadRequestError, with pytest.raises(
match="decoder prompt cannot be empty"): openai.BadRequestError,
match=
"Either prompt or prompt_embeds must be provided and non-empty."
):
await client.completions.create(model=model_name, await client.completions.create(model=model_name,
prompt="", prompt="",
max_tokens=5, max_tokens=5,
temperature=0.0) temperature=0.0,
extra_body={"prompt_embeds": []})
@pytest.mark.asyncio @pytest.mark.asyncio
...@@ -83,7 +87,7 @@ def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout, ...@@ -83,7 +87,7 @@ def test_load_prompt_embeds(dtype: torch.dtype, layout: torch.layout,
buffer.seek(0) buffer.seek(0)
encoded_tensor = pybase64.b64encode(buffer.getvalue()) encoded_tensor = pybase64.b64encode(buffer.getvalue())
loaded_prompt_embeds = OpenAIServing._load_prompt_embeds(encoded_tensor) loaded_prompt_embeds = BaseRenderer.load_prompt_embeds(encoded_tensor)
assert len(loaded_prompt_embeds) == 1 assert len(loaded_prompt_embeds) == 1
loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"] loaded_tensor = loaded_prompt_embeds[0]["prompt_embeds"]
assert loaded_tensor.device.type == "cpu" assert loaded_tensor.device.type == "cpu"
......
...@@ -275,7 +275,8 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str): ...@@ -275,7 +275,8 @@ async def test_stateful_multi_turn(client: OpenAI, model_name: str):
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
async def test_streaming(client: OpenAI, model_name: str): @pytest.mark.parametrize("background", [True, False])
async def test_streaming(client: OpenAI, model_name: str, background: bool):
# TODO: Add back when web search and code interpreter are available in CI # TODO: Add back when web search and code interpreter are available in CI
prompts = [ prompts = [
"tell me a story about a cat in 20 words", "tell me a story about a cat in 20 words",
...@@ -300,11 +301,16 @@ async def test_streaming(client: OpenAI, model_name: str): ...@@ -300,11 +301,16 @@ async def test_streaming(client: OpenAI, model_name: str):
# }, # },
], ],
stream=True, stream=True,
background=background,
) )
events = [] events = []
current_event_mode = None current_event_mode = None
resp_id = None
async for event in response: async for event in response:
if event.type == "response.created":
resp_id = event.response.id
if current_event_mode != event.type: if current_event_mode != event.type:
current_event_mode = event.type current_event_mode = event.type
print(f"\n[{event.type}] ", end="", flush=True) print(f"\n[{event.type}] ", end="", flush=True)
...@@ -322,6 +328,17 @@ async def test_streaming(client: OpenAI, model_name: str): ...@@ -322,6 +328,17 @@ async def test_streaming(client: OpenAI, model_name: str):
assert len(events) > 0 assert len(events) > 0
if background:
starting_after = 5
async with await client.responses.retrieve(
response_id=resp_id,
stream=True,
starting_after=starting_after) as stream:
counter = starting_after
async for event in stream:
counter += 1
assert event == events[counter]
@pytest.mark.asyncio @pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME]) @pytest.mark.parametrize("model_name", [MODEL_NAME])
......
...@@ -224,7 +224,7 @@ async def test_comparison_with_prompt_logprobs_and_logprobs(server): ...@@ -224,7 +224,7 @@ async def test_comparison_with_prompt_logprobs_and_logprobs(server):
logprobs_token_ids.append(token_id) logprobs_token_ids.append(token_id)
# When echo=True, the logprobs include both prompt and response tokens # When echo=True, the logprobs include both prompt and response tokens
# The token_ids field should match the the suffix of response portion # The token_ids field should match the suffix of response portion
# The prompt_token_ids should match the prompt portion # The prompt_token_ids should match the prompt portion
assert len(completion.choices[0].token_ids) < len(logprobs_token_ids) assert len(completion.choices[0].token_ids) < len(logprobs_token_ids)
response_token_ids_length = len(completion.choices[0].token_ids) response_token_ids_length = len(completion.choices[0].token_ids)
......
...@@ -11,8 +11,6 @@ from vllm.transformers_utils.tokenizer import get_tokenizer ...@@ -11,8 +11,6 @@ from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
from .test_completion import default_server_args # noqa: F401 from .test_completion import default_server_args # noqa: F401
from .test_completion import zephyr_lora_added_tokens_files # noqa: F401
from .test_completion import zephyr_lora_files # noqa: F401
from .test_completion import MODEL_NAME from .test_completion import MODEL_NAME
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import asyncio import asyncio
from contextlib import suppress from contextlib import suppress
from dataclasses import dataclass, field from dataclasses import dataclass, field
from typing import Any, Optional from typing import TYPE_CHECKING, Any, Optional
from unittest.mock import MagicMock from unittest.mock import MagicMock
import pytest import pytest
import pytest_asyncio
from vllm.config import MultiModalConfig from vllm.config import MultiModalConfig
from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.engine.multiprocessing.client import MQLLMEngineClient
...@@ -17,9 +20,205 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath, ...@@ -17,9 +20,205 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath,
OpenAIServingModels) OpenAIServingModels)
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer
if TYPE_CHECKING:
from openai import OpenAI
GPT_OSS_MODEL_NAME = "openai/gpt-oss-20b"
@pytest.fixture(scope="module")
def monkeypatch_module():
from _pytest.monkeypatch import MonkeyPatch
mpatch = MonkeyPatch()
yield mpatch
mpatch.undo()
@pytest.fixture(scope="module",
params=[True, False],
ids=["with_tool_parser", "without_tool_parser"])
def with_tool_parser(request) -> bool:
return request.param
@pytest.fixture(scope="module")
def default_server_args(with_tool_parser: bool):
args = [
# use half precision for speed and memory savings in CI environment
"--enforce-eager",
"--max-model-len",
"4096",
"--reasoning-parser",
"openai_gptoss",
"--gpu-memory-utilization",
"0.8",
]
if with_tool_parser:
args.extend([
"--tool-call-parser",
"openai",
"--enable-auto-tool-choice",
])
return args
@pytest.fixture(scope="module")
def gptoss_server(monkeypatch_module: pytest.MonkeyPatch,
default_server_args: list[str]):
with monkeypatch_module.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
with RemoteOpenAIServer(GPT_OSS_MODEL_NAME,
default_server_args) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def gptoss_client(gptoss_server):
async with gptoss_server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
async def test_gpt_oss_chat_tool_call_streaming(gptoss_client: OpenAI,
with_tool_parser: bool):
tools = [{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string"
},
"state": {
"type": "string"
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["city", "state", "unit"],
},
},
}]
messages = [
{
"role": "user",
"content": "What is the weather in Dallas, TX?"
},
]
stream = await gptoss_client.chat.completions.create(
model=GPT_OSS_MODEL_NAME,
messages=messages,
tools=tools if with_tool_parser else None,
stream=True)
name = None
args_buf = ""
content_buf = ""
async for chunk in stream:
delta = chunk.choices[0].delta
if delta.tool_calls:
tc = delta.tool_calls[0]
if tc.function and tc.function.name:
name = tc.function.name
if tc.function and tc.function.arguments:
args_buf += tc.function.arguments
if getattr(delta, "content", None):
content_buf += delta.content
if with_tool_parser:
assert name is not None
assert len(args_buf) > 0
else:
assert name is None
assert len(args_buf) == 0
assert len(content_buf) > 0
@pytest.mark.asyncio
async def test_gpt_oss_multi_turn_chat(gptoss_client: OpenAI,
with_tool_parser: bool):
if not with_tool_parser:
pytest.skip("skip non-tool for multi-turn tests")
tools = [{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string"
},
"state": {
"type": "string"
},
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["city", "state", "unit"],
},
},
}]
messages = [
{
"role": "system",
"content": "you are a helpful assistant"
},
{
"role": "user",
"content": "What is the weather in Dallas, TX with celsius?"
},
]
first = await gptoss_client.chat.completions.create(
model=GPT_OSS_MODEL_NAME,
messages=messages,
tools=tools,
temperature=0.0,
)
first_msg = first.choices[0].message
assert first_msg.tool_calls is not None and len(first_msg.tool_calls) > 0
tc = first_msg.tool_calls[0]
assert tc.function is not None and tc.function.name == "get_current_weather"
args1 = tc.function.arguments
assert args1 is not None and len(args1) > 0
messages.append({"role": "assistant", "content": args1})
messages.append({
"role": "user",
"content": "Now convert to celsius and return JSON only"
})
second = await gptoss_client.chat.completions.create(
model=GPT_OSS_MODEL_NAME,
messages=messages,
tools=tools,
temperature=0.0,
)
second_msg = second.choices[0].message
assert (second_msg.content is not None and len(second_msg.content) > 0) or \
(second_msg.tool_calls is not None and len(second_msg.tool_calls) > 0)
MODEL_NAME = "openai-community/gpt2" MODEL_NAME = "openai-community/gpt2"
MODEL_NAME_SHORT = "gpt2"
CHAT_TEMPLATE = "Dummy chat template for testing {}" CHAT_TEMPLATE = "Dummy chat template for testing {}"
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)] BASE_MODEL_PATHS = [
BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME),
BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT)
]
@dataclass @dataclass
...@@ -75,6 +274,42 @@ def test_async_serving_chat_init(): ...@@ -75,6 +274,42 @@ def test_async_serving_chat_init():
assert serving_completion.chat_template == CHAT_TEMPLATE assert serving_completion.chat_template == CHAT_TEMPLATE
@pytest.mark.asyncio
async def test_serving_chat_returns_correct_model_name():
mock_engine = MagicMock(spec=MQLLMEngineClient)
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
models = OpenAIServingModels(engine_client=mock_engine,
base_model_paths=BASE_MODEL_PATHS,
model_config=MockModelConfig())
serving_chat = OpenAIServingChat(mock_engine,
MockModelConfig(),
models,
response_role="assistant",
chat_template=CHAT_TEMPLATE,
chat_template_content_format="auto",
request_logger=None)
messages = [{"role": "user", "content": "what is 1+1?"}]
async def return_model_name(*args):
return args[3]
serving_chat.chat_completion_full_generator = return_model_name
# Test that full name is returned when short name is requested
req = ChatCompletionRequest(model=MODEL_NAME_SHORT, messages=messages)
assert await serving_chat.create_chat_completion(req) == MODEL_NAME
# Test that full name is returned when empty string is specified
req = ChatCompletionRequest(model="", messages=messages)
assert await serving_chat.create_chat_completion(req) == MODEL_NAME
# Test that full name is returned when no model is specified
req = ChatCompletionRequest(messages=messages)
assert await serving_chat.create_chat_completion(req) == MODEL_NAME
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_serving_chat_should_set_correct_max_tokens(): async def test_serving_chat_should_set_correct_max_tokens():
mock_engine = MagicMock(spec=MQLLMEngineClient) mock_engine = MagicMock(spec=MQLLMEngineClient)
...@@ -313,7 +548,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type): ...@@ -313,7 +548,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
}], }],
) )
# By default cache_salt in the engine prompt is not set # By default, cache_salt in the engine prompt is not set
with suppress(Exception): with suppress(Exception):
await serving_chat.create_chat_completion(req) await serving_chat.create_chat_completion(req)
assert "cache_salt" not in mock_engine.generate.call_args.args[0] assert "cache_salt" not in mock_engine.generate.call_args.args[0]
......
...@@ -11,7 +11,7 @@ import torch ...@@ -11,7 +11,7 @@ import torch
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
MODEL_NAME = "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM" MODEL_NAME = "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"
DTYPE = "float16" DTYPE = "float16"
...@@ -35,7 +35,9 @@ def server(): ...@@ -35,7 +35,9 @@ def server():
"--trust-remote-code", "--trust-remote-code",
"--skip-tokenizer-init", "--skip-tokenizer-init",
"--max-num-seqs", "--max-num-seqs",
"32" "32",
"--model-impl",
"terratorch"
] ]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
......
...@@ -8,8 +8,6 @@ import requests ...@@ -8,8 +8,6 @@ import requests
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
from ...utils import RemoteOpenAIServer from ...utils import RemoteOpenAIServer
from .test_completion import zephyr_lora_added_tokens_files # noqa: F401
from .test_completion import zephyr_lora_files # noqa: F401
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment