Commit 0da93439 authored by zhuwenwen
Browse files

Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

parents 25f2f756 298e5108
......@@ -7,11 +7,10 @@ import openai
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.assets.audio import AudioAsset
from vllm.multimodal.utils import encode_audio_base64, encode_audio_url, fetch_audio
from ...utils import RemoteOpenAIServer
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
TEST_AUDIO_URLS = [
AudioAsset("winning_call").url,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import json
import openai
import pybase64 as base64
import pytest
import pytest_asyncio
from ...conftest import VideoTestAssets
from ...utils import RemoteOpenAIServer
from tests.conftest import VideoTestAssets
from tests.utils import ROCM_EXTRA_ARGS, RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen2.5-Omni-3B"
......@@ -22,6 +22,7 @@ def server():
"--enforce-eager",
"--limit-mm-per-prompt",
json.dumps({"audio": 3, "video": 3}),
*ROCM_EXTRA_ARGS,
]
with RemoteOpenAIServer(
......
......@@ -111,7 +111,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
[{"prompt_token_ids": [1, 2, 3]}],
)
serving_chat.openai_serving_render._preprocess_chat = AsyncMock(
serving_chat.openai_serving_render.preprocess_chat = AsyncMock(
side_effect=_fake_preprocess_chat
)
return serving_chat
......
......@@ -231,13 +231,14 @@ def k2_server():
"--gpu-memory-utilization",
"0.4",
] + ROCM_EXTRA_ARGS
# hack to test kimi_k2 tool use tool_id format.
# avoid error in is_deepseek_mla check by setting kv_lora_rank=null
# Test kimi_k2 tool use tool_id format by overriding model_type.
# is_deepseek_mla safely returns False via getattr when kv_lora_rank
# is absent from the underlying config.
with RemoteOpenAIServer(
MODEL_NAME,
args,
env_dict=ROCM_ENV_OVERRIDES,
override_hf_configs={"model_type": "kimi_k2", "kv_lora_rank": None},
override_hf_configs={"model_type": "kimi_k2"},
) as remote_server:
yield remote_server
......
......@@ -8,8 +8,8 @@ import pytest
import pytest_asyncio
from huggingface_hub import snapshot_download
from ...conftest import AudioTestAssets
from ...utils import RemoteOpenAIServer
from tests.conftest import AudioTestAssets
from tests.utils import RemoteOpenAIServer
# NOTE - the tests in this module are currently analogous to test_chat, but are
# separated to avoid OOM killing due to module-scoped servers, since we
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from ...utils import VLLM_PATH, RemoteOpenAIServer
from tests.utils import VLLM_PATH, RemoteOpenAIServer
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()
......
......@@ -8,7 +8,7 @@ from typing import Any, NamedTuple
import openai # use the official client for correctness check
import pytest
from ...utils import RemoteOpenAIServer
from tests.utils import RemoteOpenAIServer
# # any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
......
......@@ -484,7 +484,7 @@ class TestGPTOSSSpeculativeChat:
)
content = ""
reasoning_content = ""
reasoning = ""
async for chunk in stream:
delta = chunk.choices[0].delta
if delta.content:
......@@ -492,9 +492,9 @@ class TestGPTOSSSpeculativeChat:
chunk_reasoning = getattr(delta, "reasoning", None)
if chunk_reasoning:
reasoning_content += delta.reasoning
reasoning += delta.reasoning
assert len(reasoning_content) > 0, "No reasoning was generated."
assert len(reasoning) > 0, "No reasoning was generated."
assert content.strip() == "4"
......
......@@ -7,11 +7,10 @@ import openai
import pytest
import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.multimodal.utils import encode_video_url, fetch_video
from vllm.platforms import current_platform
from ...utils import RemoteOpenAIServer
MODEL_NAME = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
MAXIMUM_VIDEOS = 3
......
......@@ -8,12 +8,11 @@ import pytest
import pytest_asyncio
from transformers import AutoProcessor
from tests.utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
from vllm.multimodal.media import MediaWithBytes
from vllm.multimodal.utils import encode_image_url, fetch_image
from vllm.platforms import current_platform
from ...utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
MAXIMUM_IMAGES = 2
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import numpy as np
import pybase64 as base64
import pytest
import requests
import torch
from tests.utils import RemoteOpenAIServer
from vllm.utils.serial_utils import tensor2base64
from ...utils import RemoteOpenAIServer
@pytest.mark.parametrize(
"model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
......
......@@ -26,19 +26,12 @@ def default_server_args():
"128",
"--enforce-eager",
"--enable-prompt-tokens-details",
"--no-enable-prefix-caching",
]
@pytest.fixture(
scope="module",
params=[
["--no-enable-prefix-caching"],
["--no-enable-prefix-caching", "--disable-frontend-multiprocessing"],
],
)
def server(default_server_args, request):
if request.param:
default_server_args = default_server_args + request.param
@pytest.fixture(scope="module")
def server(default_server_args):
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield remote_server
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import io
import json
import openai # use the official client for correctness check
import pybase64 as base64
import pytest
import pytest_asyncio
import torch
......@@ -14,7 +14,7 @@ import torch
from openai import BadRequestError
from transformers import AutoConfig
from ...utils import RemoteOpenAIServer
from tests.utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"
......@@ -83,11 +83,8 @@ def example_prompt_embeds(hf_runner):
return [_encode_embeds(item) for item in example_embeddings]
@pytest.fixture(scope="module", params=["", "--disable-frontend-multiprocessing"])
def server_with_prompt_embeds(default_server_args, request):
if request.param:
default_server_args.append(request.param)
@pytest.fixture(scope="module")
def server_with_prompt_embeds(default_server_args):
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield remote_server
......
......@@ -11,11 +11,10 @@ import pytest
import regex as re
import torch
from tests.utils import RemoteOpenAIServer
from vllm.config import ModelConfig
from vllm.renderers.embed_utils import safe_load_prompt_embeds
from ...utils import RemoteOpenAIServer
@pytest.mark.asyncio
async def test_empty_prompt():
......
......@@ -150,7 +150,6 @@ async def test_shutdown_on_engine_failure():
"0.05",
"--max-num-seqs",
"2",
"--disable-frontend-multiprocessing",
],
# ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when
# stdout/stderr pipes are enabled during ROCm GPU initialization.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment