Commit 0da93439 authored by zhuwenwen
Browse files

Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

parents 25f2f756 298e5108
...@@ -7,11 +7,10 @@ import openai ...@@ -7,11 +7,10 @@ import openai
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from vllm.multimodal.utils import encode_audio_base64, encode_audio_url, fetch_audio from vllm.multimodal.utils import encode_audio_base64, encode_audio_url, fetch_audio
from ...utils import RemoteOpenAIServer
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b" MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
TEST_AUDIO_URLS = [ TEST_AUDIO_URLS = [
AudioAsset("winning_call").url, AudioAsset("winning_call").url,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import json import json
import openai import openai
import pybase64 as base64
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from ...conftest import VideoTestAssets from tests.conftest import VideoTestAssets
from ...utils import RemoteOpenAIServer from tests.utils import ROCM_EXTRA_ARGS, RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen2.5-Omni-3B" MODEL_NAME = "Qwen/Qwen2.5-Omni-3B"
...@@ -22,6 +22,7 @@ def server(): ...@@ -22,6 +22,7 @@ def server():
"--enforce-eager", "--enforce-eager",
"--limit-mm-per-prompt", "--limit-mm-per-prompt",
json.dumps({"audio": 3, "video": 3}), json.dumps({"audio": 3, "video": 3}),
*ROCM_EXTRA_ARGS,
] ]
with RemoteOpenAIServer( with RemoteOpenAIServer(
......
...@@ -111,7 +111,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: ...@@ -111,7 +111,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
[{"prompt_token_ids": [1, 2, 3]}], [{"prompt_token_ids": [1, 2, 3]}],
) )
serving_chat.openai_serving_render._preprocess_chat = AsyncMock( serving_chat.openai_serving_render.preprocess_chat = AsyncMock(
side_effect=_fake_preprocess_chat side_effect=_fake_preprocess_chat
) )
return serving_chat return serving_chat
......
...@@ -231,13 +231,14 @@ def k2_server(): ...@@ -231,13 +231,14 @@ def k2_server():
"--gpu-memory-utilization", "--gpu-memory-utilization",
"0.4", "0.4",
] + ROCM_EXTRA_ARGS ] + ROCM_EXTRA_ARGS
# hack to test kimi_k2 tool use tool_id format. # Test kimi_k2 tool use tool_id format by overriding model_type.
# avoid error in is_deepseek_mla check by setting kv_lora_rank=null # is_deepseek_mla safely returns False via getattr when kv_lora_rank
# is absent from the underlying config.
with RemoteOpenAIServer( with RemoteOpenAIServer(
MODEL_NAME, MODEL_NAME,
args, args,
env_dict=ROCM_ENV_OVERRIDES, env_dict=ROCM_ENV_OVERRIDES,
override_hf_configs={"model_type": "kimi_k2", "kv_lora_rank": None}, override_hf_configs={"model_type": "kimi_k2"},
) as remote_server: ) as remote_server:
yield remote_server yield remote_server
......
...@@ -8,8 +8,8 @@ import pytest ...@@ -8,8 +8,8 @@ import pytest
import pytest_asyncio import pytest_asyncio
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from ...conftest import AudioTestAssets from tests.conftest import AudioTestAssets
from ...utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
# NOTE - the tests in this module are currently analogous to test_chat, but are # NOTE - the tests in this module are currently analogous to test_chat, but are
# separated to avoid OOM killing due to module-scoped servers, since we # separated to avoid OOM killing due to module-scoped servers, since we
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from ...utils import VLLM_PATH, RemoteOpenAIServer from tests.utils import VLLM_PATH, RemoteOpenAIServer
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja" chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists() assert chatml_jinja_path.exists()
......
...@@ -8,7 +8,7 @@ from typing import Any, NamedTuple ...@@ -8,7 +8,7 @@ from typing import Any, NamedTuple
import openai # use the official client for correctness check import openai # use the official client for correctness check
import pytest import pytest
from ...utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
# # any model with a chat template should work here # # any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
......
...@@ -484,7 +484,7 @@ class TestGPTOSSSpeculativeChat: ...@@ -484,7 +484,7 @@ class TestGPTOSSSpeculativeChat:
) )
content = "" content = ""
reasoning_content = "" reasoning = ""
async for chunk in stream: async for chunk in stream:
delta = chunk.choices[0].delta delta = chunk.choices[0].delta
if delta.content: if delta.content:
...@@ -492,9 +492,9 @@ class TestGPTOSSSpeculativeChat: ...@@ -492,9 +492,9 @@ class TestGPTOSSSpeculativeChat:
chunk_reasoning = getattr(delta, "reasoning", None) chunk_reasoning = getattr(delta, "reasoning", None)
if chunk_reasoning: if chunk_reasoning:
reasoning_content += delta.reasoning reasoning += delta.reasoning
assert len(reasoning_content) > 0, "No reasoning was generated." assert len(reasoning) > 0, "No reasoning was generated."
assert content.strip() == "4" assert content.strip() == "4"
......
...@@ -7,11 +7,10 @@ import openai ...@@ -7,11 +7,10 @@ import openai
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from tests.utils import RemoteOpenAIServer
from vllm.multimodal.utils import encode_video_url, fetch_video from vllm.multimodal.utils import encode_video_url, fetch_video
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ...utils import RemoteOpenAIServer
MODEL_NAME = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf" MODEL_NAME = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
MAXIMUM_VIDEOS = 3 MAXIMUM_VIDEOS = 3
......
...@@ -8,12 +8,11 @@ import pytest ...@@ -8,12 +8,11 @@ import pytest
import pytest_asyncio import pytest_asyncio
from transformers import AutoProcessor from transformers import AutoProcessor
from tests.utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
from vllm.multimodal.media import MediaWithBytes from vllm.multimodal.media import MediaWithBytes
from vllm.multimodal.utils import encode_image_url, fetch_image from vllm.multimodal.utils import encode_image_url, fetch_image
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ...utils import ROCM_ENV_OVERRIDES, ROCM_EXTRA_ARGS, RemoteOpenAIServer
MODEL_NAME = "microsoft/Phi-3.5-vision-instruct" MODEL_NAME = "microsoft/Phi-3.5-vision-instruct"
MAXIMUM_IMAGES = 2 MAXIMUM_IMAGES = 2
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import numpy as np import numpy as np
import pybase64 as base64
import pytest import pytest
import requests import requests
import torch import torch
from tests.utils import RemoteOpenAIServer
from vllm.utils.serial_utils import tensor2base64 from vllm.utils.serial_utils import tensor2base64
from ...utils import RemoteOpenAIServer
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"] "model_name", ["ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]
......
...@@ -26,19 +26,12 @@ def default_server_args(): ...@@ -26,19 +26,12 @@ def default_server_args():
"128", "128",
"--enforce-eager", "--enforce-eager",
"--enable-prompt-tokens-details", "--enable-prompt-tokens-details",
"--no-enable-prefix-caching",
] ]
@pytest.fixture( @pytest.fixture(scope="module")
scope="module", def server(default_server_args):
params=[
["--no-enable-prefix-caching"],
["--no-enable-prefix-caching", "--disable-frontend-multiprocessing"],
],
)
def server(default_server_args, request):
if request.param:
default_server_args = default_server_args + request.param
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield remote_server yield remote_server
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import io import io
import json import json
import openai # use the official client for correctness check import openai # use the official client for correctness check
import pybase64 as base64
import pytest import pytest
import pytest_asyncio import pytest_asyncio
import torch import torch
...@@ -14,7 +14,7 @@ import torch ...@@ -14,7 +14,7 @@ import torch
from openai import BadRequestError from openai import BadRequestError
from transformers import AutoConfig from transformers import AutoConfig
from ...utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m" MODEL_NAME = "facebook/opt-125m"
...@@ -83,11 +83,8 @@ def example_prompt_embeds(hf_runner): ...@@ -83,11 +83,8 @@ def example_prompt_embeds(hf_runner):
return [_encode_embeds(item) for item in example_embeddings] return [_encode_embeds(item) for item in example_embeddings]
@pytest.fixture(scope="module", params=["", "--disable-frontend-multiprocessing"]) @pytest.fixture(scope="module")
def server_with_prompt_embeds(default_server_args, request): def server_with_prompt_embeds(default_server_args):
if request.param:
default_server_args.append(request.param)
with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server: with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
yield remote_server yield remote_server
......
...@@ -11,11 +11,10 @@ import pytest ...@@ -11,11 +11,10 @@ import pytest
import regex as re import regex as re
import torch import torch
from tests.utils import RemoteOpenAIServer
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.renderers.embed_utils import safe_load_prompt_embeds from vllm.renderers.embed_utils import safe_load_prompt_embeds
from ...utils import RemoteOpenAIServer
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_empty_prompt(): async def test_empty_prompt():
......
...@@ -150,7 +150,6 @@ async def test_shutdown_on_engine_failure(): ...@@ -150,7 +150,6 @@ async def test_shutdown_on_engine_failure():
"0.05", "0.05",
"--max-num-seqs", "--max-num-seqs",
"2", "2",
"--disable-frontend-multiprocessing",
], ],
# ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when # ROCm: Disable stdout/stderr pipe capture. Subprocess hangs when
# stdout/stderr pipes are enabled during ROCm GPU initialization. # stdout/stderr pipes are enabled during ROCm GPU initialization.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment