Commit 0da93439 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

parents 25f2f756 298e5108
......@@ -11,11 +11,10 @@ import pytest_asyncio
import requests
from fastapi import Request
from tests.utils import RemoteOpenAIServer
from vllm.v1.engine.exceptions import EngineDeadError
from vllm.version import __version__ as VLLM_VERSION
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
......@@ -28,7 +27,7 @@ def server_args(request: pytest.FixtureRequest) -> list[str]:
>>> @pytest.mark.parametrize(
>>> "server_args",
>>> [
>>> ["--disable-frontend-multiprocessing"],
>>> ["--max-model-len", "10100"],
>>> [
>>> "--model=NousResearch/Hermes-3-Llama-3.1-70B",
>>> "--enable-auto-tool-choice",
......@@ -40,7 +39,7 @@ def server_args(request: pytest.FixtureRequest) -> list[str]:
>>> ...
This will run `test_foo` twice with servers with:
- `--disable-frontend-multiprocessing`
- `--max-model-len 10100`
- `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.
"""
......@@ -79,17 +78,6 @@ async def client(server):
yield async_client
@pytest.mark.parametrize(
"server_args",
[
pytest.param([], id="default-frontend-multiprocessing"),
pytest.param(
["--disable-frontend-multiprocessing"],
id="disable-frontend-multiprocessing",
),
],
indirect=True,
)
@pytest.mark.asyncio
async def test_show_version(server: RemoteOpenAIServer):
response = requests.get(server.url_for("version"))
......@@ -98,17 +86,6 @@ async def test_show_version(server: RemoteOpenAIServer):
assert response.json() == {"version": VLLM_VERSION}
@pytest.mark.parametrize(
"server_args",
[
pytest.param([], id="default-frontend-multiprocessing"),
pytest.param(
["--disable-frontend-multiprocessing"],
id="disable-frontend-multiprocessing",
),
],
indirect=True,
)
@pytest.mark.asyncio
async def test_check_health(server: RemoteOpenAIServer):
response = requests.get(server.url_for("health"))
......@@ -119,13 +96,7 @@ async def test_check_health(server: RemoteOpenAIServer):
@pytest.mark.parametrize(
"server_args",
[
pytest.param(
["--max-model-len", "10100"], id="default-frontend-multiprocessing"
),
pytest.param(
["--disable-frontend-multiprocessing", "--max-model-len", "10100"],
id="disable-frontend-multiprocessing",
),
pytest.param(["--max-model-len", "10100"]),
],
indirect=True,
)
......
......@@ -50,7 +50,6 @@ def default_server_args():
params=[
"",
"--enable-chunked-prefill",
"--disable-frontend-multiprocessing",
f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
],
)
......
......@@ -10,7 +10,7 @@ from http import HTTPStatus
import pytest
import requests
from ...utils import RemoteOpenAIServer
from tests.utils import RemoteOpenAIServer
# Use a small embeddings model for faster startup and smaller memory footprint.
# Since we are not testing any chat functionality,
......
......@@ -5,7 +5,7 @@ import openai
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
from tests.utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen3-0.6B"
......
......@@ -10,7 +10,7 @@ import openai # use the official client for correctness check
import pytest
import pytest_asyncio
from ...utils import RemoteOpenAIServer
from tests.utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen3-0.6B"
......
......@@ -6,7 +6,7 @@ import httpx
import pytest
import pytest_asyncio
from ...utils import RemoteLaunchRenderServer
from tests.utils import RemoteLaunchRenderServer
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
......
......@@ -5,10 +5,9 @@ import pytest
import pytest_asyncio
import requests
from tests.utils import RemoteOpenAIServer
from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
......
......@@ -13,7 +13,7 @@ import json
import pytest
import requests
from ...utils import RemoteOpenAIServer
from tests.utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
......
# GFX942 model configurations for GPQA evaluation
# Tests different environment variable combinations
gpt-oss-20b-rocm-baseline.yaml
\ No newline at end of file
gpt-oss-20b-rocm-baseline.yaml
model_name: "deepseek-ai/DeepSeek-R1"
accuracy_threshold: 0.95
num_questions: 1319
num_fewshot: 5
startup_max_wait_seconds: 1200
server_args: >-
--enforce-eager
--max-model-len 4096
--data-parallel-size 8
--enable-expert-parallel
--attention-backend=TRITON_ATTN
--speculative-config '{"method":"mtp","num_speculative_tokens":3}'
model_name: "deepseek-ai/DeepSeek-R1"
accuracy_threshold: 0.95
num_questions: 1319
num_fewshot: 5
startup_max_wait_seconds: 1200
server_args: >-
--enforce-eager
--max-model-len 4096
--tensor-parallel-size 8
--enable-expert-parallel
--attention-backend=TRITON_ATTN
--speculative-config '{"method":"mtp","num_speculative_tokens":3}'
model_name: "deepseek-ai/DeepSeek-V3.2"
accuracy_threshold: 0.95
num_questions: 1319
num_fewshot: 5
startup_max_wait_seconds: 1200
server_args: >-
--enforce-eager
--max-model-len 4096
--data-parallel-size 8
--enable-expert-parallel
--attention-backend=TRITON_ATTN
--speculative-config '{"method":"mtp","num_speculative_tokens":3}'
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment