Commit 0da93439 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.18.1rc0' into v0.18.1rc0-ori

parents 25f2f756 298e5108
...@@ -11,11 +11,10 @@ import pytest_asyncio ...@@ -11,11 +11,10 @@ import pytest_asyncio
import requests import requests
from fastapi import Request from fastapi import Request
from tests.utils import RemoteOpenAIServer
from vllm.v1.engine.exceptions import EngineDeadError from vllm.v1.engine.exceptions import EngineDeadError
from vllm.version import __version__ as VLLM_VERSION from vllm.version import __version__ as VLLM_VERSION
from ...utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B" MODEL_NAME = "Qwen/Qwen3-0.6B"
...@@ -28,7 +27,7 @@ def server_args(request: pytest.FixtureRequest) -> list[str]: ...@@ -28,7 +27,7 @@ def server_args(request: pytest.FixtureRequest) -> list[str]:
>>> @pytest.mark.parametrize( >>> @pytest.mark.parametrize(
>>> "server_args", >>> "server_args",
>>> [ >>> [
>>> ["--disable-frontend-multiprocessing"], >>> ["--max-model-len", "10100"],
>>> [ >>> [
>>> "--model=NousResearch/Hermes-3-Llama-3.1-70B", >>> "--model=NousResearch/Hermes-3-Llama-3.1-70B",
>>> "--enable-auto-tool-choice", >>> "--enable-auto-tool-choice",
...@@ -40,7 +39,7 @@ def server_args(request: pytest.FixtureRequest) -> list[str]: ...@@ -40,7 +39,7 @@ def server_args(request: pytest.FixtureRequest) -> list[str]:
>>> ... >>> ...
This will run `test_foo` twice with servers with: This will run `test_foo` twice with servers with:
- `--disable-frontend-multiprocessing` - `--max-model-len 10100`
- `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`. - `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.
""" """
...@@ -79,17 +78,6 @@ async def client(server): ...@@ -79,17 +78,6 @@ async def client(server):
yield async_client yield async_client
@pytest.mark.parametrize(
"server_args",
[
pytest.param([], id="default-frontend-multiprocessing"),
pytest.param(
["--disable-frontend-multiprocessing"],
id="disable-frontend-multiprocessing",
),
],
indirect=True,
)
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_show_version(server: RemoteOpenAIServer): async def test_show_version(server: RemoteOpenAIServer):
response = requests.get(server.url_for("version")) response = requests.get(server.url_for("version"))
...@@ -98,17 +86,6 @@ async def test_show_version(server: RemoteOpenAIServer): ...@@ -98,17 +86,6 @@ async def test_show_version(server: RemoteOpenAIServer):
assert response.json() == {"version": VLLM_VERSION} assert response.json() == {"version": VLLM_VERSION}
@pytest.mark.parametrize(
"server_args",
[
pytest.param([], id="default-frontend-multiprocessing"),
pytest.param(
["--disable-frontend-multiprocessing"],
id="disable-frontend-multiprocessing",
),
],
indirect=True,
)
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_check_health(server: RemoteOpenAIServer): async def test_check_health(server: RemoteOpenAIServer):
response = requests.get(server.url_for("health")) response = requests.get(server.url_for("health"))
...@@ -119,13 +96,7 @@ async def test_check_health(server: RemoteOpenAIServer): ...@@ -119,13 +96,7 @@ async def test_check_health(server: RemoteOpenAIServer):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"server_args", "server_args",
[ [
pytest.param( pytest.param(["--max-model-len", "10100"]),
["--max-model-len", "10100"], id="default-frontend-multiprocessing"
),
pytest.param(
["--disable-frontend-multiprocessing", "--max-model-len", "10100"],
id="disable-frontend-multiprocessing",
),
], ],
indirect=True, indirect=True,
) )
......
...@@ -50,7 +50,6 @@ def default_server_args(): ...@@ -50,7 +50,6 @@ def default_server_args():
params=[ params=[
"", "",
"--enable-chunked-prefill", "--enable-chunked-prefill",
"--disable-frontend-multiprocessing",
f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}", f"--show-hidden-metrics-for-version={PREV_MINOR_VERSION}",
], ],
) )
......
...@@ -10,7 +10,7 @@ from http import HTTPStatus ...@@ -10,7 +10,7 @@ from http import HTTPStatus
import pytest import pytest
import requests import requests
from ...utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
# Use a small embeddings model for faster startup and smaller memory footprint. # Use a small embeddings model for faster startup and smaller memory footprint.
# Since we are not testing any chat functionality, # Since we are not testing any chat functionality,
......
...@@ -5,7 +5,7 @@ import openai ...@@ -5,7 +5,7 @@ import openai
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from ...utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen3-0.6B" MODEL_NAME = "Qwen/Qwen3-0.6B"
......
...@@ -10,7 +10,7 @@ import openai # use the official client for correctness check ...@@ -10,7 +10,7 @@ import openai # use the official client for correctness check
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from ...utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen3-0.6B" MODEL_NAME = "Qwen/Qwen3-0.6B"
......
...@@ -6,7 +6,7 @@ import httpx ...@@ -6,7 +6,7 @@ import httpx
import pytest import pytest
import pytest_asyncio import pytest_asyncio
from ...utils import RemoteLaunchRenderServer from tests.utils import RemoteLaunchRenderServer
MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM" MODEL_NAME = "hmellor/tiny-random-LlamaForCausalLM"
......
...@@ -5,10 +5,9 @@ import pytest ...@@ -5,10 +5,9 @@ import pytest
import pytest_asyncio import pytest_asyncio
import requests import requests
from tests.utils import RemoteOpenAIServer
from vllm.tokenizers import get_tokenizer from vllm.tokenizers import get_tokenizer
from ...utils import RemoteOpenAIServer
# any model with a chat template should work here # any model with a chat template should work here
MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta" MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
......
...@@ -13,7 +13,7 @@ import json ...@@ -13,7 +13,7 @@ import json
import pytest import pytest
import requests import requests
from ...utils import RemoteOpenAIServer from tests.utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct" MODEL_NAME = "Qwen/Qwen2.5-VL-3B-Instruct"
......
# GFX942 model configurations for GPQA evaluation # GFX942 model configurations for GPQA evaluation
# Tests different environment variable combinations # Tests different environment variable combinations
gpt-oss-20b-rocm-baseline.yaml gpt-oss-20b-rocm-baseline.yaml
\ No newline at end of file
model_name: "deepseek-ai/DeepSeek-R1"
accuracy_threshold: 0.95
num_questions: 1319
num_fewshot: 5
startup_max_wait_seconds: 1200
server_args: >-
--enforce-eager
--max-model-len 4096
--data-parallel-size 8
--enable-expert-parallel
--attention-backend=TRITON_ATTN
--speculative-config '{"method":"mtp","num_speculative_tokens":3}'
model_name: "deepseek-ai/DeepSeek-R1"
accuracy_threshold: 0.95
num_questions: 1319
num_fewshot: 5
startup_max_wait_seconds: 1200
server_args: >-
--enforce-eager
--max-model-len 4096
--tensor-parallel-size 8
--enable-expert-parallel
--attention-backend=TRITON_ATTN
--speculative-config '{"method":"mtp","num_speculative_tokens":3}'
model_name: "deepseek-ai/DeepSeek-V3.2"
accuracy_threshold: 0.95
num_questions: 1319
num_fewshot: 5
startup_max_wait_seconds: 1200
server_args: >-
--enforce-eager
--max-model-len 4096
--data-parallel-size 8
--enable-expert-parallel
--attention-backend=TRITON_ATTN
--speculative-config '{"method":"mtp","num_speculative_tokens":3}'
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment