Commit a99300bd authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.2rc1' into v0.10.2rc1-dev

parents cc3e01c7 5438967f
......@@ -18,7 +18,8 @@ from vllm.distributed import (broadcast_tensor_dict, get_pp_group,
tensor_model_parallel_all_reduce,
tensor_model_parallel_reduce_scatter)
from ..utils import init_test_distributed_environment, multi_process_parallel
from ..utils import (init_test_distributed_environment, multi_gpu_test,
multi_process_parallel)
@ray.remote(num_gpus=1, max_calls=1)
......@@ -226,8 +227,7 @@ def send_recv_test_worker(
torch.testing.assert_close(test_tensor, recv_tensor)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("test_target", [
all_reduce_test_worker, all_gather_test_worker,
......@@ -241,8 +241,7 @@ def test_multi_process_tensor_parallel(
multi_process_parallel(monkeypatch, tp_size, 1, test_target)
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason="Need at least 2 GPUs to run the test.")
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize(
"test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
......@@ -254,8 +253,7 @@ def test_multi_process_pipeline_parallel(
multi_process_parallel(monkeypatch, 1, pp_size, test_target)
@pytest.mark.skipif(torch.cuda.device_count() < 4,
reason="Need at least 4 GPUs to run the test.")
@multi_gpu_test(num_gpus=4)
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pp_size", [2])
@pytest.mark.parametrize("test_target", [
......
......@@ -118,6 +118,8 @@ class PPTestSettings:
multi_node_only: bool = False,
load_format: Optional[str] = None,
):
vllm_major_versions = ["1"] if runner == "pooling" else ["0"]
return PPTestSettings(
parallel_setups=[
ParallelSetup(tp_size=tp_base,
......@@ -126,7 +128,7 @@ class PPTestSettings:
chunked_prefill=False),
],
distributed_backends=["mp"],
vllm_major_versions=["0"],
vllm_major_versions=vllm_major_versions,
runner=runner,
test_options=PPTestOptions(multi_node_only=multi_node_only,
load_format=load_format),
......@@ -213,7 +215,9 @@ TEXT_GENERATION_MODELS = {
EMBEDDING_MODELS = { # type: ignore[var-annotated]
# [Text-only]
os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct"): PPTestSettings.fast(runner="pooling"),
os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"): PPTestSettings.fast(runner="pooling"),
# TODO: re-enable when https://github.com/vllm-project/vllm/issues/23883
# is fixed
#"BAAI/bge-multilingual-gemma2": PPTestSettings.fast(runner="pooling"),
os.path.join(models_path_prefix, "Qwen/Qwen2.5-Math-RM-72B"): PPTestSettings.fast(
load_format="dummy", runner="pooling"
),
......@@ -233,6 +237,7 @@ MULTIMODAL_MODELS = {
os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "allenai/Molmo-7B-D-0924"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "AIDC-AI/Ovis2-1B"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "AIDC-AI/Ovis2.5-2B"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"): PPTestSettings.fast(),
os.path.join(models_path_prefix, "mistralai/Pixtral-12B-2409"): PPTestSettings.fast(load_format="dummy"),
os.path.join(models_path_prefix, "Qwen/Qwen-VL-Chat"): PPTestSettings.fast(),
......
......@@ -18,7 +18,6 @@ if TYPE_CHECKING:
])
@pytest.mark.parametrize("ATTN_BACKEND", [
"FLASH_ATTN",
# "FLASHINFER",
])
@create_new_process_for_each_test()
def test_pp_cudagraph(
......
......@@ -292,7 +292,7 @@ SP_TEST_MODELS = [
# TODO support other models
# [LANGUAGE GENERATION]
"meta-llama/Llama-3.2-1B-Instruct",
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
]
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random
import typing
import pytest
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import vllm.envs as envs
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.distributed.communication_op import tensor_model_parallel_all_reduce
from vllm.distributed.device_communicators.cuda_communicator import (
CudaCommunicator)
from vllm.distributed.parallel_state import (get_tensor_model_parallel_group,
get_tp_group,
init_distributed_environment,
initialize_model_parallel)
from vllm.platforms import current_platform
from vllm.utils import update_environment_variables
torch.manual_seed(42)
random.seed(44)
test_size_elements = 4 * 1024 * 1024
def symm_mem_allreduce_worker(local_rank: int, world_size: int):
monkeypatch = pytest.MonkeyPatch()
with monkeypatch.context() as m:
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
dtype = torch.bfloat16
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
torch.set_default_device(device)
torch.set_default_dtype(dtype)
update_environment_variables({
'RANK': str(local_rank),
'LOCAL_RANK': str(local_rank),
'WORLD_SIZE': str(world_size),
'MASTER_ADDR': 'localhost',
'MASTER_PORT': '12345',
})
init_distributed_environment()
initialize_model_parallel(tensor_model_parallel_size=world_size)
cuda_communicator = typing.cast(CudaCommunicator,
get_tp_group().device_communicator)
symm_mem_comm = cuda_communicator.symm_mem_comm
if symm_mem_comm is None or symm_mem_comm.disabled:
pytest.skip("SymmMemCommunicator is not available or disabled.")
inp_direct_symm_mem = torch.randint(1,
23, (test_size_elements, ),
dtype=dtype,
device=device)
if not symm_mem_comm.should_use_symm_mem(inp_direct_symm_mem):
pytest.skip(
"SymmMemCommunicator isn't used for this world and input size."
)
original_inp_direct_symm_mem = inp_direct_symm_mem.clone()
out_direct_symm_mem = symm_mem_comm.all_reduce(inp_direct_symm_mem)
assert out_direct_symm_mem is not None
group = get_tensor_model_parallel_group().device_group
dist.all_reduce(original_inp_direct_symm_mem, group=group)
torch.testing.assert_close(out_direct_symm_mem,
original_inp_direct_symm_mem,
atol=2.5,
rtol=0.1)
# Test tensor_model_parallel_all_reduce which should use symm_mem
inp_tensor_parallel = torch.randint(-23,
1, (test_size_elements, ),
dtype=dtype,
device=device)
original_inp_tensor_parallel = inp_tensor_parallel.clone()
out_tensor_parallel = tensor_model_parallel_all_reduce(
inp_tensor_parallel)
dist.all_reduce(original_inp_tensor_parallel, group=group)
torch.testing.assert_close(out_tensor_parallel,
original_inp_tensor_parallel,
atol=2.5,
rtol=0.1)
@pytest.mark.skipif(
not current_platform.is_cuda(),
reason="SymmMemAllreduce is only available for CUDA platforms.")
@pytest.mark.parametrize("tp_size", [2])
@pytest.mark.parametrize("pipeline_parallel_size", [1])
@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
reason="Only test on CUDA")
def test_symm_mem_allreduce(monkeypatch: pytest.MonkeyPatch, tp_size,
pipeline_parallel_size):
world_size = tp_size * pipeline_parallel_size
if world_size > torch.cuda.device_count():
pytest.skip("Not enough GPUs to run the test.")
# Enable SymmMemCommunicator
monkeypatch.setenv("VLLM_ALLREDUCE_USE_SYMM_MEM", "1")
mp.spawn(symm_mem_allreduce_worker, args=(world_size, ), nprocs=world_size)
cleanup_dist_env_and_memory()
......@@ -20,10 +20,9 @@ def text_llm():
enforce_eager=True,
seed=0)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)
del llm
del llm
cleanup_dist_env_and_memory()
......@@ -90,10 +89,9 @@ def vision_llm():
seed=0,
)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)
del llm
del llm
cleanup_dist_env_and_memory()
......@@ -160,10 +158,9 @@ def thinking_llm():
seed=0,
)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)
del llm
del llm
cleanup_dist_env_and_memory()
......
......@@ -16,14 +16,6 @@ MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
prompts = ["The chef prepared a delicious meal."]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
......@@ -35,10 +27,9 @@ def llm():
enforce_eager=True,
seed=0)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)
del llm
del llm
cleanup_dist_env_and_memory()
......@@ -71,3 +62,9 @@ def test_encode_api(llm: LLM):
err_msg = "pooling_task must be one of.+"
with pytest.raises(ValueError, match=err_msg):
llm.encode(prompts, use_tqdm=False)
def test_score_api(llm: LLM):
err_msg = "Score API is only enabled for num_labels == 1."
with pytest.raises(ValueError, match=err_msg):
llm.score("ping", "pong", use_tqdm=False)
......@@ -26,10 +26,9 @@ def llm():
enforce_eager=True,
seed=0)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)
del llm
del llm
cleanup_dist_env_and_memory()
......
......@@ -6,12 +6,10 @@ import weakref
import pytest
import os
from vllm import LLM, PoolingParams, PoolingRequestOutput
from vllm import LLM, PoolingParams
from vllm.distributed import cleanup_dist_env_and_memory
from ...utils import models_path_prefix
from ...models.utils import check_embeddings_close
MODEL_NAME = os.path.join(models_path_prefix, "intfloat/multilingual-e5-small")
PROMPTS = [
......@@ -31,14 +29,6 @@ TOKEN_IDS = [
]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
......@@ -50,57 +40,13 @@ def llm():
enforce_eager=True,
seed=0)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)
del llm
del llm
cleanup_dist_env_and_memory()
def assert_outputs_match(o1: list[PoolingRequestOutput],
o2: list[PoolingRequestOutput]):
check_embeddings_close(
embeddings_0_lst=[o.outputs.data for o in o1],
embeddings_1_lst=[o.outputs.data for o in o2],
name_0="hf",
name_1="vllm",
tol=1e-2,
)
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
prompt_token_ids):
pooling_params = PoolingParams()
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.encode(prompt_token_ids=prompt_token_ids,
pooling_params=pooling_params)
v2_output = llm.encode({"prompt_token_ids": prompt_token_ids},
pooling_params=pooling_params)
assert_outputs_match(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
pooling_params = PoolingParams()
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.encode(prompt_token_ids=TOKEN_IDS,
pooling_params=pooling_params)
v2_output = llm.encode(
[{
"prompt_token_ids": p
} for p in TOKEN_IDS],
pooling_params=pooling_params,
)
assert_outputs_match(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_multiple_pooling_params(llm: LLM):
pooling_params = [
......
......@@ -6,7 +6,7 @@ import os
import pytest
from vllm import LLM, RequestOutput, SamplingParams
from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory
from ...utils import models_path_prefix
......@@ -43,50 +43,13 @@ def llm():
gpu_memory_utilization=0.10,
enforce_eager=True)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)
del llm
del llm
cleanup_dist_env_and_memory()
def assert_outputs_equal(o1: list[RequestOutput], o2: list[RequestOutput]):
assert [o.outputs for o in o1] == [o.outputs for o in o2]
@pytest.mark.skip_global_cleanup
@pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS)
def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM,
prompt_token_ids):
sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.generate(prompt_token_ids=prompt_token_ids,
sampling_params=sampling_params)
v2_output = llm.generate({"prompt_token_ids": prompt_token_ids},
sampling_params=sampling_params)
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM):
sampling_params = SamplingParams(temperature=0.0, top_p=1.0)
with pytest.warns(DeprecationWarning, match="'prompt_token_ids'"):
v1_output = llm.generate(prompt_token_ids=TOKEN_IDS,
sampling_params=sampling_params)
v2_output = llm.generate(
[{
"prompt_token_ids": p
} for p in TOKEN_IDS],
sampling_params=sampling_params,
)
assert_outputs_equal(v1_output, v2_output)
@pytest.mark.skip_global_cleanup
def test_multiple_sampling_params(llm: LLM):
sampling_params = [
......
......@@ -16,14 +16,6 @@ MODEL_NAME = "internlm/internlm2-1_8b-reward"
prompts = ["The chef prepared a delicious meal."]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
......@@ -36,10 +28,9 @@ def llm():
trust_remote_code=True,
seed=0)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)
del llm
del llm
cleanup_dist_env_and_memory()
......
......@@ -14,14 +14,6 @@ from ...models.utils import softmax
MODEL_NAME = "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.fixture(scope="module")
def llm():
# pytest caches the fixture so we use weakref.proxy to
......@@ -33,10 +25,9 @@ def llm():
enforce_eager=True,
seed=0)
with llm.deprecate_legacy_api():
yield weakref.proxy(llm)
yield weakref.proxy(llm)
del llm
del llm
cleanup_dist_env_and_memory()
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for HF_HUB_OFFLINE mode"""
import dataclasses
import importlib
import sys
import os
......@@ -10,9 +11,9 @@ import urllib3
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.arg_utils import EngineArgs
from ...utils import models_path_prefix
MODEL_CONFIGS = [
{
"model": os.path.join(models_path_prefix, "facebook/opt-125m"),
......@@ -33,15 +34,16 @@ MODEL_CONFIGS = [
"tensor_parallel_size": 1,
"tokenizer_mode": "mistral",
},
{
"model": "sentence-transformers/all-MiniLM-L12-v2",
"enforce_eager": True,
"gpu_memory_utilization": 0.20,
"max_model_len": 64,
"max_num_batched_tokens": 64,
"max_num_seqs": 64,
"tensor_parallel_size": 1,
},
# TODO: re-enable once these tests are run with V1
# {
# "model": "sentence-transformers/all-MiniLM-L12-v2",
# "enforce_eager": True,
# "gpu_memory_utilization": 0.20,
# "max_model_len": 64,
# "max_num_batched_tokens": 64,
# "max_num_seqs": 64,
# "tensor_parallel_size": 1,
# },
]
......@@ -111,3 +113,36 @@ def _re_import_modules():
# Error this test if reloading a module failed
if reload_exception is not None:
raise reload_exception
@pytest.mark.skip_global_cleanup
@pytest.mark.usefixtures("cache_models")
def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
# Set HF to offline mode and ensure we can still construct an LLM
with monkeypatch.context() as m:
try:
m.setenv("HF_HUB_OFFLINE", "1")
m.setenv("VLLM_NO_USAGE_STATS", "1")
def disable_connect(*args, **kwargs):
raise RuntimeError("No http calls allowed")
m.setattr(
urllib3.connection.HTTPConnection,
"connect",
disable_connect,
)
m.setattr(
urllib3.connection.HTTPSConnection,
"connect",
disable_connect,
)
# Need to re-import huggingface_hub
# and friends to setup offline mode
_re_import_modules()
engine_args = EngineArgs(model="facebook/opt-125m")
LLM(**dataclasses.asdict(engine_args))
finally:
# Reset the environment after the test
# NB: Assuming tests are run in online mode
_re_import_modules()
......@@ -49,8 +49,7 @@ async def transcribe_audio(client, tokenizer, y, sr):
return latency, num_output_tokens, transcription.text
async def bound_transcribe(model_name, sem, client, audio, reference):
tokenizer = AutoTokenizer.from_pretrained(model_name)
async def bound_transcribe(sem, client, tokenizer, audio, reference):
# Use semaphore to limit concurrent requests.
async with sem:
result = await transcribe_audio(client, tokenizer, *audio)
......@@ -63,15 +62,19 @@ async def bound_transcribe(model_name, sem, client, audio, reference):
async def process_dataset(model, client, data, concurrent_request):
sem = asyncio.Semaphore(concurrent_request)
# Load tokenizer once outside the loop
tokenizer = AutoTokenizer.from_pretrained(model)
# Warmup call as the first `librosa.load` server-side is quite slow.
audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"]
_ = await bound_transcribe(model, sem, client, (audio, sr), "")
_ = await bound_transcribe(sem, client, tokenizer, (audio, sr), "")
tasks: list[asyncio.Task] = []
for sample in data:
audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
task = asyncio.create_task(
bound_transcribe(model, sem, client, (audio, sr), sample["text"]))
bound_transcribe(sem, client, tokenizer, (audio, sr),
sample["text"]))
tasks.append(task)
return await asyncio.gather(*tasks)
......
......@@ -226,3 +226,33 @@ def test_pooling(server: RemoteOpenAIServer, model_name: str):
},
)
assert response.json()["error"]["type"] == "BadRequestError"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_score(server: RemoteOpenAIServer, model_name: str):
# score api is only enabled for num_labels == 1.
response = requests.post(
server.url_for("score"),
json={
"model": model_name,
"text_1": "ping",
"text_2": "pong",
},
)
assert response.json()["error"]["type"] == "BadRequestError"
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_rerank(server: RemoteOpenAIServer, model_name: str):
# rerank api is only enabled for num_labels == 1.
response = requests.post(
server.url_for("rerank"),
json={
"model": model_name,
"query": "ping",
"documents": ["pong"],
},
)
assert response.json()["error"]["type"] == "BadRequestError"
......@@ -27,6 +27,28 @@ def serve_parser():
return make_arg_parser(parser)
### Test config parsing
def test_config_arg_parsing(serve_parser, cli_config_file):
args = serve_parser.parse_args([])
assert args.port == 8000
args = serve_parser.parse_args(['--config', cli_config_file])
assert args.port == 12312
args = serve_parser.parse_args([
'--config',
cli_config_file,
'--port',
'9000',
])
assert args.port == 9000
args = serve_parser.parse_args([
'--port',
'9000',
'--config',
cli_config_file,
])
assert args.port == 9000
### Tests for LoRA module parsing
def test_valid_key_value_format(serve_parser):
# Test old format: name=path
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
import pytest
import requests
from tests.utils import RemoteOpenAIServer
MODEL_NAME = "Qwen/Qwen3-0.6B"
class TestWorkerExtension:
def get_model_name(self) -> str:
"""Test non-pydantic return type."""
return MODEL_NAME
def echo_args_kwargs(self, *args, **kwargs) -> dict[str, Any]:
"""Echo back both args and kwargs."""
return dict(
args=list(args),
kwargs=kwargs,
total_items=len(args) + len(kwargs),
)
def return_none(self, *args, **kwargs) -> None:
"""Test method that does not return anything"""
return
@pytest.fixture(scope="module")
def server():
args = [
"--max-model-len",
"8192",
"--max-num-seqs",
"128",
"--worker-extension-cls",
"tests.entrypoints.openai.test_collective_rpc.TestWorkerExtension",
]
with RemoteOpenAIServer(
MODEL_NAME,
args,
env_dict={
"VLLM_SERVER_DEV_MODE": "1",
"CUDA_VISIBLE_DEVICES": "0"
},
) as remote_server:
yield remote_server
def test_get_model_name(server):
"""Test basic response"""
response = requests.post(server.url_for("collective_rpc"),
json={"method": "get_model_name"})
assert response.status_code == 200
results = response.json()
assert "results" in results
assert results["results"] == [MODEL_NAME]
def test_return_none(server):
"""Test return none"""
response = requests.post(server.url_for("collective_rpc"),
json={"method": "return_none"})
assert response.status_code == 200
results = response.json()
assert results["results"] == [None]
def test_echo_args_kwargs(server):
"""Test args, kwargs, and dict response"""
args = ["arg1", "arg2"]
kwargs = {"key1": "value1", "key2": "value2"}
response = requests.post(server.url_for("collective_rpc"),
json={
"method": "echo_args_kwargs",
"args": args,
"kwargs": kwargs
})
assert response.status_code == 200
results = response.json()
result = results["results"][0]
assert result["args"] == args
assert result["kwargs"] == kwargs
assert result["total_items"] == len(args) + len(kwargs)
......@@ -13,6 +13,127 @@ from ...utils import RemoteOpenAIServer
# any model with a chat template should work here
MODEL_NAME = "Qwen/Qwen3-0.6B"
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description":
"The city to find the weather for, e.g. 'Vienna'",
"default": "Vienna",
},
"country": {
"type":
"string",
"description":
"The country that the city is in, e.g. 'Austria'",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
"options": {
"$ref": "#/$defs/WeatherOptions",
"description": "Optional parameters for weather query",
},
},
"required": ["country", "unit"],
"$defs": {
"WeatherOptions": {
"title": "WeatherOptions",
"type": "object",
"additionalProperties": False,
"properties": {
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"default": "celsius",
"description": "Temperature unit",
"title": "Temperature Unit",
},
"include_forecast": {
"type": "boolean",
"default": False,
"description":
"Whether to include a 24-hour forecast",
"title": "Include Forecast",
},
"language": {
"type": "string",
"default": "zh-CN",
"description": "Language of the response",
"title": "Language",
"enum": ["zh-CN", "en-US", "ja-JP"],
},
},
},
},
},
},
},
{
"type": "function",
"function": {
"name": "get_forecast",
"description": "Get the weather forecast for a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description":
"The city to get the forecast for, e.g. 'Vienna'",
"default": "Vienna",
},
"country": {
"type":
"string",
"description":
"The country that the city is in, e.g. 'Austria'",
},
"days": {
"type":
"integer",
"description":
"Number of days to get the forecast for (1-7)",
},
"unit": {
"type": "string",
"description": "The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["country", "days", "unit"],
},
},
},
]
messages = [
{
"role": "user",
"content": "Hi! How are you doing today?"
},
{
"role": "assistant",
"content": "I'm doing well! How can I help you?"
},
{
"role":
"user",
"content":
"Can you tell me what the current weather is in Berlin and the "\
"forecast for the next 5 days, in fahrenheit?",
},
]
@pytest.fixture(scope="module")
def server(): # noqa: F811
......@@ -27,6 +148,8 @@ def server(): # noqa: F811
"hermes",
"--reasoning-parser",
"qwen3",
"--gpu-memory-utilization",
"0.4"
]
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
......@@ -54,129 +177,6 @@ async def client(server):
async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str,
stream: bool, tool_choice: Union[str, dict],
enable_thinking: bool):
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description":
"The city to find the weather for, e.g. 'Vienna'",
"default": "Vienna",
},
"country": {
"type":
"string",
"description":
"The country that the city is in, e.g. 'Austria'",
},
"unit": {
"type": "string",
"description":
"The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
"options": {
"$ref": "#/$defs/WeatherOptions",
"description":
"Optional parameters for weather query",
},
},
"required": ["country", "unit"],
"$defs": {
"WeatherOptions": {
"title": "WeatherOptions",
"type": "object",
"additionalProperties": False,
"properties": {
"unit": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"default": "celsius",
"description": "Temperature unit",
"title": "Temperature Unit",
},
"include_forecast": {
"type": "boolean",
"default": False,
"description":
"Whether to include a 24-hour forecast",
"title": "Include Forecast",
},
"language": {
"type": "string",
"default": "zh-CN",
"description": "Language of the response",
"title": "Language",
"enum": ["zh-CN", "en-US", "ja-JP"],
},
},
},
},
},
},
},
{
"type": "function",
"function": {
"name": "get_forecast",
"description": "Get the weather forecast for a given location",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description":
"The city to get the forecast for, e.g. 'Vienna'",
"default": "Vienna",
},
"country": {
"type":
"string",
"description":
"The country that the city is in, e.g. 'Austria'",
},
"days": {
"type":
"integer",
"description":
"Number of days to get the forecast for (1-7)",
},
"unit": {
"type": "string",
"description":
"The unit to fetch the temperature in",
"enum": ["celsius", "fahrenheit"],
},
},
"required": ["country", "days", "unit"],
},
},
},
]
messages = [
{
"role": "user",
"content": "Hi! How are you doing today?"
},
{
"role": "assistant",
"content": "I'm doing well! How can I help you?"
},
{
"role":
"user",
"content":
"Can you tell me what the current weather is in Berlin and the "\
"forecast for the next 5 days, in fahrenheit?",
},
]
if not stream:
# Non-streaming test
chat_completion = await client.chat.completions.create(
......@@ -216,3 +216,71 @@ async def test_function_tool_use(client: openai.AsyncOpenAI, model_name: str,
output.extend(chunk.choices[0].delta.tool_calls)
assert len(output) > 0
@pytest.fixture(scope="module")
def k2_server(): # noqa: F811
args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
"half",
"--enable-auto-tool-choice",
"--guided-decoding-backend",
"xgrammar",
"--tool-call-parser",
"hermes",
"--reasoning-parser",
"qwen3",
"--gpu-memory-utilization",
"0.4",
]
# hack to test kimi_k2 tool use tool_id format.
# avoid error in is_deepseek_mla check by setting kv_lora_rank=null
with RemoteOpenAIServer(MODEL_NAME,
args,
override_hf_configs={
"model_type": 'kimi_k2',
'kv_lora_rank': None
}) as remote_server:
yield remote_server
@pytest_asyncio.fixture
async def k2_client(k2_server):
async with k2_server.get_async_client() as async_client:
yield async_client
@pytest.mark.asyncio
@pytest.mark.parametrize("model_name", [MODEL_NAME])
@pytest.mark.parametrize("stream", [True, False])
@pytest.mark.parametrize("tool_choice", ["required"])
async def test_tool_id_kimi_k2(k2_client: openai.AsyncOpenAI, model_name: str,
stream: bool, tool_choice: str):
if not stream:
# Non-streaming test
chat_completion = await k2_client.chat.completions.create(
messages=messages,
model=model_name,
tools=tools,
tool_choice=tool_choice)
assert chat_completion.choices[0].message.tool_calls is not None
assert len(chat_completion.choices[0].message.tool_calls) > 0
assert chat_completion.choices[0].message.tool_calls[
0].id == 'functions.get_current_weather:0'
else:
# Streaming test
output_stream = await k2_client.chat.completions.create(
messages=messages,
model=model_name,
tools=tools,
tool_choice=tool_choice,
stream=True)
output = []
async for chunk in output_stream:
if chunk.choices and chunk.choices[0].delta.tool_calls:
output.extend(chunk.choices[0].delta.tool_calls)
for o in output:
assert o.id is None or o.id == 'functions.get_current_weather:0'
......@@ -26,14 +26,6 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' +
DTYPE = "bfloat16"
@pytest.fixture(autouse=True)
def v1(run_with_both_engines):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.fixture(scope="module")
def server():
args = [
......
......@@ -47,6 +47,7 @@ class MockModelConfig:
allowed_local_media_path: str = ""
encoder_config = None
generation_config: str = "auto"
skip_tokenizer_init: bool = False
def get_diff_sampling_param(self):
return self.diff_sampling_param or {}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment