test: add tool calling and reasoning tests for frontend on GPT-OSS (#3636)

Signed-off-by: zhongdaor <zhongdaor@nvidia.com>
Signed-off-by: zhongdaor-nv <zhongdaor@nvidia.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>

test: add tool calling and reasoning tests for frontend on GPT-OSS (#3636)
Signed-off-by: zhongdaor <zhongdaor@nvidia.com>
Signed-off-by: zhongdaor-nv <zhongdaor@nvidia.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
ea07d51f · zhongdaor-nv · GitHub · ae4e96a2 · ea07d51f
Unverified Commit ea07d51f authored Oct 15, 2025 by zhongdaor-nv Committed by GitHub Oct 15, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 418 additions and 0 deletions

tests/frontend/test_vllm.py tests/frontend/test_vllm.py +418 -0

No files found.
--- a/tests/frontend/reasoning_effort/test_reasoning_effort.py
+++ b/tests/frontend/reasoning_effort/test_reasoning_effort.py
@@ -13,13 +13,46 @@ from typing import Any, Dict, Optional, Tuple
 import pytest
 import requests

+from tests.conftest import EtcdServer, NatsServer
 from tests.utils.constants import GPT_OSS
 from tests.utils.managed_process import ManagedProcess
 from tests.utils.payloads import check_models_api

 logger = logging.getLogger(__name__)

-REASONING_TEST_MODEL = GPT_OSS
+TEST_MODEL = GPT_OSS
+
+WEATHER_TOOL = {
+    "type": "function",
+    "function": {
+        "name": "get_current_weather",
+        "description": "Get the current weather",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "location": {
+                    "type": "string",
+                    "description": "The city and state, e.g. San Francisco, CA",
+                },
+                "format": {
+                    "type": "string",
+                    "enum": ["celsius", "fahrenheit"],
+                    "description": "The temperature unit",
+                },
+            },
+            "required": ["location", "format"],
+        },
+    },
+}
+
+SYSTEM_HEALTH_TOOL = {
+    "type": "function",
+    "function": {
+        "name": "get_system_health",
+        "description": "Returns the current health status of the LLM runtime—use before critical operations to verify the service is live.",
+        "parameters": {"type": "object", "properties": {}},
+    },
+}


 class DynamoFrontendProcess(ManagedProcess):
@@ -46,10 +79,10 @@ class DynamoFrontendProcess(ManagedProcess):
        )


-class GPTOSSWorkerProcess(ManagedProcess):
-    """Worker process for GPT-OSS model."""
+class VllmWorkerProcess(ManagedProcess):
+    """vLLM worker process for the GPT-OSS model."""

-    def __init__(self, request, worker_id: str = "reasoning-worker"):
+    def __init__(self, request, worker_id: str = "vllm-worker"):
        self.worker_id = worker_id

        command = [
@@ -57,10 +90,7 @@ class GPTOSSWorkerProcess(ManagedProcess):
            "-m",
            "dynamo.vllm",
            "--model",
-            REASONING_TEST_MODEL,
-            "--connector",
-            "none",  # skip nixl registration, noticing long startup times in CI. Potentially a bug...
-            "--enforce-eager",
+            TEST_MODEL,
            "--dyn-tool-call-parser",
            "harmony",
            "--dyn-reasoning-parser",
@@ -111,43 +141,39 @@ class GPTOSSWorkerProcess(ManagedProcess):


 def _send_chat_request(
-    prompt: str,
-    reasoning_effort: str,
+    payload: Dict[str, Any],
    timeout: int = 180,
 ) -> requests.Response:
-    """Send a chat completion request with a specific reasoning effort."""
-
-    payload: Dict[str, Any] = {
-        "model": REASONING_TEST_MODEL,
-        "messages": [
-            {
-                "role": "user",
-                "content": prompt,
-            }
-        ],
-        "max_tokens": 2000,
-        "chat_template_args": {"reasoning_effort": reasoning_effort},
-    }
-
+    """Send a chat completion request with a specific payload."""
    headers = {"Content-Type": "application/json"}

-    logger.info(
-        "Sending chat completion request with reasoning effort '%s'", reasoning_effort
-    )
    response = requests.post(
        "http://localhost:8000/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
-    logger.info(
-        "Received response for reasoning effort '%s' with status code %s",
-        reasoning_effort,
-        response.status_code,
-    )
    return response


+@pytest.fixture(scope="module")
+def runtime_services(request):
+    """Module-scoped runtime services for this test file."""
+    with NatsServer(request) as nats_process:
+        with EtcdServer(request) as etcd_process:
+            yield nats_process, etcd_process
+
+
+@pytest.fixture(scope="module")
+def start_services(request, runtime_services):
+    """Start frontend and worker processes once for this module's tests."""
+    with DynamoFrontendProcess(request):
+        logger.info("Frontend started for tests")
+        with VllmWorkerProcess(request):
+            logger.info("Vllm Worker started for tests")
+            yield
+
+
 def _extract_reasoning_metrics(data: Dict[str, Any]) -> Tuple[str, Optional[int]]:
    """Return the reasoning content and optional reasoning token count from a response."""
    choices = data.get("choices") or []
@@ -172,16 +198,17 @@ def _validate_chat_response(response: requests.Response) -> Dict[str, Any]:
    assert (
        response.status_code == 200
    ), f"Chat request failed with status {response.status_code}: {response.text}"
-    payload = response.json()
-    if "choices" not in payload:
-        raise AssertionError(f"Chat response missing 'choices': {payload}")
-    return payload
+    response_json = response.json()
+    if "choices" not in response_json:
+        raise AssertionError(f"Chat response missing 'choices': {response_json}")
+    return response_json


+@pytest.mark.usefixtures("start_services")
 @pytest.mark.vllm
 @pytest.mark.gpu_1
 @pytest.mark.e2e
-@pytest.mark.model(REASONING_TEST_MODEL)
+@pytest.mark.model(TEST_MODEL)
 def test_reasoning_effort(request, runtime_services, predownload_models) -> None:
    """High reasoning effort should yield more detailed reasoning than low effort."""

@@ -190,37 +217,202 @@ def test_reasoning_effort(request, runtime_services, predownload_models) -> None
        "Focus on life-support, energy, and redundancy considerations."
    )

-    with DynamoFrontendProcess(request):
-        logger.info("Frontend started for reasoning effort test")
-
-        with GPTOSSWorkerProcess(request):
-            logger.info("Worker started for reasoning effort test")
-
-            low_response = _send_chat_request(prompt, reasoning_effort="low")
-            low_payload = _validate_chat_response(low_response)
-            low_reasoning_text, low_reasoning_tokens = _extract_reasoning_metrics(
-                low_payload
-            )
-
-            high_response = _send_chat_request(prompt, reasoning_effort="high")
-            high_payload = _validate_chat_response(high_response)
-            high_reasoning_text, high_reasoning_tokens = _extract_reasoning_metrics(
-                high_payload
-            )
-
-            logger.info(
-                "Low effort reasoning tokens: %s, High effort reasoning tokens: %s",
-                low_reasoning_tokens,
-                high_reasoning_tokens,
-            )
-
-            if low_reasoning_tokens is not None and high_reasoning_tokens is not None:
-                assert high_reasoning_tokens >= low_reasoning_tokens, (
-                    "Expected high reasoning effort to use at least as many reasoning tokens "
-                    f"as low effort (low={low_reasoning_tokens}, high={high_reasoning_tokens})"
-                )
-            else:
-                assert len(high_reasoning_text) > len(low_reasoning_text), (
-                    "Expected high reasoning effort response to include longer reasoning "
-                    "content than low effort"
-                )
+    logger.info("Start to test reasoning effort")
+    high_payload = {
+        "model": TEST_MODEL,
+        "messages": [
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        ],
+        "max_tokens": 2000,
+        "chat_template_args": {"reasoning_effort": "high"},
+    }
+
+    low_payload = {
+        "model": TEST_MODEL,
+        "messages": [
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        ],
+        "max_tokens": 2000,
+        "chat_template_args": {"reasoning_effort": "low"},
+    }
+
+    high_response = _send_chat_request(high_payload)
+    high_reasoning_text, high_reasoning_tokens = _extract_reasoning_metrics(
+        _validate_chat_response(high_response)
+    )
+
+    low_response = _send_chat_request(low_payload)
+    low_reasoning_text, low_reasoning_tokens = _extract_reasoning_metrics(
+        _validate_chat_response(low_response)
+    )
+
+    logger.info(
+        "Low effort reasoning tokens: %s, High effort reasoning tokens: %s",
+        low_reasoning_tokens,
+        high_reasoning_tokens,
+    )
+
+    if low_reasoning_tokens is not None and high_reasoning_tokens is not None:
+        assert high_reasoning_tokens >= low_reasoning_tokens, (
+            "Expected high reasoning effort to use at least as many reasoning tokens "
+            f"as low effort (low={low_reasoning_tokens}, high={high_reasoning_tokens})"
+        )
+    else:
+        assert len(high_reasoning_text) > len(low_reasoning_text), (
+            "Expected high reasoning effort response to include longer reasoning "
+            "content than low effort"
+        )
+
+
+@pytest.mark.usefixtures("start_services")
+@pytest.mark.vllm
+@pytest.mark.gpu_1
+@pytest.mark.e2e
+@pytest.mark.model(TEST_MODEL)
+def test_tool_calling(request, runtime_services, predownload_models) -> None:
+    """Test tool calling functionality with weather and system health tools."""
+
+    payload = {
+        "model": TEST_MODEL,
+        "messages": [
+            {
+                "role": "user",
+                "content": "What is the weather like in San Francisco today?",
+            }
+        ],
+        "max_tokens": 2000,
+        "tools": [
+            WEATHER_TOOL,
+            SYSTEM_HEALTH_TOOL,
+        ],
+        "tool_choice": "auto",
+        "response_format": {"type": "text"},
+    }
+
+    response = _send_chat_request(payload)
+    response_data = _validate_chat_response(response)
+
+    logger.info("Tool call response: %s", response_data)
+
+    choices = response_data.get("choices", [])
+    assert choices, "Response missing choices"
+
+    message = choices[0].get("message", {})
+    tool_calls = message.get("tool_calls", [])
+
+    assert tool_calls, "Expected model to generate tool calls for weather query"
+    assert any(
+        tc.get("function", {}).get("name") == "get_current_weather" for tc in tool_calls
+    ), "Expected get_current_weather tool to be called"
+
+
+@pytest.mark.usefixtures("start_services")
+@pytest.mark.vllm
+@pytest.mark.gpu_1
+@pytest.mark.e2e
+@pytest.mark.model(TEST_MODEL)
+def test_tool_calling_second_round(
+    request, runtime_services, predownload_models
+) -> None:
+    """Test tool calling with a follow-up message containing the assistant's prior tool calls."""
+
+    payload = {
+        "model": TEST_MODEL,
+        "messages": [
+            # First message
+            {
+                "role": "user",
+                "content": "What is the weather like in San Francisco today?",
+            },
+            # Assistant message with tool calls
+            {
+                "role": "assistant",
+                "tool_calls": [
+                    {
+                        "id": "call-1",
+                        "type": "function",
+                        "function": {
+                            "name": "get_current_weather",
+                            "arguments": '{"format":"celsius","location":"San Francisco"}',
+                        },
+                    }
+                ],
+            },
+            # Tool message with tool call result
+            {
+                "role": "tool",
+                "tool_call_id": "call-1",
+                "content": '{"celsius":"20"}',
+            },
+        ],
+        "max_tokens": 2000,
+        "tools": [
+            WEATHER_TOOL,
+            SYSTEM_HEALTH_TOOL,
+        ],
+        "tool_choice": "auto",
+        "response_format": {"type": "text"},
+    }
+
+    response = _send_chat_request(payload)
+    response_data = _validate_chat_response(response)
+
+    logger.info("Tool call second round response: %s", response_data)
+
+    choices = response_data.get("choices", [])
+    assert choices, "Response missing choices"
+
+    message = choices[0].get("message", {})
+    content = message.get("content", "").strip()
+
+    assert content, "Expected model to generate a response with content"
+    assert "20" in content and any(
+        temp_word in content.lower()
+        for temp_word in ["celsius", "temperature", "degrees", "°c", "20°"]
+    ), "Expected response to include temperature information from tool call result (20°C)"
+
+
+@pytest.mark.usefixtures("start_services")
+@pytest.mark.vllm
+@pytest.mark.gpu_1
+@pytest.mark.e2e
+@pytest.mark.model(TEST_MODEL)
+def test_reasoning(request, runtime_services, predownload_models) -> None:
+    """Test reasoning functionality with a mathematical problem."""
+
+    payload = {
+        "model": TEST_MODEL,
+        "messages": [
+            {
+                "role": "user",
+                "content": (
+                    "I'm playing assetto corsa competizione, and I need you to tell me "
+                    "how many liters of fuel to take in a race. The qualifying time was "
+                    "2:04.317, the race is 20 minutes long, and the car uses 2.73 liters per lap."
+                ),
+            }
+        ],
+        "max_tokens": 2000,
+    }
+
+    response = _send_chat_request(payload)
+    response_data = _validate_chat_response(response)
+
+    logger.info("Reasoning response: %s", response_data)
+
+    choices = response_data.get("choices", [])
+    assert choices, "Response missing choices"
+
+    message = choices[0].get("message", {})
+    content = message.get("content", "").strip()
+
+    assert content, "Expected model to generate a response with content"
+    assert any(
+        char.isdigit() for char in content
+    ), "Expected response to contain numerical calculations"