Unverified Commit ea07d51f authored by zhongdaor-nv's avatar zhongdaor-nv Committed by GitHub
Browse files

test: add tool calling and reasoning tests for frontend on GPT-OSS (#3636)


Signed-off-by: zhongdaor <zhongdaor@nvidia.com>
Signed-off-by: zhongdaor-nv <zhongdaor@nvidia.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
parent ae4e96a2
...@@ -13,13 +13,46 @@ from typing import Any, Dict, Optional, Tuple ...@@ -13,13 +13,46 @@ from typing import Any, Dict, Optional, Tuple
import pytest import pytest
import requests import requests
from tests.conftest import EtcdServer, NatsServer
from tests.utils.constants import GPT_OSS from tests.utils.constants import GPT_OSS
from tests.utils.managed_process import ManagedProcess from tests.utils.managed_process import ManagedProcess
from tests.utils.payloads import check_models_api from tests.utils.payloads import check_models_api
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
REASONING_TEST_MODEL = GPT_OSS TEST_MODEL = GPT_OSS
# Function-tool definition offered to the model in the tool-calling tests.
# It describes a `get_current_weather` function taking two required string
# arguments: a free-form `location` and a `format` restricted to two units.
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    # The example must be a real city/state pair; San Francisco
                    # is in California, not New York.
                    "description": "The city and state, e.g. San Francisco, CA",
                },
                "format": {
                    "type": "string",
                    "enum": ["celsius", "fahrenheit"],
                    "description": "The temperature unit",
                },
            },
            "required": ["location", "format"],
        },
    },
}
# Second function-tool definition offered alongside the weather tool.
# Its parameter schema is an object with no properties, i.e. the function
# is declared as taking no arguments.
SYSTEM_HEALTH_TOOL = {
    "type": "function",
    "function": {
        "name": "get_system_health",
        "description": (
            "Returns the current health status of the LLM runtime—use "
            "before critical operations to verify the service is live."
        ),
        "parameters": {
            "type": "object",
            "properties": {},
        },
    },
}
class DynamoFrontendProcess(ManagedProcess): class DynamoFrontendProcess(ManagedProcess):
...@@ -46,10 +79,10 @@ class DynamoFrontendProcess(ManagedProcess): ...@@ -46,10 +79,10 @@ class DynamoFrontendProcess(ManagedProcess):
) )
class GPTOSSWorkerProcess(ManagedProcess): class VllmWorkerProcess(ManagedProcess):
"""Worker process for GPT-OSS model.""" """Vllm Worker process for GPT-OSS model."""
def __init__(self, request, worker_id: str = "reasoning-worker"): def __init__(self, request, worker_id: str = "vllm-worker"):
self.worker_id = worker_id self.worker_id = worker_id
command = [ command = [
...@@ -57,10 +90,7 @@ class GPTOSSWorkerProcess(ManagedProcess): ...@@ -57,10 +90,7 @@ class GPTOSSWorkerProcess(ManagedProcess):
"-m", "-m",
"dynamo.vllm", "dynamo.vllm",
"--model", "--model",
REASONING_TEST_MODEL, TEST_MODEL,
"--connector",
"none", # skip nixl registration, noticing long startup times in CI. Potentially a bug...
"--enforce-eager",
"--dyn-tool-call-parser", "--dyn-tool-call-parser",
"harmony", "harmony",
"--dyn-reasoning-parser", "--dyn-reasoning-parser",
...@@ -111,43 +141,39 @@ class GPTOSSWorkerProcess(ManagedProcess): ...@@ -111,43 +141,39 @@ class GPTOSSWorkerProcess(ManagedProcess):
def _send_chat_request( def _send_chat_request(
prompt: str, payload: Dict[str, Any],
reasoning_effort: str,
timeout: int = 180, timeout: int = 180,
) -> requests.Response: ) -> requests.Response:
"""Send a chat completion request with a specific reasoning effort.""" """Send a chat completion request with a specific payload."""
payload: Dict[str, Any] = {
"model": REASONING_TEST_MODEL,
"messages": [
{
"role": "user",
"content": prompt,
}
],
"max_tokens": 2000,
"chat_template_args": {"reasoning_effort": reasoning_effort},
}
headers = {"Content-Type": "application/json"} headers = {"Content-Type": "application/json"}
logger.info(
"Sending chat completion request with reasoning effort '%s'", reasoning_effort
)
response = requests.post( response = requests.post(
"http://localhost:8000/v1/chat/completions", "http://localhost:8000/v1/chat/completions",
headers=headers, headers=headers,
json=payload, json=payload,
timeout=timeout, timeout=timeout,
) )
logger.info(
"Received response for reasoning effort '%s' with status code %s",
reasoning_effort,
response.status_code,
)
return response return response
@pytest.fixture(scope="module")
def runtime_services(request):
    """Provide NATS and etcd servers shared by every test in this module.

    The NATS server context is entered first and the etcd server second;
    they are exited in reverse order once the module's tests are done.

    Yields:
        A ``(nats_process, etcd_process)`` tuple holding the values produced
        by entering the two server context managers.
    """
    # A multi-item ``with`` is equivalent to nesting: NatsServer is entered
    # before EtcdServer is even constructed.
    with NatsServer(request) as nats_process, EtcdServer(request) as etcd_process:
        yield nats_process, etcd_process
@pytest.fixture(scope="module")
def start_services(request, runtime_services):
    """Bring up the frontend and the vLLM worker once for the whole module.

    Requesting ``runtime_services`` makes pytest set up the NATS/etcd servers
    before either process is started. The frontend context is entered first,
    then the worker; both stay up while the module's tests run and are exited
    in reverse order afterwards.

    Yields:
        Nothing; the fixture exists only for its setup/teardown side effects.
    """
    frontend = DynamoFrontendProcess(request)
    with frontend:
        logger.info("Frontend started for tests")
        # The worker is only constructed after the frontend context is entered.
        worker = VllmWorkerProcess(request)
        with worker:
            logger.info("Vllm Worker started for tests")
            yield
def _extract_reasoning_metrics(data: Dict[str, Any]) -> Tuple[str, Optional[int]]: def _extract_reasoning_metrics(data: Dict[str, Any]) -> Tuple[str, Optional[int]]:
"""Return the reasoning content and optional reasoning token count from a response.""" """Return the reasoning content and optional reasoning token count from a response."""
choices = data.get("choices") or [] choices = data.get("choices") or []
...@@ -172,16 +198,17 @@ def _validate_chat_response(response: requests.Response) -> Dict[str, Any]: ...@@ -172,16 +198,17 @@ def _validate_chat_response(response: requests.Response) -> Dict[str, Any]:
assert ( assert (
response.status_code == 200 response.status_code == 200
), f"Chat request failed with status {response.status_code}: {response.text}" ), f"Chat request failed with status {response.status_code}: {response.text}"
payload = response.json() response_json = response.json()
if "choices" not in payload: if "choices" not in response_json:
raise AssertionError(f"Chat response missing 'choices': {payload}") raise AssertionError(f"Chat response missing 'choices': {response_json}")
return payload return response_json
@pytest.mark.usefixtures("start_services")
@pytest.mark.vllm @pytest.mark.vllm
@pytest.mark.gpu_1 @pytest.mark.gpu_1
@pytest.mark.e2e @pytest.mark.e2e
@pytest.mark.model(REASONING_TEST_MODEL) @pytest.mark.model(TEST_MODEL)
def test_reasoning_effort(request, runtime_services, predownload_models) -> None: def test_reasoning_effort(request, runtime_services, predownload_models) -> None:
"""High reasoning effort should yield more detailed reasoning than low effort.""" """High reasoning effort should yield more detailed reasoning than low effort."""
...@@ -190,22 +217,39 @@ def test_reasoning_effort(request, runtime_services, predownload_models) -> None ...@@ -190,22 +217,39 @@ def test_reasoning_effort(request, runtime_services, predownload_models) -> None
"Focus on life-support, energy, and redundancy considerations." "Focus on life-support, energy, and redundancy considerations."
) )
with DynamoFrontendProcess(request): logger.info("Start to test reasoning effort")
logger.info("Frontend started for reasoning effort test") high_payload = {
"model": TEST_MODEL,
"messages": [
{
"role": "user",
"content": prompt,
}
],
"max_tokens": 2000,
"chat_template_args": {"reasoning_effort": "high"},
}
with GPTOSSWorkerProcess(request): low_payload = {
logger.info("Worker started for reasoning effort test") "model": TEST_MODEL,
"messages": [
{
"role": "user",
"content": prompt,
}
],
"max_tokens": 2000,
"chat_template_args": {"reasoning_effort": "low"},
}
low_response = _send_chat_request(prompt, reasoning_effort="low") high_response = _send_chat_request(high_payload)
low_payload = _validate_chat_response(low_response) high_reasoning_text, high_reasoning_tokens = _extract_reasoning_metrics(
low_reasoning_text, low_reasoning_tokens = _extract_reasoning_metrics( _validate_chat_response(high_response)
low_payload
) )
high_response = _send_chat_request(prompt, reasoning_effort="high") low_response = _send_chat_request(low_payload)
high_payload = _validate_chat_response(high_response) low_reasoning_text, low_reasoning_tokens = _extract_reasoning_metrics(
high_reasoning_text, high_reasoning_tokens = _extract_reasoning_metrics( _validate_chat_response(low_response)
high_payload
) )
logger.info( logger.info(
...@@ -224,3 +268,151 @@ def test_reasoning_effort(request, runtime_services, predownload_models) -> None ...@@ -224,3 +268,151 @@ def test_reasoning_effort(request, runtime_services, predownload_models) -> None
"Expected high reasoning effort response to include longer reasoning " "Expected high reasoning effort response to include longer reasoning "
"content than low effort" "content than low effort"
) )
@pytest.mark.usefixtures("start_services")
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(TEST_MODEL)
def test_tool_calling(request, runtime_services, predownload_models) -> None:
    """Check that a weather question makes the model call the weather tool.

    Sends a single user message together with the weather and system-health
    tool definitions, then asserts that the first choice contains at least
    one tool call and that one of them targets ``get_current_weather``.
    """
    user_message = {
        "role": "user",
        "content": "What is the weather like in San Francisco today?",
    }
    offered_tools = [WEATHER_TOOL, SYSTEM_HEALTH_TOOL]
    payload = {
        "model": TEST_MODEL,
        "messages": [user_message],
        "max_tokens": 2000,
        "tools": offered_tools,
        "tool_choice": "auto",
        "response_format": {"type": "text"},
    }

    response_data = _validate_chat_response(_send_chat_request(payload))
    logger.info("Tool call response: %s", response_data)

    choices = response_data.get("choices", [])
    assert choices, "Response missing choices"

    first_message = choices[0].get("message", {})
    tool_calls = first_message.get("tool_calls", [])
    assert tool_calls, "Expected model to generate tool calls for weather query"

    # Lazily walk the requested function names so the check stops at the
    # first match, exactly like a single any() over the tool calls.
    requested_names = (call.get("function", {}).get("name") for call in tool_calls)
    assert any(
        name == "get_current_weather" for name in requested_names
    ), "Expected get_current_weather tool to be called"
@pytest.mark.usefixtures("start_services")
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(TEST_MODEL)
def test_tool_calling_second_round(
    request, runtime_services, predownload_models
) -> None:
    """Test tool calling with a follow-up message containing assistant's prior tool calls.

    The conversation replays a completed first round: the user question, the
    assistant's ``get_current_weather`` tool call, and the tool result
    (``{"celsius":"20"}``). The model is then expected to answer in plain
    content that mentions the value ``20`` together with a temperature word.
    """
    payload = {
        "model": TEST_MODEL,
        "messages": [
            # First message
            {
                "role": "user",
                "content": "What is the weather like in San Francisco today?",
            },
            # Assistant message with tool calls
            {
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": "call-1",
                        "type": "function",
                        "function": {
                            "name": "get_current_weather",
                            "arguments": '{"format":"celsius","location":"San Francisco"}',
                        },
                    }
                ],
            },
            # Tool message with tool call result
            {
                "role": "tool",
                "tool_call_id": "call-1",
                "content": '{"celsius":"20"}',
            },
        ],
        "max_tokens": 2000,
        "tools": [
            WEATHER_TOOL,
            SYSTEM_HEALTH_TOOL,
        ],
        "tool_choice": "auto",
        "response_format": {"type": "text"},
    }

    response = _send_chat_request(payload)
    response_data = _validate_chat_response(response)

    logger.info("Tool call second round response: %s", response_data)

    choices = response_data.get("choices", [])
    assert choices, "Response missing choices"

    message = choices[0].get("message", {})
    # The "content" key can be present with a null value (for example when the
    # model answers with another tool call). `.get("content", "")` would then
    # return None and `.strip()` would raise AttributeError; `or ""` keeps the
    # failure a readable assertion instead.
    content = (message.get("content") or "").strip()
    assert content, "Expected model to generate a response with content"

    lowered = content.lower()
    assert "20" in content and any(
        temp_word in lowered
        for temp_word in ["celsius", "temperature", "degrees", "°c", "20°"]
    ), "Expected response to include temperature information from tool call result (20°C)"
@pytest.mark.usefixtures("start_services")
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(TEST_MODEL)
def test_reasoning(request, runtime_services, predownload_models) -> None:
    """Test reasoning functionality with a mathematical problem.

    Sends a fuel-calculation question and asserts that the first choice has
    non-empty content containing at least one digit. The numeric answer
    itself is not checked.
    """
    payload = {
        "model": TEST_MODEL,
        "messages": [
            {
                "role": "user",
                "content": (
                    "I'm playing assetto corsa competizione, and I need you to tell me "
                    "how many liters of fuel to take in a race. The qualifying time was "
                    "2:04.317, the race is 20 minutes long, and the car uses 2.73 liters per lap."
                ),
            }
        ],
        "max_tokens": 2000,
    }

    response = _send_chat_request(payload)
    response_data = _validate_chat_response(response)

    logger.info("Reasoning response: %s", response_data)

    choices = response_data.get("choices", [])
    assert choices, "Response missing choices"

    message = choices[0].get("message", {})
    # The "content" key can be present with a null value; fall back to an
    # empty string so the assertion below reports the problem instead of an
    # AttributeError from calling .strip() on None.
    content = (message.get("content") or "").strip()
    assert content, "Expected model to generate a response with content"

    assert any(
        char.isdigit() for char in content
    ), "Expected response to contain numerical calculations"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment