Unverified Commit ea07d51f authored by zhongdaor-nv's avatar zhongdaor-nv Committed by GitHub
Browse files

test: add tool calling and reasoning tests for frontend on GPT-OSS (#3636)


Signed-off-by: zhongdaor <zhongdaor@nvidia.com>
Signed-off-by: zhongdaor-nv <zhongdaor@nvidia.com>
Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
parent ae4e96a2
......@@ -13,13 +13,46 @@ from typing import Any, Dict, Optional, Tuple
import pytest
import requests
from tests.conftest import EtcdServer, NatsServer
from tests.utils.constants import GPT_OSS
from tests.utils.managed_process import ManagedProcess
from tests.utils.payloads import check_models_api
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Model identifier under its earlier name.
# NOTE(review): appears to be superseded by TEST_MODEL below — confirm before removing.
REASONING_TEST_MODEL = GPT_OSS
# Model identifier placed in request payloads and in the `model` pytest marker.
TEST_MODEL = GPT_OSS
# Function-tool schema passed in the `tools` list of chat completion requests.
# Both `location` and `format` are declared as required arguments.
WEATHER_TOOL = {
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    # The example must be a real city/state pair: San Francisco
                    # is in California, so the earlier "NY" suffix was wrong and
                    # could mislead the model when it fills in this argument.
                    "description": "The city and state, e.g. San Francisco, CA",
                },
                "format": {
                    "type": "string",
                    "enum": ["celsius", "fahrenheit"],
                    "description": "The temperature unit",
                },
            },
            "required": ["location", "format"],
        },
    },
}
# Second function-tool schema passed alongside WEATHER_TOOL in the `tools`
# list. It declares an object parameter schema with no properties.
SYSTEM_HEALTH_TOOL = {
    "type": "function",
    "function": {
        "name": "get_system_health",
        "description": "Returns the current health status of the LLM runtime—use before critical operations to verify the service is live.",
        "parameters": {
            "type": "object",
            "properties": {},
        },
    },
}
class DynamoFrontendProcess(ManagedProcess):
......@@ -46,10 +79,10 @@ class DynamoFrontendProcess(ManagedProcess):
)
class GPTOSSWorkerProcess(ManagedProcess):
"""Worker process for GPT-OSS model."""
class VllmWorkerProcess(ManagedProcess):
"""Vllm Worker process for GPT-OSS model."""
def __init__(self, request, worker_id: str = "reasoning-worker"):
def __init__(self, request, worker_id: str = "vllm-worker"):
self.worker_id = worker_id
command = [
......@@ -57,10 +90,7 @@ class GPTOSSWorkerProcess(ManagedProcess):
"-m",
"dynamo.vllm",
"--model",
REASONING_TEST_MODEL,
"--connector",
"none", # skip nixl registration, noticing long startup times in CI. Potentially a bug...
"--enforce-eager",
TEST_MODEL,
"--dyn-tool-call-parser",
"harmony",
"--dyn-reasoning-parser",
......@@ -111,43 +141,39 @@ class GPTOSSWorkerProcess(ManagedProcess):
def _send_chat_request(
    payload: Dict[str, Any],
    timeout: int = 180,
) -> requests.Response:
    """Send a chat completion request with a specific payload.

    Args:
        payload: Request body; it is posted unchanged as the JSON body.
        timeout: Forwarded to ``requests.post`` as its ``timeout`` argument.

    Returns:
        The ``requests.Response`` object. The status code is only logged
        here, not checked; callers are expected to validate it themselves.
    """
    headers = {"Content-Type": "application/json"}

    logger.info("Sending chat completion request")
    response = requests.post(
        "http://localhost:8000/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    logger.info(
        "Received chat completion response with status code %s",
        response.status_code,
    )
    return response
@pytest.fixture(scope="module")
def runtime_services(request):
    """Provide a (NATS, etcd) server pair shared by every test in this module.

    NatsServer is entered first and EtcdServer second; they are exited in the
    reverse order once the module's tests are done.
    """
    with NatsServer(request) as nats, EtcdServer(request) as etcd:
        yield nats, etcd
@pytest.fixture(scope="module")
def start_services(request, runtime_services):
    """Bring up the frontend, then the vLLM worker, once per test module.

    ``runtime_services`` is not read in the body; requesting it makes pytest
    set that fixture up before this one runs.
    """
    # The worker is only constructed after the frontend context is entered,
    # so the two context managers are kept nested rather than merged.
    with DynamoFrontendProcess(request):
        logger.info("Frontend started for tests")

        with VllmWorkerProcess(request):
            logger.info("Vllm Worker started for tests")

            # Hand control to the tests; both processes stay up until the
            # module's tests finish and the contexts unwind.
            yield
def _extract_reasoning_metrics(data: Dict[str, Any]) -> Tuple[str, Optional[int]]:
"""Return the reasoning content and optional reasoning token count from a response."""
choices = data.get("choices") or []
......@@ -172,16 +198,17 @@ def _validate_chat_response(response: requests.Response) -> Dict[str, Any]:
assert (
response.status_code == 200
), f"Chat request failed with status {response.status_code}: {response.text}"
payload = response.json()
if "choices" not in payload:
raise AssertionError(f"Chat response missing 'choices': {payload}")
return payload
response_json = response.json()
if "choices" not in response_json:
raise AssertionError(f"Chat response missing 'choices': {response_json}")
return response_json
@pytest.mark.usefixtures("start_services")
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(REASONING_TEST_MODEL)
@pytest.mark.model(TEST_MODEL)
def test_reasoning_effort(request, runtime_services, predownload_models) -> None:
"""High reasoning effort should yield more detailed reasoning than low effort."""
......@@ -190,37 +217,202 @@ def test_reasoning_effort(request, runtime_services, predownload_models) -> None
"Focus on life-support, energy, and redundancy considerations."
)
with DynamoFrontendProcess(request):
logger.info("Frontend started for reasoning effort test")
with GPTOSSWorkerProcess(request):
logger.info("Worker started for reasoning effort test")
low_response = _send_chat_request(prompt, reasoning_effort="low")
low_payload = _validate_chat_response(low_response)
low_reasoning_text, low_reasoning_tokens = _extract_reasoning_metrics(
low_payload
)
high_response = _send_chat_request(prompt, reasoning_effort="high")
high_payload = _validate_chat_response(high_response)
high_reasoning_text, high_reasoning_tokens = _extract_reasoning_metrics(
high_payload
)
logger.info(
"Low effort reasoning tokens: %s, High effort reasoning tokens: %s",
low_reasoning_tokens,
high_reasoning_tokens,
)
if low_reasoning_tokens is not None and high_reasoning_tokens is not None:
assert high_reasoning_tokens >= low_reasoning_tokens, (
"Expected high reasoning effort to use at least as many reasoning tokens "
f"as low effort (low={low_reasoning_tokens}, high={high_reasoning_tokens})"
)
else:
assert len(high_reasoning_text) > len(low_reasoning_text), (
"Expected high reasoning effort response to include longer reasoning "
"content than low effort"
)
logger.info("Start to test reasoning effort")
high_payload = {
"model": TEST_MODEL,
"messages": [
{
"role": "user",
"content": prompt,
}
],
"max_tokens": 2000,
"chat_template_args": {"reasoning_effort": "high"},
}
low_payload = {
"model": TEST_MODEL,
"messages": [
{
"role": "user",
"content": prompt,
}
],
"max_tokens": 2000,
"chat_template_args": {"reasoning_effort": "low"},
}
high_response = _send_chat_request(high_payload)
high_reasoning_text, high_reasoning_tokens = _extract_reasoning_metrics(
_validate_chat_response(high_response)
)
low_response = _send_chat_request(low_payload)
low_reasoning_text, low_reasoning_tokens = _extract_reasoning_metrics(
_validate_chat_response(low_response)
)
logger.info(
"Low effort reasoning tokens: %s, High effort reasoning tokens: %s",
low_reasoning_tokens,
high_reasoning_tokens,
)
if low_reasoning_tokens is not None and high_reasoning_tokens is not None:
assert high_reasoning_tokens >= low_reasoning_tokens, (
"Expected high reasoning effort to use at least as many reasoning tokens "
f"as low effort (low={low_reasoning_tokens}, high={high_reasoning_tokens})"
)
else:
assert len(high_reasoning_text) > len(low_reasoning_text), (
"Expected high reasoning effort response to include longer reasoning "
"content than low effort"
)
@pytest.mark.usefixtures("start_services")
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(TEST_MODEL)
def test_tool_calling(request, runtime_services, predownload_models) -> None:
    """A weather question should make the model call ``get_current_weather``."""
    user_turn = {
        "role": "user",
        "content": "What is the weather like in San Francisco today?",
    }
    payload = {
        "model": TEST_MODEL,
        "messages": [user_turn],
        "max_tokens": 2000,
        "tools": [
            WEATHER_TOOL,
            SYSTEM_HEALTH_TOOL,
        ],
        "tool_choice": "auto",
        "response_format": {"type": "text"},
    }

    response_data = _validate_chat_response(_send_chat_request(payload))
    logger.info("Tool call response: %s", response_data)

    choices = response_data.get("choices", [])
    assert choices, "Response missing choices"

    first_message = choices[0].get("message", {})
    tool_calls = first_message.get("tool_calls", [])
    assert tool_calls, "Expected model to generate tool calls for weather query"

    # Stop at the first matching call, like a short-circuiting any().
    weather_requested = False
    for call in tool_calls:
        if call.get("function", {}).get("name") == "get_current_weather":
            weather_requested = True
            break
    assert weather_requested, "Expected get_current_weather tool to be called"
@pytest.mark.usefixtures("start_services")
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(TEST_MODEL)
def test_tool_calling_second_round(
    request, runtime_services, predownload_models
) -> None:
    """Test tool calling with a follow-up message containing assistant's prior tool calls.

    The conversation already holds the assistant's ``get_current_weather``
    call and the tool's result (20 degrees Celsius). The model is expected to
    answer in plain content that mentions that temperature.
    """
    payload = {
        "model": TEST_MODEL,
        "messages": [
            # First message
            {
                "role": "user",
                "content": "What is the weather like in San Francisco today?",
            },
            # Assistant message with tool calls
            {
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": "call-1",
                        "type": "function",
                        "function": {
                            "name": "get_current_weather",
                            "arguments": '{"format":"celsius","location":"San Francisco"}',
                        },
                    }
                ],
            },
            # Tool message with tool call result
            {
                "role": "tool",
                "tool_call_id": "call-1",
                "content": '{"celsius":"20"}',
            },
        ],
        "max_tokens": 2000,
        "tools": [
            WEATHER_TOOL,
            SYSTEM_HEALTH_TOOL,
        ],
        "tool_choice": "auto",
        "response_format": {"type": "text"},
    }

    response = _send_chat_request(payload)
    response_data = _validate_chat_response(response)
    logger.info("Tool call second round response: %s", response_data)

    choices = response_data.get("choices", [])
    assert choices, "Response missing choices"

    message = choices[0].get("message", {})
    # "content" may be present but null (for example when the model answers
    # with another tool call). Coerce it to "" so the assertion below reports
    # the failure instead of an AttributeError from None.strip().
    content = (message.get("content") or "").strip()
    assert content, "Expected model to generate a response with content"

    lowered = content.lower()
    assert "20" in content and any(
        temp_word in lowered
        for temp_word in ["celsius", "temperature", "degrees", "°c", "20°"]
    ), "Expected response to include temperature information from tool call result (20°C)"
@pytest.mark.usefixtures("start_services")
@pytest.mark.vllm
@pytest.mark.gpu_1
@pytest.mark.e2e
@pytest.mark.model(TEST_MODEL)
def test_reasoning(request, runtime_services, predownload_models) -> None:
    """Test reasoning functionality with a mathematical problem.

    The prompt asks for a fuel calculation. The check is deliberately loose:
    the answer only has to be non-empty and contain at least one digit.
    """
    payload = {
        "model": TEST_MODEL,
        "messages": [
            {
                "role": "user",
                "content": (
                    "I'm playing assetto corsa competizione, and I need you to tell me "
                    "how many liters of fuel to take in a race. The qualifying time was "
                    "2:04.317, the race is 20 minutes long, and the car uses 2.73 liters per lap."
                ),
            }
        ],
        "max_tokens": 2000,
    }

    response = _send_chat_request(payload)
    response_data = _validate_chat_response(response)
    logger.info("Reasoning response: %s", response_data)

    choices = response_data.get("choices", [])
    assert choices, "Response missing choices"

    message = choices[0].get("message", {})
    # "content" may be present but null. Coerce it to "" so the assertion
    # below fails with its own message rather than an AttributeError from
    # None.strip().
    content = (message.get("content") or "").strip()
    assert content, "Expected model to generate a response with content"

    assert any(
        char.isdigit() for char in content
    ), "Expected response to contain numerical calculations"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment