test: Multimodal Tool Calling vLLM Test (#4663)

Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>

test: Multimodal Tool Calling vLLM Test (#4663)
Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
0651a4fe · Indrajit Bhosale · GitHub · be67f67b · 0651a4fe · 0651a4fe
Unverified Commit 0651a4fe authored Dec 03, 2025 by Indrajit Bhosale Committed by GitHub Dec 03, 2025
Showing with 128 additions and 11 deletions

examples/backends/vllm/launch/agg_multimodal.sh examples/backends/vllm/launch/agg_multimodal.sh +16 -11

tests/serve/test_vllm.py tests/serve/test_vllm.py +69 -0

tests/utils/payloads.py tests/utils/payloads.py +43 -0

No files found.
--- a/examples/backends/vllm/launch/agg_multimodal.sh
+++ b/examples/backends/vllm/launch/agg_multimodal.sh
@@ -18,6 +18,8 @@ trap 'echo Cleaning up...; kill 0' EXIT
 MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"

 # Parse command line arguments
+# Unrecognized arguments are collected in EXTRA_ARGS and passed through to the vLLM worker
+EXTRA_ARGS=()
 while [[ $# -gt 0 ]]; do
    case $1 in
        --model)
@@ -25,16 +27,18 @@ while [[ $# -gt 0 ]]; do
            shift 2
            ;;
        -h|--help)
-            echo "Usage: $0 [OPTIONS]"
+            echo "Usage: $0 [OPTIONS] [EXTRA_VLLM_ARGS]"
            echo "Options:"
-            echo "  --model <model_name> Specify the VLM model to use (default: $MODEL_NAME)"
-            echo "  -h, --help           Show this help message"
+            echo "  --model <model_name>   Specify the VLM model to use (default: $MODEL_NAME)"
+            echo "  -h, --help             Show this help message"
+            echo ""
+            echo "Any additional arguments are passed through to the vLLM worker."
+            echo "Example: $0 --model Qwen/Qwen3-VL-30B-A3B-Instruct-FP8 --dyn-tool-call-parser hermes"
            exit 0
            ;;
        *)
-            echo "Unknown option: $1"
-            echo "Use --help for usage information"
-            exit 1
+            EXTRA_ARGS+=("$1")
+            shift
            ;;
    esac
 done
@@ -48,20 +52,21 @@ export DYN_REQUEST_PLANE=tcp
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
 python -m dynamo.frontend &

-# Configure GPU memory optimization for specific models
-EXTRA_ARGS=""
+# Model-specific GPU memory defaults (array, so each flag stays a separate word when quoted)
+MODEL_SPECIFIC_ARGS=()
 if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
-    EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 4096"
+    MODEL_SPECIFIC_ARGS=(--gpu-memory-utilization 0.85 --max-model-len 4096)
 elif [[ "$MODEL_NAME" == "llava-hf/llava-1.5-7b-hf" ]]; then
-    EXTRA_ARGS="--gpu-memory-utilization 0.85 --max-model-len 2048"
+    MODEL_SPECIFIC_ARGS=(--gpu-memory-utilization 0.85 --max-model-len 2048)
 fi

 # Start vLLM worker with vision model
 # Multimodal data (images) are decoded in the backend worker using ImageLoader
 # --enforce-eager: Quick deployment (remove for production)
 # --connector none: No KV transfer needed for aggregated serving
+# Extra args from command line come last to allow overrides
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
-    python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME --enforce-eager --connector none $EXTRA_ARGS
+    python -m dynamo.vllm --enable-multimodal --model "$MODEL_NAME" --enforce-eager --connector none "${MODEL_SPECIFIC_ARGS[@]}" "${EXTRA_ARGS[@]}"

 # Wait for all background processes to complete
 wait

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -22,6 +22,7 @@ from tests.utils.payload_builder import (
    completion_payload_default,
    metric_payload_default,
 )
+from tests.utils.payloads import ToolCallingChatPayload

 logger = logging.getLogger(__name__)

@@ -333,6 +334,74 @@ vllm_configs = {
            )
        ],
    ),
+    "aggregated_toolcalling": VLLMConfig(
+        name="aggregated_toolcalling",
+        directory=vllm_dir,
+        script_name="agg_multimodal.sh",
+        marks=[pytest.mark.gpu_2, pytest.mark.multimodal],
+        model="Qwen/Qwen3-VL-30B-A3B-Instruct-FP8",
+        script_args=[
+            "--model",
+            "Qwen/Qwen3-VL-30B-A3B-Instruct-FP8",
+            "--max-model-len",
+            "10000",
+            "--dyn-tool-call-parser",
+            "hermes",
+        ],
+        delayed_start=0,
+        timeout=600,
+        request_payloads=[
+            ToolCallingChatPayload(
+                body={
+                    "messages": [
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "text",
+                                    "text": "Describe what you see in this image in detail.",
+                                },
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": MULTIMODAL_IMG_URL},
+                                },
+                            ],
+                        }
+                    ],
+                    "tools": [
+                        {
+                            "type": "function",
+                            "function": {
+                                "name": "describe_image",
+                                "description": "Provides detailed description of objects and scenes in an image",
+                                "parameters": {
+                                    "type": "object",
+                                    "properties": {
+                                        "objects": {
+                                            "type": "array",
+                                            "items": {"type": "string"},
+                                            "description": "List of objects detected in the image",
+                                        },
+                                        "scene": {
+                                            "type": "string",
+                                            "description": "Overall scene description",
+                                        },
+                                    },
+                                    "required": ["objects", "scene"],
+                                },
+                            },
+                        }
+                    ],
+                    "tool_choice": "auto",
+                    "max_tokens": 1024,
+                },
+                repeat_count=1,
+                expected_response=["purple"],  # Validate image understanding
+                expected_log=[],
+                expected_tool_name="describe_image",  # Validate tool call happened
+            )
+        ],
+    ),
    # TODO: Enable this test case when we have 4 GPUs runners.
    # "multimodal_disagg": VLLMConfig(
    #     name="multimodal_disagg",

--- a/tests/utils/payloads.py
+++ b/tests/utils/payloads.py
@@ -155,6 +155,49 @@ class ChatPayload(BasePayload):
        return ChatPayload.extract_content(response)


+@dataclass
+class ToolCallingChatPayload(ChatPayload):
+    """ChatPayload that additionally asserts the response contains tool calls.
+
+    ``expected_tool_name`` is keyword-only; when set, at least one returned
+    tool call must target that function. It is stored as a plain attribute,
+    not a dataclass field (@dataclass keeps the hand-written ``__init__``).
+    """
+
+    def __init__(self, *args, expected_tool_name: Optional[str] = None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.expected_tool_name = expected_tool_name
+
+    def validate(self, response, content: str) -> None:
+        """Run the parent validation, then assert well-formed tool calls exist."""
+        super().validate(response, content)
+
+        choices = response.json().get("choices") or []
+        assert choices, "Response missing choices"
+        # ``or`` also covers explicit JSON nulls, which .get() defaults do not.
+        tool_calls = (choices[0].get("message") or {}).get("tool_calls") or []
+        assert tool_calls, "Expected model to generate tool calls but none found"
+        logger.info(f"Tool calls detected: {len(tool_calls)} call(s)")
+
+        # Every tool call must carry a function name and its arguments.
+        for i, tc in enumerate(tool_calls):
+            assert "function" in tc, f"Tool call {i} missing 'function' field"
+            function = tc["function"] or {}
+            assert "name" in function, f"Tool call {i} missing function name"
+            assert "arguments" in function, f"Tool call {i} missing function arguments"
+            # str() keeps the preview safe for null or already-decoded arguments.
+            preview = str(function["arguments"])[:100]
+            logger.info(f"  [{i}] Function: {function['name']}, Args: {preview}...")
+
+        # Optionally require that a specific tool was among those called.
+        if self.expected_tool_name:
+            tool_names = [(tc.get("function") or {}).get("name") for tc in tool_calls]
+            assert (
+                self.expected_tool_name in tool_names
+            ), f"Expected tool '{self.expected_tool_name}' not found. Available tools: {tool_names}"
+            logger.info(f"Expected tool '{self.expected_tool_name}' was called")
+
+
 @dataclass
 class CompletionPayload(BasePayload):
    """Payload for completions endpoint."""