fix: Fix vllm multimodal tests (#3361)

Signed-off-by: krishung5 <krish@nvidia.com>

fix: Fix vllm multimodal tests (#3361)
Signed-off-by: krishung5 <krish@nvidia.com>
069434b4 · Kris Hung · GitHub · de6fdf0c · 069434b4 · 069434b4
Unverified commit 069434b4, authored Oct 06, 2025 by Kris Hung; committed by GitHub on Oct 06, 2025
3 changed files
--- a/examples/multimodal/components/processor.py
+++ b/examples/multimodal/components/processor.py
@@ -229,10 +229,13 @@ class Processor(ProcessMixIn):
            "content": prompt,
        }

+        # Always set stream=True here. The HTTP frontend either aggregates the
+        # streamed chunks into a single HTTP response or relays them as SSE
+        # events, depending on the stream flag in the incoming request.
        chat_request = ChatCompletionRequest(
            model=raw_request.model,
            messages=[msg],
-            stream=raw_request.stream,
+            stream=True,
            max_tokens=raw_request.max_tokens,
            temperature=raw_request.temperature,
            request_id=str(uuid.uuid4()),

--- a/examples/multimodal/components/worker.py
+++ b/examples/multimodal/components/worker.py
@@ -266,8 +266,10 @@ class VllmPDWorker(VllmBaseWorker):
                request = vLLMMultimodalRequest.model_validate(request)
        logger.debug(f"Received PD request: {{ id: {request.request_id} }}.")

-        embeddings, descriptor = None, None
-
+        if (
+            request.multimodal_input.image_url is None
+            and request.multimodal_input.video_url is None
+        ):
            # Process embeddings using the connector
            # Create a descriptor based on the embedding shape.
            embeddings = torch.empty(
@@ -277,10 +279,6 @@ class VllmPDWorker(VllmBaseWorker):
            )
            descriptor = connect.Descriptor(embeddings)

-        if (
-            request.multimodal_input.image_url is None
-            and request.multimodal_input.video_url is None
-        ):
            if descriptor is None:
                raise RuntimeError(
                    "Descriptor is None in PD worker - cannot process embeddings"

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -169,6 +169,7 @@ vllm_configs = {
                ],
                repeat_count=1,
                expected_response=["rabbit"],
+                temperature=0.7,
            )
        ],
    ),