fix: Fix vllm multimodal tests (#3361)

Signed-off-by: krishung5 <krish@nvidia.com>

fix: Fix vllm multimodal tests (#3361)
Signed-off-by: krishung5 <krish@nvidia.com>
069434b4 · Kris Hung · GitHub · de6fdf0c · 069434b4 · 069434b4
Unverified commit 069434b4, authored Oct 06, 2025 by Kris Hung; committed by GitHub Oct 06, 2025.
3 changed files
--- a/examples/multimodal/components/processor.py
+++ b/examples/multimodal/components/processor.py
@@ -229,10 +229,13 @@ class Processor(ProcessMixIn):
            "content": prompt,
        }
+        # Always set stream=True here: the HTTP frontend either aggregates the
+        # streamed chunks into a single HTTP response or streams them back as
+        # SSE responses, depending on the stream flag in the request.
        chat_request = ChatCompletionRequest(
            model=raw_request.model,
            messages=[msg],
-            stream=raw_request.stream,
+            stream=True,
            max_tokens=raw_request.max_tokens,
            temperature=raw_request.temperature,
            request_id=str(uuid.uuid4()),

--- a/examples/multimodal/components/worker.py
+++ b/examples/multimodal/components/worker.py
@@ -266,21 +266,19 @@ class VllmPDWorker(VllmBaseWorker):
                request = vLLMMultimodalRequest.model_validate(request)
        logger.debug(f"Received PD request: {{ id: {request.request_id} }}.")
-        embeddings, descriptor = None, None
-        # Process embeddings using the connector
-        # Create a descriptor based on the embedding shape.
-        embeddings = torch.empty(
-            request.embeddings_shape,
-            dtype=self.EMBEDDINGS_DTYPE,
-            device=self.EMBEDDINGS_DEVICE,
-        )
-        descriptor = connect.Descriptor(embeddings)
        if (
            request.multimodal_input.image_url is None
            and request.multimodal_input.video_url is None
        ):
+            # Process embeddings using the connector
+            # Create a descriptor based on the embedding shape.
+            embeddings = torch.empty(
+                request.embeddings_shape,
+                dtype=self.EMBEDDINGS_DTYPE,
+                device=self.EMBEDDINGS_DEVICE,
+            )
+            descriptor = connect.Descriptor(embeddings)
            if descriptor is None:
                raise RuntimeError(
                    "Descriptor is None in PD worker - cannot process embeddings"

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -169,6 +169,7 @@ vllm_configs = {
                ],
                repeat_count=1,
                expected_response=["rabbit"],
+                temperature=0.7,
            )
        ],
    ),