Unverified commit 069434b4, authored by Kris Hung and committed by GitHub
Browse files

fix: Fix vllm multimodal tests (#3361)


Signed-off-by: krishung5 <krish@nvidia.com>
parent de6fdf0c
......@@ -229,10 +229,13 @@ class Processor(ProcessMixIn):
"content": prompt,
}
# Set stream=True - the http frontend will handle aggregation of
# streamed chunks into a single http response, or stream them
# back as SSE responses based on the stream flag in the request.
chat_request = ChatCompletionRequest(
model=raw_request.model,
messages=[msg],
stream=raw_request.stream,
stream=True,
max_tokens=raw_request.max_tokens,
temperature=raw_request.temperature,
request_id=str(uuid.uuid4()),
......
......@@ -266,8 +266,10 @@ class VllmPDWorker(VllmBaseWorker):
request = vLLMMultimodalRequest.model_validate(request)
logger.debug(f"Received PD request: {{ id: {request.request_id} }}.")
embeddings, descriptor = None, None
if (
request.multimodal_input.image_url is None
and request.multimodal_input.video_url is None
):
# Process embeddings using the connector
# Create a descriptor based on the embedding shape.
embeddings = torch.empty(
......@@ -277,10 +279,6 @@ class VllmPDWorker(VllmBaseWorker):
)
descriptor = connect.Descriptor(embeddings)
if (
request.multimodal_input.image_url is None
and request.multimodal_input.video_url is None
):
if descriptor is None:
raise RuntimeError(
"Descriptor is None in PD worker - cannot process embeddings"
......
......@@ -169,6 +169,7 @@ vllm_configs = {
],
repeat_count=1,
expected_response=["rabbit"],
temperature=0.7,
)
],
),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment