Commit cfba042e (unverified), authored by KrishnanPrash, committed by GitHub
Browse files

fix: mm_item keys for SGLang API (#5981)


Signed-off-by: Krishnan Prashanth <kprashanth@nvidia.com>
parent 4c200e79
@@ -114,16 +114,28 @@ class EmbeddingsProcessor:
def create_multimodal_item(
    embeddings: torch.Tensor, request: SglangMultimodalRequest
) -> dict:
    """Create the multimodal item passed to SGLang generation.

    Uses ``format="precomputed_embedding"`` since Dynamo's Encoder has
    already run the vision encoder. SGLang expects 2D embeddings of shape
    ``(num_patches, hidden_dim)``.

    Args:
        embeddings: Encoder output. Cast to
            ``MultimodalConfig.EMBEDDINGS_DTYPE`` before being handed over.
        request: Request whose ``image_grid_thw`` is converted to a tensor
            and forwarded alongside the embeddings.

    Returns:
        A dict with the keys ``"format"``, ``"feature"``,
        ``"image_grid_thw"`` and ``"modality"``.
    """
    precomputed = embeddings.to(MultimodalConfig.EMBEDDINGS_DTYPE)

    # SGLang expects a 2D tensor for the precomputed_embedding format, while
    # the Encoder outputs 3D (1, num_patches, hidden_dim) for internal
    # consistency. Squeeze the batch dimension at the SGLang boundary; any
    # other shape is passed through unchanged.
    if precomputed.dim() == 3 and precomputed.shape[0] == 1:
        precomputed = precomputed.squeeze(0)

    grid_thw_tensor = torch.tensor(request.image_grid_thw)

    mm_item = {
        "format": "precomputed_embedding",
        "feature": precomputed,
        "image_grid_thw": grid_thw_tensor,
        "modality": "IMAGE",
    }
    return mm_item
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment