[V1][Molmo] Fix get_multimodal_embeddings() in molmo.py (#14161)

b3cf368d · lkchen · GitHub · c8525f06 · b3cf368d · b3cf368d
Unverified Commit b3cf368d authored Mar 04, 2025 by lkchen Committed by GitHub Mar 04, 2025
Hide whitespace changes
Inline Side-by-side

Showing 2 changed files with 6 additions and 2 deletions

vllm/model_executor/models/ultravox.py +3 -1

vllm/model_executor/models/whisper.py +3 -1

No files found.
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -476,7 +476,9 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
        return result
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
        audio_input = self._parse_and_validate_audio_input(**kwargs)
        if audio_input is None:
            return None

--- a/vllm/model_executor/models/whisper.py
+++ b/vllm/model_executor/models/whisper.py
@@ -692,7 +692,9 @@ class WhisperForConditionalGeneration(nn.Module, SupportsTranscription,
        )
        return decoder_outputs
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
        # TODO: This method does not obey the interface for SupportsMultiModal.
        # Refactor this once encoder/decoder support is implemented in V1.
        audio_input = self._parse_and_validate_audio_input(**kwargs)