[Bugfix] Clean up MiniCPM-V (#6939)

Co-authored-by: hezhihui <hzh7269@modelbest.cn> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>

[Bugfix] Clean up MiniCPM-V (#6939)
Co-authored-by: hezhihui <hzh7269@modelbest.cn> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2f4e108f · Alphi · GitHub · 6512937d · 2f4e108f · 2f4e108f
Unverified Commit 2f4e108f authored Jul 31, 2024 by Alphi Committed by GitHub Jul 31, 2024
6 changed files
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -222,9 +222,13 @@ Vision Language Models
    -
  * - :code:`MiniCPM-V`
    - MiniCPM-V
-    - :code:`openbmb/MiniCPM-V-2`, :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc.
+    - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc.
    -

+.. note::
+  For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
+  For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
+
 ----

 If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.

--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -418,11 +418,9 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
-        input_embeds: Optional[torch.Tensor] = None
    ) -> Union[torch.Tensor, IntermediateTensors]:
        model_output = self.model(input_ids, positions, kv_caches,
-                                  attn_metadata, intermediate_tensors,
-                                  input_embeds)
+                                  attn_metadata, intermediate_tensors)
        return model_output

    def compute_logits(self, hidden_states: torch.Tensor,

--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -370,6 +370,7 @@ class MiniCPMModel(nn.Module):
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        if inputs_embeds is not None:
@@ -463,11 +464,10 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA):
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
-        input_embeds: Optional[torch.Tensor] = None,
        intermediate_tensors: Optional[IntermediateTensors] = None,
    ) -> torch.Tensor:
        hidden_states = self.model(input_ids, positions, kv_caches,
-                                   attn_metadata, input_embeds)
+                                   attn_metadata, intermediate_tensors)
        return hidden_states

    def compute_logits(self, hidden_states: torch.Tensor,

--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
--- a/vllm/model_executor/models/na_vit.py
+++ b/vllm/model_executor/models/na_vit.py
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -342,7 +342,7 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA):
        intermediate_tensors: Optional[IntermediateTensors] = None,
    ) -> torch.Tensor:
        hidden_states = self.model(input_ids, positions, kv_caches,
-                                   attn_metadata)
+                                   attn_metadata, intermediate_tensors)
        return hidden_states

    def compute_logits(self, hidden_states: torch.Tensor,