[Bugfix] Fix weight loading for Chameleon when TP>1 (#7410)

7025b11d · Cyrus Leung · GitHub · 5469146b · 7025b11d · 7025b11d
Unverified commit 7025b11d, authored Aug 13, 2024 by Cyrus Leung; committed by GitHub on Aug 13, 2024.
20 changed files
--- a/vllm/model_executor/models/deepseek.py
+++ b/vllm/model_executor/models/deepseek.py
@@ -395,8 +395,11 @@ class DeepseekForCausalLM(nn.Module):
                                   attn_metadata)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -505,8 +505,11 @@ class DeepseekV2ForCausalLM(nn.Module):
                                   attn_metadata, intermediate_tensors)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -420,8 +420,11 @@ class FalconForCausalLM(nn.Module):
        )
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -287,8 +287,11 @@ class FuyuForCausalLM(nn.Module, SupportsVision):
        )
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.language_model.logits_processor(
            self.language_model.lm_head, hidden_states, sampling_metadata)
        return logits

--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -352,8 +352,11 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA):
                                   attn_metadata)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.model.embed_tokens, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/gemma2.py
+++ b/vllm/model_executor/models/gemma2.py
@@ -343,8 +343,11 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA):
                                   attn_metadata)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.model.embed_tokens, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -265,8 +265,11 @@ class GPT2LMHeadModel(nn.Module):
                                         attn_metadata, intermediate_tensors)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -279,8 +279,11 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA):
                                         attn_metadata)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -246,8 +246,11 @@ class GPTJForCausalLM(nn.Module):
                                         attn_metadata)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata, self.lm_head.bias)
        return logits

--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -258,8 +258,11 @@ class GPTNeoXForCausalLM(nn.Module):
                                      attn_metadata)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.embed_out, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -279,8 +279,11 @@ class InternLM2ForCausalLM(nn.Module):
                                   attn_metadata)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.output, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -466,8 +466,11 @@ class InternVLChatModel(nn.Module, SupportsVision):
                                                  inputs_embeds=inputs_embeds)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        return self.language_model.compute_logits(hidden_states,
                                                  sampling_metadata)

--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -295,8 +295,11 @@ class JAISLMHeadModel(nn.Module):
                                         attn_metadata)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/jamba.py
+++ b/vllm/model_executor/models/jamba.py
@@ -861,8 +861,11 @@ class JambaForCausalLM(nn.Module, HasInnerState):
                                        dtype=dtype,
                                        device="cuda"))
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -430,8 +430,11 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
                                  attn_metadata, intermediate_tensors)
        return model_output
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -355,8 +355,11 @@ class LlavaForConditionalGeneration(nn.Module, SupportsVision):
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        return self.language_model.compute_logits(hidden_states,
                                                  sampling_metadata)

--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -588,8 +588,11 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        return self.language_model.compute_logits(hidden_states,
                                                  sampling_metadata)

--- a/vllm/model_executor/models/medusa.py
+++ b/vllm/model_executor/models/medusa.py
@@ -65,22 +65,28 @@ class Medusa(nn.Module):
    def compute_logits(
            self, hidden_states: List[torch.Tensor],
            sampling_metadata: SamplingMetadata) -> List[torch.Tensor]:
-        logits = []
+        logits_lst: List[torch.Tensor] = []
        for hs, lm_head in zip(hidden_states, self.lm_heads):
            _logits = self.logits_processor(lm_head, hs, sampling_metadata)
+            if _logits is None:
+                # _logits should only be None on rank > 0, in which case
+                # it should remain true for every lm_head
+                assert len(logits_lst) == 0
+                continue
            if self.token_map is None:
-                logits.append(_logits)
+                logits_lst.append(_logits)
            else:
-                logits.append(-torch.inf * torch.ones(
+                logits_lst.append(-torch.inf * torch.ones(
                    size=(*_logits.shape[:-1], self.orig_vocab_size),
                    device=_logits.device,
                    dtype=_logits.dtype))
-                logits[-1][..., self.token_map] = _logits
+                logits_lst[-1][..., self.token_map] = _logits
-        return logits
+        return logits_lst
    def sample(
        self,

--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -470,8 +470,11 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA):
                                   attn_metadata, intermediate_tensors)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        hidden_states = hidden_states / self.scale_width
        if self.config.tie_word_embeddings:
            lm_head = self.model.embed_tokens

--- a/vllm/model_executor/models/minicpmv.py
+++ b/vllm/model_executor/models/minicpmv.py
@@ -630,8 +630,11 @@ class MiniCPMVBaseModel(nn.Module, SupportsVision):
        )
        return output
-    def compute_logits(self, hidden_states: torch.Tensor,
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits