[Bugfix] Fix weight loading for Chameleon when TP>1 (#7410)

Commit 7025b11d · Author: Cyrus Leung · Committer: GitHub · Related commit: 5469146b
Unverified commit 7025b11d, authored Aug 13, 2024 by Cyrus Leung; committed by GitHub on Aug 13, 2024.
19 changed files
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -375,8 +375,11 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA):
                                   attn_metadata, intermediate_tensors)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
+    def compute_logits(
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/mixtral_quant.py
+++ b/vllm/model_executor/models/mixtral_quant.py
@@ -362,8 +362,11 @@ class MixtralForCausalLM(nn.Module):
                                   attn_metadata)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
+    def compute_logits(
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -279,8 +279,11 @@ class MPTForCausalLM(nn.Module):
                                         attn_metadata)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
+    def compute_logits(
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -453,8 +453,11 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA):
                                  attn_metadata, intermediate_tensors)
        return model_output
-    def compute_logits(self, hidden_states: torch.Tensor,
+    def compute_logits(
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -311,8 +311,11 @@ class OlmoForCausalLM(nn.Module):
        )
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
+    def compute_logits(
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -323,8 +323,11 @@ class OPTForCausalLM(nn.Module):
                                   attn_metadata)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
+    def compute_logits(
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -277,8 +277,11 @@ class OrionForCausalLM(nn.Module):
                                   attn_metadata)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
+    def compute_logits(
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -262,8 +262,11 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsVision):
        return hidden_states
    # Copied from vllm/model_executor/models/gemma.py
-    def compute_logits(self, hidden_states: torch.Tensor,
+    def compute_logits(
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.language_model.embed_tokens,
                                       hidden_states, sampling_metadata)
        return logits

--- a/vllm/model_executor/models/persimmon.py
+++ b/vllm/model_executor/models/persimmon.py
@@ -285,8 +285,11 @@ class PersimmonForCausalLM(nn.Module):
        )
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
+    def compute_logits(
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -286,8 +286,11 @@ class PhiForCausalLM(nn.Module, SupportsLoRA):
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
+    def compute_logits(
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata, self.lm_head.bias)
        return logits

--- a/vllm/model_executor/models/phi3_small.py
+++ b/vllm/model_executor/models/phi3_small.py
@@ -399,8 +399,11 @@ class Phi3SmallForCausalLM(nn.Module):
    def get_decoder(self):
        return self.model
-    def compute_logits(self, hidden_states: torch.Tensor,
+    def compute_logits(
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        if self.dummy_token_indices is not None and logits is not None:

--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -584,8 +584,11 @@ class Phi3VForCausalLM(nn.Module, SupportsVision):
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
+    def compute_logits(
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -281,8 +281,11 @@ class QWenLMHeadModel(nn.Module):
                        device=device),
        })
-    def compute_logits(self, hidden_states: torch.Tensor,
+    def compute_logits(
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -362,8 +362,11 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA):
                                   attn_metadata, intermediate_tensors)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
+    def compute_logits(
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -400,8 +400,11 @@ class Qwen2MoeForCausalLM(nn.Module):
                                   attn_metadata, intermediate_tensors)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
+    def compute_logits(
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -258,8 +258,11 @@ class StablelmForCausalLM(nn.Module):
                                   attn_metadata)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
+    def compute_logits(
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -268,8 +268,11 @@ class Starcoder2ForCausalLM(nn.Module):
                                   attn_metadata)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
+    def compute_logits(
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/model_executor/models/xverse.py
+++ b/vllm/model_executor/models/xverse.py
@@ -328,8 +328,11 @@ class XverseForCausalLM(nn.Module, SupportsLoRA):
                                   attn_metadata)
        return hidden_states
-    def compute_logits(self, hidden_states: torch.Tensor,
+    def compute_logits(
-                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

--- a/vllm/outputs.py
+++ b/vllm/outputs.py
 import time
 from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
+from typing import List, Optional
+from typing import Sequence as GenericSequence
+from typing import Union
 from vllm.lora.request import LoRARequest
 from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs,
@@ -28,7 +30,7 @@ class CompletionOutput:
    index: int
    text: str
-    token_ids: Tuple[int, ...]
+    token_ids: GenericSequence[int]
    cumulative_logprob: Optional[float]
    logprobs: Optional[SampleLogprobs]
    finish_reason: Optional[str] = None
@@ -139,7 +141,7 @@ class RequestOutput:
            CompletionOutput(
                seqs.index(seq),
                seq.get_output_text_to_return(text_buffer_length),
-                seq.data._output_token_ids,  # type: ignore
+                seq.data._output_token_ids,
                seq.get_cumulative_logprob() if include_logprobs else None,
                seq.output_logprobs if include_logprobs else None,
                SequenceStatus.get_finished_reason(seq.status),