[Models] Add remaining model PP support (#7168)

Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>

[Models] Add remaining model PP support (#7168)
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
Signed-off-by: Murali Andoorveedu <muralidhar.andoorveedu@centml.ai>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
0f6d7a9a · Murali Andoorveedu · GitHub · 303d4479 · 0f6d7a9a · 0f6d7a9a
Unverified commit 0f6d7a9a, authored Oct 03, 2024 by Murali Andoorveedu; committed by GitHub on Oct 04, 2024.
9 changed files
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -55,7 +55,6 @@ from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.interfaces import SupportsMultiModal
 from vllm.model_executor.models.qwen2 import Qwen2Model
 from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
                             MultiModalInputs)
@@ -68,6 +67,7 @@ from vllm.transformers_utils.configs.qwen2vl import (Qwen2VLConfig,
 from vllm.transformers_utils.processor import get_processor
 from vllm.utils import is_cpu
+from .interfaces import SupportsMultiModal, SupportsPP
 from .utils import (PPMissingLayer, is_pp_missing_parameter,
                    make_empty_intermediate_tensors_factory)
@@ -883,7 +883,8 @@ def input_processor_for_qwen2_vl(ctx: InputContext,
    "video", get_max_qwen2_vl_video_tokens)
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_vl)
 @INPUT_REGISTRY.register_input_processor(input_processor_for_qwen2_vl)
-class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal):
+class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal,
+                                      SupportsPP):
    def __init__(self,
                 config: Qwen2VLConfig,
@@ -1027,7 +1028,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal):
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        **kwargs: object,
-    ) -> SamplerOutput:
+    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run forward pass for Qwen2-VL.
        Args:
@@ -1047,16 +1048,18 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal):
            video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
                `None` if no videos are passed.
        """
+        if intermediate_tensors is not None:
+            input_ids = None
+            inputs_embeds = None
+        else:
            image_input = self._parse_and_validate_image_input(**kwargs)
            video_input = self._parse_and_validate_video_input(**kwargs)
-        if (image_input is None
+            if image_input is None and video_input is None:
-                and video_input is None) or not get_pp_group().is_first_rank:
                inputs_embeds = None
            else:
-            if getattr(self.config, "rope_scaling", {}).get("type",
+                rope_scaling = getattr(self.config, "rope_scaling", {})
-                                                            None) == "mrope":
+                if rope_scaling.get("type", None) == "mrope":
                    assert positions.ndim == 2 and positions.size(0) == 3, (
                        "multimodal section rotary embedding requires "
                        f"(3, seq_len) positions, but got {positions.size()}")

--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
--- a/vllm/model_executor/models/xverse.py
+++ b/vllm/model_executor/models/xverse.py