[Model][Gemma3] Cast image pixel values already on CPU (#18732)

Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>

[Model][Gemma3] Cast image pixel values already on CPU (#18732)
Signed-off-by: Lukas Geiger <lukas.geiger94@gmail.com>
b50602d5 · Lukas Geiger · GitHub · 1f1b1bc0 · b50602d5
Unverified commit b50602d5, authored May 27, 2025 by Lukas Geiger; committed by GitHub on May 27, 2025
Show whitespace changes
Inline Side-by-side

Showing with 6 additions and 3 deletions

vllm/model_executor/models/gemma3_mm.py vllm/model_executor/models/gemma3_mm.py +6 -3

No files found.
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -263,6 +263,11 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
            mm_data,
            mm_kwargs,
        )
+        if "pixel_values" in processed_outputs:
+            # Cast pixel values to the model dtype here, on the CPU,
+            # so that less data has to be transferred to the GPU.
+            processed_outputs["pixel_values"] = processed_outputs[
+                "pixel_values"].to(self.info.ctx.model_config.dtype)

        # HF processor pops the `num_crops` kwarg, which is needed by vLLM
        if (images := mm_data.get("images")) is not None:
@@ -543,9 +548,7 @@ class Gemma3ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP,
        vision_tower: SiglipVisionModel,
        pixel_values: torch.Tensor,
    ) -> torch.Tensor:
-        target_dtype = vision_tower.get_input_embeddings().weight.dtype
-        image_features = vision_tower(pixel_values.to(dtype=target_dtype))
-        return image_features
+        return vision_tower(pixel_values)

    def _process_image_input(
        self,