[Pixtral-Large] Pixtral actually has no bias in vision-lang adapter (#10449)

11fd7ea6 · Patrick von Platen · GitHub · f028dff3 · 11fd7ea6
Unverified commit 11fd7ea6, authored Nov 19, 2024 by Patrick von Platen; committed by GitHub on Nov 19, 2024
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 3 additions and 2 deletions

vllm/model_executor/models/pixtral.py +3 -2

No files found.
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -331,6 +331,7 @@ class VisionEncoderArgs:
    num_attention_heads: int
    rope_theta: float  # for rope-2D
    image_token_id: int
+    adapter_bias: bool = True


 def _reshape_for_broadcast(freqs_cis: torch.Tensor,
@@ -595,10 +596,10 @@ class VisionLanguageAdapter(nn.Module):
        self.w_in = nn.Linear(
            args.hidden_size,
            dim,
-            bias=True,
+            bias=args.adapter_bias,
        )
        self.gelu = nn.GELU()
-        self.w_out = nn.Linear(dim, dim, bias=True)
+        self.w_out = nn.Linear(dim, dim, bias=args.adapter_bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.w_out(self.gelu(self.w_in(x)))