@@ -576,3 +586,397 @@ class VisionLanguageAdapter(nn.Module):
...
@@ -576,3 +586,397 @@ class VisionLanguageAdapter(nn.Module):
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Run ``x`` through the adapter's three stages in sequence.

    Calls ``self.w_in`` on the input, passes that result to ``self.gelu``,
    and passes that result to ``self.w_out``.

    Args:
        x: Input tensor handed to ``self.w_in``.

    Returns:
        Whatever ``self.w_out`` returns for the activated intermediate
        value.
    """
    return self.w_out(self.gelu(self.w_in(x)))
#### HF Transformers version of Pixtral ####
# Based on https://github.com/huggingface/transformers/blob/d7950bff82b18c823193d17d72188c5e46d06c83/src/transformers/models/pixtral/modeling_pixtral.py
# This model follows the Llava family, meaning image embeddings are inserted
# in place of the `[IMG]` token placeholders.
# The model uses [`PixtralVisionModel`] for its vision encoder,
# and [`MistralForCausalLM`] for its language decoder.