Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
da1a844e
Unverified
Commit
da1a844e
authored
Sep 10, 2024
by
Cyrus Leung
Committed by
GitHub
Sep 10, 2024
Browse files
[Bugfix] Fix missing `post_layernorm` in CLIP (#8155)
parent
a1d87422
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
42 additions
and
19 deletions
+42
-19
vllm/model_executor/models/clip.py
vllm/model_executor/models/clip.py
+25
-4
vllm/model_executor/models/siglip.py
vllm/model_executor/models/siglip.py
+17
-15
No files found.
vllm/model_executor/models/clip.py
View file @
da1a844e
...
@@ -355,6 +355,19 @@ class CLIPVisionTransformer(nn.Module):
...
@@ -355,6 +355,19 @@ class CLIPVisionTransformer(nn.Module):
quant_config
=
quant_config
,
quant_config
=
quant_config
,
num_hidden_layers_override
=
num_hidden_layers_override
)
num_hidden_layers_override
=
num_hidden_layers_override
)
if
len
(
self
.
encoder
.
layers
)
>
config
.
num_hidden_layers
:
raise
ValueError
(
f
"The original encoder only has
{
config
.
num_hidden_layers
}
"
f
"layers, but you requested
{
len
(
self
.
encoder
.
layers
)
}
layers."
)
elif
len
(
self
.
encoder
.
layers
)
==
config
.
num_hidden_layers
:
self
.
post_layernorm
=
nn
.
LayerNorm
(
embed_dim
,
eps
=
config
.
layer_norm_eps
)
else
:
# post_layernorm is unused when we extract intermediate features
# In this case, we can skip it to conserve memory
self
.
post_layernorm
=
None
def
forward
(
def
forward
(
self
,
self
,
pixel_values
:
torch
.
Tensor
,
pixel_values
:
torch
.
Tensor
,
...
@@ -364,7 +377,10 @@ class CLIPVisionTransformer(nn.Module):
...
@@ -364,7 +377,10 @@ class CLIPVisionTransformer(nn.Module):
hidden_states
=
self
.
pre_layrnorm
(
hidden_states
)
hidden_states
=
self
.
pre_layrnorm
(
hidden_states
)
hidden_states
=
self
.
encoder
(
inputs_embeds
=
hidden_states
)
hidden_states
=
self
.
encoder
(
inputs_embeds
=
hidden_states
)
return
hidden_states
if
self
.
post_layernorm
is
None
:
return
hidden_states
return
self
.
post_layernorm
(
hidden_states
)
class
CLIPVisionModel
(
nn
.
Module
):
class
CLIPVisionModel
(
nn
.
Module
):
...
@@ -386,9 +402,12 @@ class CLIPVisionModel(nn.Module):
...
@@ -386,9 +402,12 @@ class CLIPVisionModel(nn.Module):
quant_config
=
quant_config
,
quant_config
=
quant_config
,
num_hidden_layers_override
=
num_hidden_layers_override
)
num_hidden_layers_override
=
num_hidden_layers_override
)
def
forward
(
self
,
pixel_values
:
Optional
[
torch
.
Tensor
]
=
None
):
@
property
def
_require_post_layernorm
(
self
)
->
bool
:
return
self
.
vision_model
.
post_layernorm
is
not
None
return
self
.
vision_model
(
pixel_values
=
pixel_values
)
def
forward
(
self
,
pixel_values
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
self
.
vision_model
(
pixel_values
)
@
property
@
property
def
device
(
self
):
def
device
(
self
):
...
@@ -408,8 +427,10 @@ class CLIPVisionModel(nn.Module):
...
@@ -408,8 +427,10 @@ class CLIPVisionModel(nn.Module):
for
name
,
loaded_weight
in
weights
:
for
name
,
loaded_weight
in
weights
:
# post_layernorm is not needed in CLIPVisionModel
# post_layernorm is not needed in CLIPVisionModel
if
"vision_model.post_layernorm"
in
name
:
if
(
"vision_model.post_layernorm"
in
name
and
not
self
.
_require_post_layernorm
):
continue
continue
# omit layers when num_hidden_layers_override is set
# omit layers when num_hidden_layers_override is set
if
"vision_model.encoder.layers."
in
name
:
if
"vision_model.encoder.layers."
in
name
:
layer_idx
=
int
(
name
.
split
(
"."
)[
3
])
layer_idx
=
int
(
name
.
split
(
"."
)[
3
])
...
...
vllm/model_executor/models/siglip.py
View file @
da1a844e
...
@@ -443,27 +443,26 @@ class SiglipVisionTransformer(nn.Module):
...
@@ -443,27 +443,26 @@ class SiglipVisionTransformer(nn.Module):
self
.
config
=
config
self
.
config
=
config
embed_dim
=
config
.
hidden_size
embed_dim
=
config
.
hidden_size
if
(
num_hidden_layers_override
is
None
or
num_hidden_layers_override
==
config
.
num_hidden_layers
):
self
.
need_post_layernorm
=
True
elif
num_hidden_layers_override
>
config
.
num_hidden_layers
:
raise
ValueError
(
"num_hidden_layers_override cannot be greater than "
"num_hidden_layers"
)
else
:
self
.
need_post_layernorm
=
False
self
.
embeddings
=
SiglipVisionEmbeddings
(
config
)
self
.
embeddings
=
SiglipVisionEmbeddings
(
config
)
self
.
encoder
=
SiglipEncoder
(
self
.
encoder
=
SiglipEncoder
(
config
,
config
,
quant_config
=
quant_config
,
quant_config
=
quant_config
,
num_hidden_layers_override
=
num_hidden_layers_override
,
num_hidden_layers_override
=
num_hidden_layers_override
,
)
)
if
self
.
need_post_layernorm
:
if
len
(
self
.
encoder
.
layers
)
>
config
.
num_hidden_layers
:
raise
ValueError
(
f
"The original encoder only has
{
config
.
num_hidden_layers
}
"
f
"layers, but you requested
{
len
(
self
.
encoder
.
layers
)
}
layers."
)
elif
len
(
self
.
encoder
.
layers
)
==
config
.
num_hidden_layers
:
self
.
post_layernorm
=
nn
.
LayerNorm
(
embed_dim
,
self
.
post_layernorm
=
nn
.
LayerNorm
(
embed_dim
,
eps
=
config
.
layer_norm_eps
)
eps
=
config
.
layer_norm_eps
)
else
:
else
:
self
.
post_layernorm
=
nn
.
Identity
()
# post_layernorm is unused when we extract intermediate features
# In this case, we can skip it to conserve memory
self
.
post_layernorm
=
None
self
.
use_head
=
(
True
if
not
hasattr
(
config
,
"vision_use_head"
)
else
self
.
use_head
=
(
True
if
not
hasattr
(
config
,
"vision_use_head"
)
else
config
.
vision_use_head
)
config
.
vision_use_head
)
if
self
.
use_head
:
if
self
.
use_head
:
...
@@ -482,6 +481,9 @@ class SiglipVisionTransformer(nn.Module):
...
@@ -482,6 +481,9 @@ class SiglipVisionTransformer(nn.Module):
encoder_outputs
=
self
.
encoder
(
inputs_embeds
=
hidden_states
)
encoder_outputs
=
self
.
encoder
(
inputs_embeds
=
hidden_states
)
if
self
.
post_layernorm
is
None
:
return
encoder_outputs
last_hidden_state
=
self
.
post_layernorm
(
encoder_outputs
)
last_hidden_state
=
self
.
post_layernorm
(
encoder_outputs
)
# TODO: add this back when pooled_output is used in inference
# TODO: add this back when pooled_output is used in inference
# if self.use_head:
# if self.use_head:
...
@@ -512,8 +514,8 @@ class SiglipVisionModel(nn.Module):
...
@@ -512,8 +514,8 @@ class SiglipVisionModel(nn.Module):
)
)
@
property
@
property
def
need
_post_layernorm
(
self
):
def
_require
_post_layernorm
(
self
)
->
bool
:
return
self
.
vision_model
.
need_
post_layernorm
return
self
.
vision_model
.
post_layernorm
is
not
None
def
get_input_embeddings
(
self
)
->
nn
.
Module
:
def
get_input_embeddings
(
self
)
->
nn
.
Module
:
return
self
.
vision_model
.
embeddings
.
patch_embedding
return
self
.
vision_model
.
embeddings
.
patch_embedding
...
@@ -541,7 +543,7 @@ class SiglipVisionModel(nn.Module):
...
@@ -541,7 +543,7 @@ class SiglipVisionModel(nn.Module):
for
name
,
loaded_weight
in
weights
:
for
name
,
loaded_weight
in
weights
:
# post_layernorm is optional in SiglipVisionModel
# post_layernorm is optional in SiglipVisionModel
if
(
"vision_model.post_layernorm"
in
name
if
(
"vision_model.post_layernorm"
in
name
and
not
self
.
need
_post_layernorm
):
and
not
self
.
_require
_post_layernorm
):
continue
continue
# omit layers when num_hidden_layers_override is set
# omit layers when num_hidden_layers_override is set
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment