Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f5845496
Unverified
Commit
f5845496
authored
Oct 24, 2024
by
litianjian
Committed by
GitHub
Oct 24, 2024
Browse files
[Bugfix]Disable the post_norm layer of the vision encoder for LLaVA models (#9653)
parent
b979143d
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
8 additions
and
4 deletions
+8
-4
vllm/model_executor/models/llava.py
vllm/model_executor/models/llava.py
+2
-1
vllm/model_executor/models/llava_next.py
vllm/model_executor/models/llava_next.py
+2
-1
vllm/model_executor/models/llava_next_video.py
vllm/model_executor/models/llava_next_video.py
+2
-1
vllm/model_executor/models/llava_onevision.py
vllm/model_executor/models/llava_onevision.py
+2
-1
No files found.
vllm/model_executor/models/llava.py
View file @
f5845496
...
@@ -273,7 +273,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -273,7 +273,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
config
.
projector_hidden_act
=
"gelu"
config
.
projector_hidden_act
=
"gelu"
# TODO: Optionally initializes this for supporting embeddings.
# TODO: Optionally initializes this for supporting embeddings.
self
.
vision_tower
=
init_vision_tower_for_llava
(
config
,
quant_config
)
self
.
vision_tower
=
init_vision_tower_for_llava
(
config
,
quant_config
,
require_post_norm
=
False
)
self
.
multi_modal_projector
=
LlavaMultiModalProjector
(
self
.
multi_modal_projector
=
LlavaMultiModalProjector
(
vision_hidden_size
=
config
.
vision_config
.
hidden_size
,
vision_hidden_size
=
config
.
vision_config
.
hidden_size
,
text_hidden_size
=
config
.
text_config
.
hidden_size
,
text_hidden_size
=
config
.
text_config
.
hidden_size
,
...
...
vllm/model_executor/models/llava_next.py
View file @
f5845496
...
@@ -277,7 +277,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -277,7 +277,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
self
.
multimodal_config
=
multimodal_config
self
.
multimodal_config
=
multimodal_config
# TODO: Optionally initializes this for supporting embeddings.
# TODO: Optionally initializes this for supporting embeddings.
self
.
vision_tower
=
init_vision_tower_for_llava
(
config
,
quant_config
)
self
.
vision_tower
=
init_vision_tower_for_llava
(
config
,
quant_config
,
require_post_norm
=
False
)
self
.
image_newline
=
nn
.
Parameter
(
self
.
image_newline
=
nn
.
Parameter
(
torch
.
empty
(
config
.
text_config
.
hidden_size
))
torch
.
empty
(
config
.
text_config
.
hidden_size
))
self
.
multi_modal_projector
=
LlavaMultiModalProjector
(
self
.
multi_modal_projector
=
LlavaMultiModalProjector
(
...
...
vllm/model_executor/models/llava_next_video.py
View file @
f5845496
...
@@ -256,7 +256,8 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -256,7 +256,8 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
self
.
multimodal_config
=
multimodal_config
self
.
multimodal_config
=
multimodal_config
# Initialize the vision tower only up to the required feature layer
# Initialize the vision tower only up to the required feature layer
self
.
vision_tower
=
init_vision_tower_for_llava
(
config
,
quant_config
)
self
.
vision_tower
=
init_vision_tower_for_llava
(
config
,
quant_config
,
require_post_norm
=
False
)
self
.
vision_resampler
=
LlavaNextVideoPooler
(
config
)
self
.
vision_resampler
=
LlavaNextVideoPooler
(
config
)
self
.
multi_modal_projector
=
LlavaNextMultiModalProjector
(
self
.
multi_modal_projector
=
LlavaNextMultiModalProjector
(
vision_hidden_size
=
config
.
vision_config
.
hidden_size
,
vision_hidden_size
=
config
.
vision_config
.
hidden_size
,
...
...
vllm/model_executor/models/llava_onevision.py
View file @
f5845496
...
@@ -400,7 +400,8 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -400,7 +400,8 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
self
.
multimodal_config
=
multimodal_config
self
.
multimodal_config
=
multimodal_config
# Initialize the vision tower only up to the required feature layer
# Initialize the vision tower only up to the required feature layer
self
.
vision_tower
=
init_vision_tower_for_llava
(
config
,
quant_config
)
self
.
vision_tower
=
init_vision_tower_for_llava
(
config
,
quant_config
,
require_post_norm
=
False
)
self
.
multi_modal_projector
=
LlavaOnevisionMultiModalProjector
(
config
)
self
.
multi_modal_projector
=
LlavaOnevisionMultiModalProjector
(
config
)
self
.
language_model
=
init_vllm_registered_model
(
self
.
language_model
=
init_vllm_registered_model
(
config
.
text_config
,
cache_config
,
quant_config
)
config
.
text_config
,
cache_config
,
quant_config
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment