Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e661d594
Commit
e661d594
authored
Aug 12, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.5.4' into v0.5.4-dtk24.04.1
parents
6b16ea2e
4db5176d
Changes
374
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
4567 additions
and
192 deletions
+4567
-192
vllm/model_executor/models/__init__.py
vllm/model_executor/models/__init__.py
+7
-4
vllm/model_executor/models/blip.py
vllm/model_executor/models/blip.py
+269
-0
vllm/model_executor/models/blip2.py
vllm/model_executor/models/blip2.py
+669
-0
vllm/model_executor/models/chameleon.py
vllm/model_executor/models/chameleon.py
+10
-3
vllm/model_executor/models/deepseek_v2.py
vllm/model_executor/models/deepseek_v2.py
+114
-39
vllm/model_executor/models/fuyu.py
vllm/model_executor/models/fuyu.py
+1
-1
vllm/model_executor/models/gemma.py
vllm/model_executor/models/gemma.py
+3
-3
vllm/model_executor/models/gemma2.py
vllm/model_executor/models/gemma2.py
+11
-5
vllm/model_executor/models/idefics2_vision_model.py
vllm/model_executor/models/idefics2_vision_model.py
+296
-0
vllm/model_executor/models/intern_vit.py
vllm/model_executor/models/intern_vit.py
+279
-0
vllm/model_executor/models/internlm2.py
vllm/model_executor/models/internlm2.py
+9
-1
vllm/model_executor/models/internvl.py
vllm/model_executor/models/internvl.py
+445
-0
vllm/model_executor/models/jamba.py
vllm/model_executor/models/jamba.py
+54
-108
vllm/model_executor/models/llama.py
vllm/model_executor/models/llama.py
+5
-0
vllm/model_executor/models/llava_next.py
vllm/model_executor/models/llava_next.py
+22
-26
vllm/model_executor/models/minicpm.py
vllm/model_executor/models/minicpm.py
+7
-2
vllm/model_executor/models/minicpmv.py
vllm/model_executor/models/minicpmv.py
+1026
-0
vllm/model_executor/models/na_vit.py
vllm/model_executor/models/na_vit.py
+804
-0
vllm/model_executor/models/nemotron.py
vllm/model_executor/models/nemotron.py
+531
-0
vllm/model_executor/models/olmo.py
vllm/model_executor/models/olmo.py
+5
-0
No files found.
vllm/model_executor/models/__init__.py
View file @
e661d594
...
@@ -16,8 +16,8 @@ _GENERATION_MODELS = {
...
@@ -16,8 +16,8 @@ _GENERATION_MODELS = {
"BaiChuanForCausalLM"
:
(
"baichuan"
,
"BaiChuanForCausalLM"
),
# baichuan-7b
"BaiChuanForCausalLM"
:
(
"baichuan"
,
"BaiChuanForCausalLM"
),
# baichuan-7b
"BaichuanForCausalLM"
:
(
"baichuan"
,
"BaichuanForCausalLM"
),
# baichuan-13b
"BaichuanForCausalLM"
:
(
"baichuan"
,
"BaichuanForCausalLM"
),
# baichuan-13b
"BloomForCausalLM"
:
(
"bloom"
,
"BloomForCausalLM"
),
"BloomForCausalLM"
:
(
"bloom"
,
"BloomForCausalLM"
),
#TODO(ywang96): remove this when huggingface fixes the model repo
"Blip2ForConditionalGeneration"
:
"ChameleonForCausalLM"
:
(
"chameleon"
,
"Chameleon
ForConditionalGeneration"
),
(
"blip2"
,
"Blip2
ForConditionalGeneration"
),
"ChameleonForConditionalGeneration"
:
"ChameleonForConditionalGeneration"
:
(
"chameleon"
,
"ChameleonForConditionalGeneration"
),
(
"chameleon"
,
"ChameleonForConditionalGeneration"
),
"ChatGLMModel"
:
(
"chatglm"
,
"ChatGLMForCausalLM"
),
"ChatGLMModel"
:
(
"chatglm"
,
"ChatGLMForCausalLM"
),
...
@@ -37,6 +37,7 @@ _GENERATION_MODELS = {
...
@@ -37,6 +37,7 @@ _GENERATION_MODELS = {
"GPTNeoXForCausalLM"
:
(
"gpt_neox"
,
"GPTNeoXForCausalLM"
),
"GPTNeoXForCausalLM"
:
(
"gpt_neox"
,
"GPTNeoXForCausalLM"
),
"InternLMForCausalLM"
:
(
"llama"
,
"LlamaForCausalLM"
),
"InternLMForCausalLM"
:
(
"llama"
,
"LlamaForCausalLM"
),
"InternLM2ForCausalLM"
:
(
"internlm2"
,
"InternLM2ForCausalLM"
),
"InternLM2ForCausalLM"
:
(
"internlm2"
,
"InternLM2ForCausalLM"
),
"InternVLChatModel"
:
(
"internvl"
,
"InternVLChatModel"
),
"JAISLMHeadModel"
:
(
"jais"
,
"JAISLMHeadModel"
),
"JAISLMHeadModel"
:
(
"jais"
,
"JAISLMHeadModel"
),
"LlamaForCausalLM"
:
(
"llama"
,
"LlamaForCausalLM"
),
"LlamaForCausalLM"
:
(
"llama"
,
"LlamaForCausalLM"
),
"LlavaForConditionalGeneration"
:
"LlavaForConditionalGeneration"
:
...
@@ -52,12 +53,14 @@ _GENERATION_MODELS = {
...
@@ -52,12 +53,14 @@ _GENERATION_MODELS = {
"MptForCausalLM"
:
(
"mpt"
,
"MPTForCausalLM"
),
"MptForCausalLM"
:
(
"mpt"
,
"MPTForCausalLM"
),
"MPTForCausalLM"
:
(
"mpt"
,
"MPTForCausalLM"
),
"MPTForCausalLM"
:
(
"mpt"
,
"MPTForCausalLM"
),
"MiniCPMForCausalLM"
:
(
"minicpm"
,
"MiniCPMForCausalLM"
),
"MiniCPMForCausalLM"
:
(
"minicpm"
,
"MiniCPMForCausalLM"
),
"MiniCPMV"
:
(
"minicpmv"
,
"MiniCPMV"
),
"NemotronForCausalLM"
:
(
"nemotron"
,
"NemotronForCausalLM"
),
"OlmoForCausalLM"
:
(
"olmo"
,
"OlmoForCausalLM"
),
"OlmoForCausalLM"
:
(
"olmo"
,
"OlmoForCausalLM"
),
"OPTForCausalLM"
:
(
"opt"
,
"OPTForCausalLM"
),
"OPTForCausalLM"
:
(
"opt"
,
"OPTForCausalLM"
),
"OrionForCausalLM"
:
(
"orion"
,
"OrionForCausalLM"
),
"OrionForCausalLM"
:
(
"orion"
,
"OrionForCausalLM"
),
"PersimmonForCausalLM"
:
(
"persimmon"
,
"PersimmonForCausalLM"
),
"PersimmonForCausalLM"
:
(
"persimmon"
,
"PersimmonForCausalLM"
),
"PaliGemmaForConditionalGeneration"
:
"PaliGemmaForConditionalGeneration"
:
(
"paligemma"
,
(
"paligemma"
,
"PaliGemmaForConditionalGeneration"
),
"PaliGemmaForConditionalGeneration"
),
"PhiForCausalLM"
:
(
"phi"
,
"PhiForCausalLM"
),
"PhiForCausalLM"
:
(
"phi"
,
"PhiForCausalLM"
),
"Phi3ForCausalLM"
:
(
"llama"
,
"LlamaForCausalLM"
),
"Phi3ForCausalLM"
:
(
"llama"
,
"LlamaForCausalLM"
),
"Phi3VForCausalLM"
:
(
"phi3v"
,
"Phi3VForCausalLM"
),
"Phi3VForCausalLM"
:
(
"phi3v"
,
"Phi3VForCausalLM"
),
...
...
vllm/model_executor/models/blip.py
0 → 100644
View file @
e661d594
"""Minimal implementation of BlipVisionModel intended to be only used
within a vision language model."""
from
typing
import
Optional
,
Union
import
torch
import
torch.nn
as
nn
from
PIL
import
Image
from
transformers
import
Blip2VisionConfig
,
BlipVisionConfig
from
transformers.models.blip.modeling_blip
import
BlipAttention
from
vllm.config
import
ModelConfig
from
vllm.inputs
import
LLMInputs
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
RowParallelLinear
)
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.multimodal.image
import
(
cached_get_tokenizer
,
repeat_and_pad_image_tokens
)
from
vllm.sequence
import
SequenceData
def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
    """Return the number of patches along one side of the image.

    Args:
        image_size: Side length of the image.
        patch_size: Side length of a single patch; it must divide
            ``image_size`` exactly.

    Returns:
        ``image_size // patch_size``.

    Raises:
        AssertionError: If ``image_size`` is not a multiple of
            ``patch_size`` (only checked when assertions are enabled).
    """
    grid_length, leftover = divmod(image_size, patch_size)
    assert leftover == 0
    return grid_length
def get_blip_num_patches(*, image_size: int, patch_size: int) -> int:
    """Return the total number of patches in the square patch grid.

    The per-side patch count comes from ``get_blip_patch_grid_length`` and
    is squared here.

    Args:
        image_size: Side length of the image.
        patch_size: Side length of a single patch.

    Returns:
        The grid length multiplied by itself.
    """
    side = get_blip_patch_grid_length(image_size=image_size,
                                      patch_size=patch_size)
    return side * side
def get_blip_image_feature_size(
        hf_config: Union[BlipVisionConfig, Blip2VisionConfig], ) -> int:
    """Return the number of patches produced for one image.

    Args:
        hf_config: Vision config; its ``image_size`` and ``patch_size``
            attributes are read.

    Returns:
        The result of ``get_blip_num_patches`` for those two values.
    """
    image_size = hf_config.image_size
    patch_size = hf_config.patch_size
    return get_blip_num_patches(image_size=image_size, patch_size=patch_size)
def get_max_blip_image_tokens(
        hf_config: Union[BlipVisionConfig, Blip2VisionConfig], ) -> int:
    """Return the maximum number of image tokens for one image.

    This is the same value as ``get_blip_image_feature_size(hf_config)``.
    """
    max_tokens = get_blip_image_feature_size(hf_config)
    return max_tokens
def dummy_seq_data_for_blip(
    hf_config: Union[BlipVisionConfig, Blip2VisionConfig],
    seq_len: int,
    *,
    image_token_id: int,
    image_feature_size_override: Optional[int] = None,
):
    """Build dummy sequence data: image placeholder tokens, then zeros.

    Args:
        hf_config: Vision config used to derive the image feature size when
            no override is given.
        seq_len: Target total number of token ids.
        image_token_id: Token id repeated once per image feature.
        image_feature_size_override: If not ``None``, used instead of the
            size derived from ``hf_config``.

    Returns:
        A ``SequenceData`` wrapping the generated token id list.
    """
    image_feature_size = (get_blip_image_feature_size(hf_config)
                          if image_feature_size_override is None else
                          image_feature_size_override)

    image_part = [image_token_id] * image_feature_size
    # A non-positive multiplier yields an empty list, so no padding is added
    # when the image tokens alone already reach ``seq_len``.
    padding_part = [0] * (seq_len - image_feature_size)
    return SequenceData(image_part + padding_part)
def dummy_image_for_blip(
    hf_config: Union[BlipVisionConfig, Blip2VisionConfig],
    *,
    image_width_override: Optional[int] = None,
    image_height_override: Optional[int] = None,
):
    """Create an all-zero RGB dummy image wrapped in a multi-modal dict.

    Args:
        hf_config: Vision config; ``image_size`` is the default for both
            width and height.
        image_width_override: Width to use instead of the default, if given.
        image_height_override: Height to use instead of the default, if
            given.

    Returns:
        ``{"image": <PIL image>}``.
    """
    default_side = hf_config.image_size
    width = (default_side
             if image_width_override is None else image_width_override)
    height = (default_side
              if image_height_override is None else image_height_override)

    return {"image": Image.new("RGB", (width, height), color=0)}
def input_processor_for_blip(
    model_config: ModelConfig,
    hf_config: Union[BlipVisionConfig, Blip2VisionConfig],
    llm_inputs: LLMInputs,
    *,
    image_token_id: int,
    image_feature_size_override: Optional[int] = None,
):
    """Expand the image token in the prompt to the image feature size.

    Inputs without image data are returned unchanged. Otherwise the prompt
    and its token ids are passed through ``repeat_and_pad_image_tokens``
    with ``repeat_count`` set to the image feature size.

    Args:
        model_config: Supplies the tokenizer identifier
            (``model_config.tokenizer``).
        hf_config: Vision config used to derive the image feature size when
            no override is given.
        llm_inputs: The inputs to process.
        image_token_id: Id of the image placeholder token.
        image_feature_size_override: If not ``None``, used instead of the
            size derived from ``hf_config``.

    Returns:
        Either the original ``llm_inputs`` (no image present) or a new
        ``LLMInputs`` with the updated prompt and token ids.
    """
    multi_modal_data = llm_inputs.get("multi_modal_data")
    has_image = multi_modal_data is not None and "image" in multi_modal_data
    if not has_image:
        return llm_inputs

    tokenizer = cached_get_tokenizer(model_config.tokenizer)

    image_feature_size = (get_blip_image_feature_size(hf_config)
                          if image_feature_size_override is None else
                          image_feature_size_override)

    new_prompt, new_token_ids = repeat_and_pad_image_tokens(
        tokenizer,
        llm_inputs.get("prompt"),
        llm_inputs["prompt_token_ids"],
        image_token_id=image_token_id,
        repeat_count=image_feature_size,
    )

    # NOTE: Build a fresh LLMInputs rather than mutating the caller's object.
    return LLMInputs(prompt_token_ids=new_token_ids,
                     prompt=new_prompt,
                     multi_modal_data=multi_modal_data)
# Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa
class BlipVisionEmbeddings(nn.Module):
    """Patch, class-token and position embeddings for the BLIP vision tower.

    Pixel values go through a strided convolution (kernel and stride both
    equal to the patch size), a learned class embedding is prepended, and a
    learned position embedding is added.
    """

    def __init__(self, config: BlipVisionConfig):
        super().__init__()

        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=3,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
        )

        self.num_patches = get_blip_num_patches(image_size=self.image_size,
                                                patch_size=self.patch_size)
        # One extra position for the class token.
        self.num_positions = self.num_patches + 1

        self.position_embedding = nn.Parameter(
            torch.randn(1, self.num_positions, self.embed_dim))

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """Embed ``pixel_values`` into a sequence of token embeddings.

        Args:
            pixel_values: Image batch; the first dimension is the batch
                size. It is cast to the convolution weight's dtype.

        Returns:
            Class token followed by the flattened patch embeddings, with
            position embeddings added.
        """
        num_images = pixel_values.shape[0]
        conv_dtype = self.patch_embedding.weight.dtype

        # shape = [*, width, grid, grid]
        patch_tokens = self.patch_embedding(pixel_values.to(dtype=conv_dtype))
        # Collapse the spatial grid and move it in front of the channels.
        patch_tokens = patch_tokens.flatten(2).transpose(1, 2)

        cls_tokens = self.class_embedding.expand(num_images, 1, -1)
        tokens = torch.cat([cls_tokens, patch_tokens], dim=1)

        positions = self.position_embedding.to(conv_dtype)
        # Use only as many positions as there are tokens.
        tokens = tokens + positions[:, :tokens.size(1), :]

        return tokens
class BlipMLP(nn.Module):
    """Two-layer feed-forward block of a BLIP encoder layer.

    ``fc1`` (column-parallel) expands ``hidden_size`` to
    ``intermediate_size``, the configured activation is applied, and ``fc2``
    (row-parallel) projects back to ``hidden_size``.
    """

    def __init__(self,
                 config: BlipVisionConfig,
                 quant_config: Optional[QuantizationConfig] = None):
        super().__init__()

        self.config = config

        self.activation_fn = get_act_fn(config.hidden_act)
        self.fc1 = ColumnParallelLinear(config.hidden_size,
                                        config.intermediate_size,
                                        bias=True,
                                        quant_config=quant_config)
        self.fc2 = RowParallelLinear(config.intermediate_size,
                                     config.hidden_size,
                                     bias=True,
                                     quant_config=quant_config)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply ``fc1`` -> activation -> ``fc2`` to ``hidden_states``."""
        # Both linear layers return a tuple; only the first element (the
        # projected tensor) is used here.
        expanded, _ = self.fc1(hidden_states)
        activated = self.activation_fn(expanded)
        projected, _ = self.fc2(activated)

        return projected
class BlipEncoderLayer(nn.Module):
    """One pre-norm BLIP encoder layer: attention then MLP, each residual.

    The attention module is the Hugging Face ``BlipAttention``; only the MLP
    receives ``quant_config``.
    """

    def __init__(self,
                 config: BlipVisionConfig,
                 quant_config: Optional[QuantizationConfig] = None):
        super().__init__()

        self.self_attn = BlipAttention(config)
        self.layer_norm1 = nn.LayerNorm(config.hidden_size,
                                        eps=config.layer_norm_eps)
        self.mlp = BlipMLP(config, quant_config=quant_config)
        self.layer_norm2 = nn.LayerNorm(config.hidden_size,
                                        eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Run the attention and MLP sub-blocks with residual connections."""
        # Attention sub-block: normalise, attend, add the input back.
        attn_input = self.layer_norm1(hidden_states)
        # The attention module returns a tuple; the second element is unused.
        attn_output, _ = self.self_attn(hidden_states=attn_input)
        after_attn = hidden_states + attn_output

        # MLP sub-block: normalise, transform, add the input back.
        mlp_input = self.layer_norm2(after_attn)
        mlp_output = self.mlp(mlp_input)

        return after_attn + mlp_output
class BlipEncoder(nn.Module):
    """Stack of ``BlipEncoderLayer`` modules applied one after another.

    Args:
        config: Vision config; ``config.num_hidden_layers`` gives the default
            number of layers.
        quant_config: Forwarded to every layer.
        num_hidden_layers_override: If not ``None``, the number of layers to
            build instead of ``config.num_hidden_layers``.
    """

    def __init__(self,
                 config: BlipVisionConfig,
                 quant_config: Optional[QuantizationConfig] = None,
                 num_hidden_layers_override: Optional[int] = None):
        super().__init__()

        self.config = config

        num_hidden_layers = (config.num_hidden_layers
                             if num_hidden_layers_override is None else
                             num_hidden_layers_override)

        layers = [
            BlipEncoderLayer(config=config, quant_config=quant_config)
            for _ in range(num_hidden_layers)
        ]
        self.layers = nn.ModuleList(layers)

    def forward(self, inputs_embeds: torch.Tensor):
        """Feed ``inputs_embeds`` through every layer and return the result."""
        states = inputs_embeds
        for layer in self.layers:
            states = layer(states)

        return states
class BlipVisionModel(nn.Module):
    """BLIP vision tower: embeddings, encoder stack and a final LayerNorm."""

    config_class = BlipVisionConfig
    main_input_name = "pixel_values"

    def __init__(self,
                 config: BlipVisionConfig,
                 quant_config: Optional[QuantizationConfig] = None,
                 num_hidden_layers_override: Optional[int] = None):
        super().__init__()

        self.config = config

        self.embeddings = BlipVisionEmbeddings(config)
        self.encoder = BlipEncoder(
            config=config,
            quant_config=quant_config,
            num_hidden_layers_override=num_hidden_layers_override,
        )
        self.post_layernorm = nn.LayerNorm(config.hidden_size,
                                           eps=config.layer_norm_eps)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """Encode ``pixel_values`` and return the layer-normalised states."""
        embedded = self.embeddings(pixel_values)
        encoded = self.encoder(inputs_embeds=embedded)

        return self.post_layernorm(encoded)
vllm/model_executor/models/blip2.py
0 → 100644
View file @
e661d594
from
typing
import
Iterable
,
List
,
Literal
,
Optional
,
Tuple
,
TypedDict
import
torch
import
torch.nn
as
nn
from
transformers
import
(
Blip2Config
,
Blip2QFormerConfig
,
Blip2VisionConfig
,
apply_chunking_to_forward
)
from
vllm.attention
import
AttentionMetadata
from
vllm.config
import
CacheConfig
,
MultiModalConfig
from
vllm.inputs
import
INPUT_REGISTRY
,
InputContext
,
LLMInputs
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.opt
import
OPTModel
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
,
SequenceData
from
.blip
import
(
BlipVisionModel
,
dummy_image_for_blip
,
get_max_blip_image_tokens
)
from
.interfaces
import
SupportsVision
from
.utils
import
merge_vision_embeddings
# Substring renames for checkpoint weight names: every key found in a weight
# name is replaced by its value (see `load_weights` in this module).
_KEYS_TO_MODIFY_MAPPING = {
    "language_model.lm_head": "lm_head",
    "language_model.model": "language_model",
}
class Blip2QFormerMultiHeadAttention(nn.Module):
    """Multi-head attention used by the BLIP-2 Q-Former.

    Works as self-attention, or as cross-attention when encoder hidden
    states are passed to ``forward``. Only the ``"absolute"`` position
    embedding type is accepted.

    ``quant_config`` and ``cache_config`` are part of the constructor
    signature but are not used inside this class.
    """

    def __init__(
        self,
        config: Blip2QFormerConfig,
        *,
        quant_config: Optional[QuantizationConfig],
        cache_config: Optional[CacheConfig],
        is_cross_attention: bool = False,
    ) -> None:
        super().__init__()

        self.config = config

        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of "
                f"the number of attention heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = (config.hidden_size //
                                    config.num_attention_heads)
        self.all_head_size = (self.num_attention_heads *
                              self.attention_head_size)
        self.scaling = self.attention_head_size**-0.5

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        # Cross-attention keys/values are projected from the encoder width.
        kv_hidden_size = (config.encoder_hidden_size
                          if is_cross_attention else config.hidden_size)
        self.key = nn.Linear(kv_hidden_size, self.all_head_size)
        self.value = nn.Linear(kv_hidden_size, self.all_head_size)

        self.position_embedding_type = getattr(config,
                                               "position_embedding_type",
                                               "absolute")
        if self.position_embedding_type != "absolute":
            raise NotImplementedError("Unsupported position_embedding_type: "
                                      f"{self.position_embedding_type}")

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        """Split the last dimension into heads and swap dims 1 and 2."""
        split_shape = (*x.size()[:-1], self.num_attention_heads,
                       self.attention_head_size)
        return x.view(*split_shape).permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
    ):
        """Compute attention over ``hidden_states``.

        Args:
            hidden_states: Source of the queries (and of keys/values for
                self-attention).
            encoder_hidden_states: If not ``None``, keys and values are
                projected from this tensor instead (cross-attention).

        Returns:
            The attended context with the heads merged back into the last
            dimension.
        """
        kv_source = (hidden_states if encoder_hidden_states is None else
                     encoder_hidden_states)
        key_layer = self.transpose_for_scores(self.key(kv_source))
        value_layer = self.transpose_for_scores(self.value(kv_source))

        query_layer = self.transpose_for_scores(self.query(hidden_states))

        scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        probs = torch.softmax(scores * self.scaling, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        probs_dropped = self.dropout(probs)

        context = torch.matmul(probs_dropped, value_layer)
        context = context.permute(0, 2, 1, 3).contiguous()

        merged_shape = (*context.size()[:-2], self.all_head_size)
        return context.view(*merged_shape)
class Blip2QFormerSelfOutput(nn.Module):
    """Output projection applied after Q-Former attention.

    Applies a linear layer and dropout, then a LayerNorm over the sum of the
    result and the residual input.
    """

    def __init__(self, config: Blip2QFormerConfig) -> None:
        super().__init__()

        hidden_size = config.hidden_size
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_tensor: torch.Tensor,
    ) -> torch.Tensor:
        """Project ``hidden_states`` and normalise it with the residual.

        Args:
            hidden_states: Tensor to project.
            input_tensor: Residual added before the LayerNorm.
        """
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
class Blip2QFormerAttention(nn.Module):
    """Q-Former attention block: multi-head attention plus output layer."""

    def __init__(
        self,
        config: Blip2QFormerConfig,
        *,
        quant_config: Optional[QuantizationConfig],
        cache_config: Optional[CacheConfig],
        is_cross_attention: bool = False,
    ) -> None:
        super().__init__()

        self.attention = Blip2QFormerMultiHeadAttention(
            config,
            quant_config=quant_config,
            cache_config=cache_config,
            is_cross_attention=is_cross_attention,
        )

        self.output = Blip2QFormerSelfOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
    ) -> Tuple[torch.Tensor]:
        """Attend, then apply the output layer with ``hidden_states`` as
        the residual.

        Args:
            hidden_states: Query-side input; also used as the residual.
            encoder_hidden_states: Optional key/value source forwarded to
                the attention module.

        Returns:
            The output of the ``Blip2QFormerSelfOutput`` layer.
        """
        attended = self.attention(
            hidden_states,
            encoder_hidden_states=encoder_hidden_states,
        )
        return self.output(attended, hidden_states)
class Blip2QFormerIntermediate(nn.Module):
    """First half of the Q-Former feed-forward: linear layer + activation.

    The linear layer maps ``hidden_size`` to ``intermediate_size``; the
    activation is looked up from ``config.hidden_act``.
    """

    def __init__(self, config: Blip2QFormerConfig) -> None:
        super().__init__()

        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        self.intermediate_act_fn = get_act_fn(config.hidden_act)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return ``activation(dense(hidden_states))``."""
        return self.intermediate_act_fn(self.dense(hidden_states))
class Blip2QFormerOutput(nn.Module):
    """Second half of the Q-Former feed-forward.

    Projects ``intermediate_size`` back to ``hidden_size``, applies dropout,
    and LayerNorms the sum with the residual input.
    """

    def __init__(self, config: Blip2QFormerConfig) -> None:
        super().__init__()

        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size,
                                      eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_tensor: torch.Tensor,
    ) -> torch.Tensor:
        """Project ``hidden_states`` and normalise it with the residual.

        Args:
            hidden_states: Output of the intermediate layer.
            input_tensor: Residual added before the LayerNorm.
        """
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
class Blip2QFormerLayer(nn.Module):
    """One Q-Former layer: self-attention, optional cross-attention, FFN.

    A cross-attention block is created when ``layer_idx`` is a multiple of
    ``config.cross_attention_frequency``.

    NOTE(review): ``feed_forward_chunk`` reads ``self.intermediate`` and
    ``self.output``, but ``__init__`` only creates ``intermediate_query``
    and ``output_query``. That path is reached when ``query_length`` is not
    positive or the sequence is longer than ``query_length``; confirm
    callers never hit it.
    """

    def __init__(
        self,
        config: Blip2QFormerConfig,
        *,
        quant_config: Optional[QuantizationConfig],
        cache_config: Optional[CacheConfig],
        layer_idx: int,
    ) -> None:
        super().__init__()

        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = Blip2QFormerAttention(config,
                                               quant_config=quant_config,
                                               cache_config=cache_config)

        self.layer_idx = layer_idx

        uses_cross_attention = (
            layer_idx % config.cross_attention_frequency == 0)
        if uses_cross_attention:
            self.crossattention = Blip2QFormerAttention(
                config,
                quant_config=quant_config,
                cache_config=cache_config,
                is_cross_attention=True)
        self.has_cross_attention = uses_cross_attention

        self.intermediate_query = Blip2QFormerIntermediate(config)
        self.output_query = Blip2QFormerOutput(config)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor,
        query_length: int,
    ):
        """Run the layer on ``hidden_states``.

        Args:
            hidden_states: Input sequence; the first ``query_length``
                positions along dim 1 are treated as query tokens.
            encoder_hidden_states: Key/value source for cross-attention.
            query_length: Number of leading query positions.

        Returns:
            The feed-forward output; when the sequence is longer than
            ``query_length`` the query part and the remainder are processed
            separately and concatenated along dim 1.
        """
        attention_output = self.attention(hidden_states)

        if query_length <= 0:
            # No query tokens: the whole sequence takes the plain FFN path.
            return apply_chunking_to_forward(
                self.feed_forward_chunk,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                attention_output,
            )

        query_part = attention_output[:, :query_length, :]
        if self.has_cross_attention:
            query_part = self.crossattention(
                query_part,
                encoder_hidden_states=encoder_hidden_states,
            )

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk_query,
            self.chunk_size_feed_forward,
            self.seq_len_dim,
            query_part,
        )

        if attention_output.shape[1] <= query_length:
            return layer_output

        # Positions after the query tokens go through the plain FFN path.
        remainder_output = apply_chunking_to_forward(
            self.feed_forward_chunk,
            self.chunk_size_feed_forward,
            self.seq_len_dim,
            attention_output[:, query_length:, :],
        )
        return torch.cat([layer_output, remainder_output], dim=1)

    def feed_forward_chunk(self,
                           attention_output: torch.Tensor) -> torch.Tensor:
        """Apply ``self.intermediate`` then ``self.output`` to one chunk."""
        expanded = self.intermediate(attention_output)
        return self.output(expanded, attention_output)

    def feed_forward_chunk_query(
            self, attention_output: torch.Tensor) -> torch.Tensor:
        """Apply the query-specific FFN modules to one chunk."""
        expanded = self.intermediate_query(attention_output)
        return self.output_query(expanded, attention_output)
class Blip2QFormerEncoder(nn.Module):
    """Stack of ``config.num_hidden_layers`` ``Blip2QFormerLayer`` modules."""

    def __init__(
        self,
        config: Blip2QFormerConfig,
        *,
        quant_config: Optional[QuantizationConfig],
        cache_config: Optional[CacheConfig],
    ) -> None:
        super().__init__()

        self.config = config

        layers = [
            Blip2QFormerLayer(config,
                              quant_config=quant_config,
                              cache_config=cache_config,
                              layer_idx=layer_idx)
            for layer_idx in range(config.num_hidden_layers)
        ]
        self.layer = nn.ModuleList(layers)

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor,
        query_length: int,
    ) -> torch.Tensor:
        """Pass ``hidden_states`` through every layer in order.

        ``encoder_hidden_states`` and ``query_length`` are forwarded
        unchanged to each layer.
        """
        num_layers = self.config.num_hidden_layers
        for layer_idx in range(num_layers):
            hidden_states = self.layer[layer_idx](
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                query_length=query_length,
            )

        return hidden_states
# Adapted from https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1025
class Blip2QFormerModel(nn.Module):
    """BLIP-2 Q-Former: LayerNorm and dropout on the query embeddings,
    followed by the Q-Former encoder."""

    def __init__(
        self,
        config: Blip2QFormerConfig,
        *,
        quant_config: Optional[QuantizationConfig],
        cache_config: Optional[CacheConfig],
    ) -> None:
        super().__init__()

        self.config = config

        self.layernorm = nn.LayerNorm(config.hidden_size,
                                      eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.encoder = Blip2QFormerEncoder(config,
                                           quant_config=quant_config,
                                           cache_config=cache_config)

    def forward(
        self,
        query_embeds: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor,
    ) -> torch.Tensor:
        """Encode the query embeddings against the encoder states.

        Args:
            query_embeds: Query embeddings; the size of dim 1 is used as the
                query length.
            encoder_hidden_states: States forwarded to the encoder for
                cross-attention.

        Returns:
            The encoder output.
        """
        query_length = query_embeds.shape[1]

        normed_queries = self.dropout(self.layernorm(query_embeds))

        return self.encoder(
            normed_queries,
            encoder_hidden_states=encoder_hidden_states,
            query_length=query_length,
        )
class Blip2ImagePixelInputs(TypedDict):
    """Image input for BLIP-2 supplied as a tensor of pixel values."""
    # Tag identifying this input variant; always "pixel_values".
    type: Literal["pixel_values"]
    data: torch.Tensor
    """Shape: (batch_size, num_channels, height, width)"""
# Alias for the image input types accepted by the model; pixel values are
# the only variant defined in this module.
Blip2ImageInputs = Blip2ImagePixelInputs

# We use this internally as placeholders since there is no image token
# defined on the HuggingFace repo
BLIP2_IMAGE_TOKEN = "<image>"
BLIP2_IMAGE_TOKEN_ID = 50265
def get_blip2_image_feature_size(hf_config: Blip2Config) -> int:
    """Return the number of placeholder tokens used per image.

    This is ``hf_config.num_query_tokens``.
    """
    num_query_tokens = hf_config.num_query_tokens
    return num_query_tokens
def get_max_blip2_image_tokens(ctx: InputContext):
    """Return the maximum number of image tokens for BLIP-2.

    The value is delegated to ``get_max_blip_image_tokens`` on the vision
    config.

    NOTE(review): this is derived from the vision config, while the input
    processor inserts ``num_query_tokens`` placeholders per image; confirm
    that this bound is the intended one.

    Raises:
        NotImplementedError: If the vision config is not a
            ``Blip2VisionConfig``.
    """
    vision_config = ctx.get_hf_config(Blip2Config).vision_config

    if not isinstance(vision_config, Blip2VisionConfig):
        msg = f"Unsupported vision config: {type(vision_config)}"
        raise NotImplementedError(msg)

    return get_max_blip_image_tokens(vision_config)
def dummy_data_for_blip2(ctx: InputContext, seq_len: int):
    """Build dummy sequence data and a dummy image for BLIP-2.

    The token ids are ``BLIP2_IMAGE_TOKEN_ID`` repeated once per image
    feature, followed by zeros up to ``seq_len``.

    Args:
        ctx: Input context providing the ``Blip2Config``.
        seq_len: Target total number of token ids.

    Returns:
        A ``(seq_data, mm_data)`` tuple.

    Raises:
        NotImplementedError: If the vision config is not a
            ``Blip2VisionConfig``.
    """
    hf_config = ctx.get_hf_config(Blip2Config)
    vision_config = hf_config.vision_config

    image_feature_size = get_blip2_image_feature_size(hf_config)
    image_part = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size
    padding_part = [0] * (seq_len - image_feature_size)
    seq_data = SequenceData(image_part + padding_part)

    if not isinstance(vision_config, Blip2VisionConfig):
        msg = f"Unsupported vision config: {type(vision_config)}"
        raise NotImplementedError(msg)

    return seq_data, dummy_image_for_blip(vision_config)
def input_processor_for_blip2(ctx: InputContext, llm_inputs: LLMInputs):
    """Prepend image placeholder tokens to the prompt.

    Inputs without image data are returned unchanged. Otherwise
    ``BLIP2_IMAGE_TOKEN_ID`` is prepended ``num_query_tokens`` times to the
    token ids, and ``BLIP2_IMAGE_TOKEN`` the same number of times to the
    prompt text when one is present.

    Returns:
        Either the original ``llm_inputs`` or a new ``LLMInputs``.
    """
    multi_modal_data = llm_inputs.get("multi_modal_data")
    has_image = multi_modal_data is not None and "image" in multi_modal_data
    if not has_image:
        return llm_inputs

    hf_config = ctx.get_hf_config(Blip2Config)
    image_feature_size = get_blip2_image_feature_size(hf_config)

    # The original model places image tokens at the front
    # https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1514
    new_token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size
    new_token_ids.extend(llm_inputs["prompt_token_ids"])

    prompt = llm_inputs.get("prompt")
    new_prompt = (None if prompt is None else
                  BLIP2_IMAGE_TOKEN * image_feature_size + prompt)

    return LLMInputs(prompt_token_ids=new_token_ids,
                     prompt=new_prompt,
                     multi_modal_data=multi_modal_data)
@MULTIMODAL_REGISTRY.register_image_input_mapper()
@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_blip2_image_tokens)
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_blip2)
@INPUT_REGISTRY.register_input_processor(input_processor_for_blip2)
class Blip2ForConditionalGeneration(nn.Module, SupportsVision):
    """BLIP-2 for generation: vision tower, Q-Former and an OPT language
    model.

    Image pixels are encoded by ``vision_model``, summarised by ``qformer``
    using the learned ``query_tokens``, projected to the language model's
    hidden size and merged into the text embeddings at the positions of
    ``BLIP2_IMAGE_TOKEN_ID``.
    """

    def __init__(self,
                 config: Blip2Config,
                 multimodal_config: MultiModalConfig,
                 cache_config: Optional[CacheConfig] = None,
                 quant_config: Optional[QuantizationConfig] = None) -> None:
        """Build all sub-modules from ``config``.

        Args:
            config: BLIP-2 config providing ``vision_config``,
                ``qformer_config``, ``text_config`` and
                ``num_query_tokens``.
            multimodal_config: Stored on the instance.
            cache_config: Forwarded to the Q-Former and the language model.
            quant_config: Forwarded to the Q-Former and the language model.
        """
        super().__init__()

        self.config = config
        self.multimodal_config = multimodal_config

        # TODO: Optionally initializes this for supporting embeddings.
        self.vision_model = BlipVisionModel(config.vision_config)

        self.query_tokens = nn.Parameter(
            torch.zeros(1, config.num_query_tokens,
                        config.qformer_config.hidden_size))

        self.qformer = Blip2QFormerModel(config.qformer_config,
                                         cache_config=cache_config,
                                         quant_config=quant_config)

        # Maps Q-Former outputs to the language model's hidden size.
        self.language_projection = nn.Linear(
            config.qformer_config.hidden_size,
            config.text_config.hidden_size,
            bias=True,
        )

        self.quant_config = quant_config

        self.language_model = OPTModel(config.text_config, cache_config,
                                       quant_config)

        self.unpadded_vocab_size = config.text_config.vocab_size
        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size)
        self.sampler = Sampler()

    def get_lm_head(self):
        """Return the module used as LM head: the decoder's token
        embedding."""
        return self.language_model.decoder.embed_tokens

    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
        """Check that ``data`` has shape ``(batch, 3, image_size,
        image_size)``.

        Returns:
            ``data`` unchanged.

        Raises:
            ValueError: If the non-batch dimensions differ from the expected
                ones.
        """
        h = w = self.config.vision_config.image_size
        expected_dims = (3, h, w)
        actual_dims = tuple(data.shape[1:])

        if actual_dims != expected_dims:
            expected_expr = ("batch_size", *map(str, expected_dims))
            raise ValueError(
                f"The expected shape of pixel values is {expected_expr}. "
                f"You supplied {tuple(data.shape)}.")

        return data

    def _parse_and_validate_image_input(
            self, **kwargs: object) -> Optional[Blip2ImageInputs]:
        """Extract and validate ``pixel_values`` from the keyword arguments.

        Returns:
            ``None`` when no pixel values were passed, otherwise a
            ``Blip2ImagePixelInputs`` dict.

        Raises:
            ValueError: If ``pixel_values`` is not a ``torch.Tensor`` or has
                an unexpected shape.
        """
        pixel_values = kwargs.pop("pixel_values", None)

        if pixel_values is None:
            return None

        if not isinstance(pixel_values, torch.Tensor):
            raise ValueError("Incorrect type of pixel values. "
                             f"Got type: {type(pixel_values)}")

        return Blip2ImagePixelInputs(
            type="pixel_values",
            data=self._validate_pixel_values(pixel_values),
        )

    def _image_pixels_to_features(self, vision_model: BlipVisionModel,
                                  pixel_values: torch.Tensor) -> torch.Tensor:
        """Run ``vision_model`` on ``pixel_values`` and return its output."""

        # NOTE: we skip the step to select the vision feature layer since
        # this is already done inside the vision tower
        image_features = vision_model(pixel_values)

        return image_features

    def _process_image_pixels(self,
                              inputs: Blip2ImagePixelInputs) -> torch.Tensor:
        """Encode ``inputs["data"]`` with the vision model."""
        assert self.vision_model is not None

        pixel_values = inputs["data"]

        return self._image_pixels_to_features(self.vision_model, pixel_values)

    def _process_image_input(self,
                             image_input: Blip2ImageInputs) -> torch.Tensor:
        """Turn image input into embeddings for the language model.

        The image features are used as encoder states for the Q-Former,
        whose queries are ``query_tokens`` expanded over the batch; the
        Q-Former output is then passed through ``language_projection``.
        """

        assert self.vision_model is not None
        image_features = self._process_image_pixels(image_input)

        # One copy of the learned queries per image in the batch.
        query_tokens = self.query_tokens.expand(image_features.shape[0], -1,
                                                -1)
        query_output = self.qformer(
            query_embeds=query_tokens,
            encoder_hidden_states=image_features,
        )

        return self.language_projection(query_output)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        **kwargs: object,
    ) -> SamplerOutput:
        """Run forward pass for BLIP-2.

        One key thing to understand is the `input_ids` already accounts for the
        positions of the to-be-inserted image embeddings.

        Concretely, consider a text prompt:
        `"Question: What's the content of the image? Answer:"`.

        Tokenizer outputs:
        `[2, 45641, 35, 653, 18, 5, 1383, 9, 5, 2274, 116, 31652, 35]`.

        To reserve space in KV cache, we have to insert placeholder tokens
        before they are inputted to the model, so the input processor prepends
        dummy tokens (denoted as `50265`), resulting in:
        `[50265, ..., 50265, 2, 45641, 35, ..., 31652, 35]`.

        We insert 32 tokens since it corresponds to the number of query
        embeddings outputted by the Q-Former and inputted to the language model.

        This way, the `positions` and `attn_metadata` are consistent
        with the `input_ids`.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            pixel_values: The pixels in each input image.

        Returns:
            The hidden states produced by the language model.
            NOTE(review): the return annotation says ``SamplerOutput``, but
            the value returned here is the language model's output.

        See also:
            :class:`Blip2ImageInputs`
        """
        image_input = self._parse_and_validate_image_input(**kwargs)

        if image_input is not None:
            vision_embeddings = self._process_image_input(image_input)
            inputs_embeds = self.language_model.get_input_embeddings(input_ids)

            inputs_embeds = merge_vision_embeddings(input_ids, inputs_embeds,
                                                    vision_embeddings,
                                                    BLIP2_IMAGE_TOKEN_ID)

            # The merged embeddings are passed instead of the token ids.
            input_ids = None
        else:
            inputs_embeds = None

        hidden_states = self.language_model(input_ids,
                                            positions,
                                            kv_caches,
                                            attn_metadata,
                                            inputs_embeds=inputs_embeds)

        return hidden_states

    def compute_logits(self, hidden_states: torch.Tensor,
                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
        """Compute logits from ``hidden_states`` using the LM head."""
        logits = self.logits_processor(self.get_lm_head(), hidden_states,
                                       sampling_metadata)
        return logits

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        """Sample next tokens from ``logits`` with the model's sampler."""
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        """Load checkpoint weights into this module's parameters.

        Weight names containing ``"lm_head.weight"`` or
        ``"rotary_emb.inv_freq"`` are skipped. The remaining names are
        rewritten with ``_KEYS_TO_MODIFY_MAPPING``. Non-vision weights whose
        name matches an entry in the stacked mapping go through the target
        parameter's ``weight_loader`` with a shard id; everything else uses
        the parameter's own ``weight_loader`` if it has one, or
        ``default_weight_loader``.

        Args:
            weights: Iterable of ``(name, tensor)`` pairs.
        """
        # only doing this for language model part for now.
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        params_dict = dict(self.named_parameters())

        for name, loaded_weight in weights:
            if "lm_head.weight" in name:
                continue
            if "rotary_emb.inv_freq" in name:
                continue
            # Rename checkpoint keys to match this module's parameter names.
            for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
                if key_to_modify in name:
                    name = name.replace(key_to_modify, new_key)
            use_default_weight_loading = False
            if "vision" in name:
                if self.vision_model is not None:
                    # We only do sharding for language model and
                    # not vision model for now.
                    use_default_weight_loading = True
            else:
                for (param_name, weight_name,
                     shard_id) in stacked_params_mapping:
                    if weight_name not in name:
                        continue
                    param = params_dict[name.replace(weight_name, param_name)]
                    weight_loader = param.weight_loader
                    weight_loader(param, loaded_weight, shard_id)
                    break
                else:
                    # Runs only when no stacked mapping matched (no break).
                    use_default_weight_loading = True
            if use_default_weight_loading:
                param = params_dict[name]
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
vllm/model_executor/models/chameleon.py
View file @
e661d594
...
@@ -6,6 +6,7 @@ import torch
...
@@ -6,6 +6,7 @@ import torch
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
PIL
import
Image
from
PIL
import
Image
from
torch
import
nn
from
torch
import
nn
from
transformers
import
ChameleonConfig
,
ChameleonVQVAEConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.config
import
CacheConfig
,
MultiModalConfig
from
vllm.config
import
CacheConfig
,
MultiModalConfig
...
@@ -30,8 +31,6 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
...
@@ -30,8 +31,6 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
(
cached_get_tokenizer
,
from
vllm.multimodal.image
import
(
cached_get_tokenizer
,
repeat_and_pad_image_tokens
)
repeat_and_pad_image_tokens
)
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
,
SequenceData
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
,
SequenceData
from
vllm.transformers_utils.configs
import
(
ChameleonConfig
,
ChameleonVQVAEConfig
)
from
vllm.utils
import
print_warning_once
from
vllm.utils
import
print_warning_once
from
.interfaces
import
SupportsVision
from
.interfaces
import
SupportsVision
...
@@ -126,7 +125,8 @@ def input_processor_for_chameleon(ctx: InputContext, llm_inputs: LLMInputs):
...
@@ -126,7 +125,8 @@ def input_processor_for_chameleon(ctx: InputContext, llm_inputs: LLMInputs):
# Appending sep token for chat mode to follow default processor
# Appending sep token for chat mode to follow default processor
# behavior
# behavior
new_prompt
+=
tokenizer
.
sep_token
if
new_prompt
is
not
None
:
new_prompt
+=
tokenizer
.
sep_token
new_token_ids
+=
[
CHAMELEON_SEP_TOKEN_ID
]
new_token_ids
+=
[
CHAMELEON_SEP_TOKEN_ID
]
# NOTE: Create a defensive copy of the original inputs
# NOTE: Create a defensive copy of the original inputs
...
@@ -998,6 +998,13 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsVision):
...
@@ -998,6 +998,13 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsVision):
# Models trained using ColossalAI may include these tensors in
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
# the checkpoint. Skip them.
continue
continue
# With tie_word_embeddings, we can skip lm_head.weight
# The weight might appear unnecessarily in the files if the model is
# processed with quantization, LoRA, fine-tuning, etc.
if
self
.
config
.
tie_word_embeddings
and
"lm_head.weight"
in
name
:
continue
use_default_weight_loading
=
False
use_default_weight_loading
=
False
if
"vqmodel"
in
name
:
if
"vqmodel"
in
name
:
if
self
.
model
.
vqmodel
is
not
None
:
if
self
.
model
.
vqmodel
is
not
None
:
...
...
vllm/model_executor/models/deepseek_v2.py
View file @
e661d594
...
@@ -29,7 +29,8 @@ from transformers import PretrainedConfig
...
@@ -29,7 +29,8 @@ from transformers import PretrainedConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.config
import
CacheConfig
from
vllm.config
import
CacheConfig
from
vllm.distributed
import
(
get_tensor_model_parallel_world_size
,
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_world_size
,
tensor_model_parallel_all_reduce
)
tensor_model_parallel_all_reduce
)
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
...
@@ -49,6 +50,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
...
@@ -49,6 +50,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
.utils
import
PPMissingLayer
,
is_pp_missing_parameter
,
make_layers
class
DeepseekV2MLP
(
nn
.
Module
):
class
DeepseekV2MLP
(
nn
.
Module
):
...
@@ -59,17 +62,20 @@ class DeepseekV2MLP(nn.Module):
...
@@ -59,17 +62,20 @@ class DeepseekV2MLP(nn.Module):
hidden_act
:
str
,
hidden_act
:
str
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
reduce_results
:
bool
=
True
,
reduce_results
:
bool
=
True
,
prefix
:
str
=
""
,
)
->
None
:
)
->
None
:
super
().
__init__
()
super
().
__init__
()
self
.
gate_up_proj
=
MergedColumnParallelLinear
(
self
.
gate_up_proj
=
MergedColumnParallelLinear
(
hidden_size
,
[
intermediate_size
]
*
2
,
hidden_size
,
[
intermediate_size
]
*
2
,
bias
=
False
,
bias
=
False
,
quant_config
=
quant_config
)
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.gate_up_proj"
)
self
.
down_proj
=
RowParallelLinear
(
intermediate_size
,
self
.
down_proj
=
RowParallelLinear
(
intermediate_size
,
hidden_size
,
hidden_size
,
bias
=
False
,
bias
=
False
,
quant_config
=
quant_config
,
quant_config
=
quant_config
,
reduce_results
=
reduce_results
)
reduce_results
=
reduce_results
,
prefix
=
f
"
{
prefix
}
.down_proj"
)
if
hidden_act
!=
"silu"
:
if
hidden_act
!=
"silu"
:
raise
ValueError
(
f
"Unsupported activation:
{
hidden_act
}
. "
raise
ValueError
(
f
"Unsupported activation:
{
hidden_act
}
. "
"Only silu is supported for now."
)
"Only silu is supported for now."
)
...
@@ -88,6 +94,7 @@ class DeepseekV2MoE(nn.Module):
...
@@ -88,6 +94,7 @@ class DeepseekV2MoE(nn.Module):
self
,
self
,
config
:
PretrainedConfig
,
config
:
PretrainedConfig
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
prefix
:
str
=
""
,
):
):
super
().
__init__
()
super
().
__init__
()
self
.
tp_size
=
get_tensor_model_parallel_world_size
()
self
.
tp_size
=
get_tensor_model_parallel_world_size
()
...
@@ -112,12 +119,14 @@ class DeepseekV2MoE(nn.Module):
...
@@ -112,12 +119,14 @@ class DeepseekV2MoE(nn.Module):
quant_config
=
quant_config
,
quant_config
=
quant_config
,
use_grouped_topk
=
True
,
use_grouped_topk
=
True
,
num_expert_group
=
config
.
n_group
,
num_expert_group
=
config
.
n_group
,
topk_group
=
config
.
topk_group
)
topk_group
=
config
.
topk_group
,
prefix
=
f
"
{
prefix
}
.experts"
)
self
.
gate
=
ReplicatedLinear
(
config
.
hidden_size
,
self
.
gate
=
ReplicatedLinear
(
config
.
hidden_size
,
config
.
n_routed_experts
,
config
.
n_routed_experts
,
bias
=
False
,
bias
=
False
,
quant_config
=
None
)
quant_config
=
None
,
prefix
=
f
"
{
prefix
}
.gate"
)
if
config
.
n_shared_experts
is
not
None
:
if
config
.
n_shared_experts
is
not
None
:
intermediate_size
=
(
config
.
moe_intermediate_size
*
intermediate_size
=
(
config
.
moe_intermediate_size
*
config
.
n_shared_experts
)
config
.
n_shared_experts
)
...
@@ -172,10 +181,9 @@ class DeepseekV2Attention(nn.Module):
...
@@ -172,10 +181,9 @@ class DeepseekV2Attention(nn.Module):
max_position_embeddings
:
int
=
8192
,
max_position_embeddings
:
int
=
8192
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
layer_idx
=
None
,
prefix
:
str
=
""
,
)
->
None
:
)
->
None
:
super
().
__init__
()
super
().
__init__
()
self
.
layer_idx
=
layer_idx
self
.
hidden_size
=
hidden_size
self
.
hidden_size
=
hidden_size
self
.
qk_nope_head_dim
=
qk_nope_head_dim
self
.
qk_nope_head_dim
=
qk_nope_head_dim
self
.
qk_rope_head_dim
=
qk_rope_head_dim
self
.
qk_rope_head_dim
=
qk_rope_head_dim
...
@@ -195,38 +203,44 @@ class DeepseekV2Attention(nn.Module):
...
@@ -195,38 +203,44 @@ class DeepseekV2Attention(nn.Module):
self
.
q_a_proj
=
ReplicatedLinear
(
self
.
hidden_size
,
self
.
q_a_proj
=
ReplicatedLinear
(
self
.
hidden_size
,
self
.
q_lora_rank
,
self
.
q_lora_rank
,
bias
=
False
,
bias
=
False
,
quant_config
=
quant_config
)
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.q_a_proj"
)
self
.
q_a_layernorm
=
RMSNorm
(
self
.
q_lora_rank
,
self
.
q_a_layernorm
=
RMSNorm
(
self
.
q_lora_rank
,
eps
=
config
.
rms_norm_eps
)
eps
=
config
.
rms_norm_eps
)
self
.
q_b_proj
=
ColumnParallelLinear
(
q_lora_rank
,
self
.
q_b_proj
=
ColumnParallelLinear
(
q_lora_rank
,
self
.
num_heads
*
self
.
num_heads
*
self
.
qk_head_dim
,
self
.
qk_head_dim
,
bias
=
False
,
bias
=
False
,
quant_config
=
quant_config
)
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.q_b_proj"
)
else
:
else
:
self
.
q_proj
=
ColumnParallelLinear
(
self
.
hidden_size
,
self
.
q_proj
=
ColumnParallelLinear
(
self
.
hidden_size
,
self
.
num_heads
*
self
.
num_heads
*
self
.
qk_head_dim
,
self
.
qk_head_dim
,
bias
=
False
,
bias
=
False
,
quant_config
=
quant_config
)
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.q_proj"
)
self
.
kv_a_proj_with_mqa
=
ReplicatedLinear
(
self
.
hidden_size
,
self
.
kv_a_proj_with_mqa
=
ReplicatedLinear
(
self
.
kv_lora_rank
+
self
.
hidden_size
,
self
.
qk_rope_head_dim
,
self
.
kv_lora_rank
+
self
.
qk_rope_head_dim
,
bias
=
False
,
bias
=
False
,
quant_config
=
quant_config
)
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.kv_a_proj_with_mqa"
)
self
.
kv_a_layernorm
=
RMSNorm
(
self
.
kv_lora_rank
,
self
.
kv_a_layernorm
=
RMSNorm
(
self
.
kv_lora_rank
,
eps
=
config
.
rms_norm_eps
)
eps
=
config
.
rms_norm_eps
)
self
.
kv_b_proj
=
ColumnParallelLinear
(
self
.
kv_b_proj
=
ColumnParallelLinear
(
self
.
kv_lora_rank
,
self
.
kv_lora_rank
,
self
.
num_heads
*
(
self
.
qk_nope_head_dim
+
self
.
v_head_dim
),
self
.
num_heads
*
(
self
.
qk_nope_head_dim
+
self
.
v_head_dim
),
bias
=
False
,
bias
=
False
,
quant_config
=
quant_config
)
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.kv_b_proj"
)
# O projection.
# O projection.
self
.
o_proj
=
RowParallelLinear
(
self
.
num_heads
*
self
.
v_head_dim
,
self
.
o_proj
=
RowParallelLinear
(
self
.
num_heads
*
self
.
v_head_dim
,
self
.
hidden_size
,
self
.
hidden_size
,
bias
=
False
,
bias
=
False
,
quant_config
=
quant_config
)
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.o_proj"
)
rope_scaling
[
'type'
]
=
'deepseek_yarn'
rope_scaling
[
'type'
]
=
'deepseek_yarn'
self
.
rotary_emb
=
get_rope
(
qk_rope_head_dim
,
self
.
rotary_emb
=
get_rope
(
qk_rope_head_dim
,
rotary_dim
=
qk_rope_head_dim
,
rotary_dim
=
qk_rope_head_dim
,
...
@@ -308,7 +322,7 @@ class DeepseekV2DecoderLayer(nn.Module):
...
@@ -308,7 +322,7 @@ class DeepseekV2DecoderLayer(nn.Module):
def
__init__
(
def
__init__
(
self
,
self
,
config
:
PretrainedConfig
,
config
:
PretrainedConfig
,
layer_idx
:
int
,
prefix
:
str
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
)
->
None
:
)
->
None
:
...
@@ -318,6 +332,9 @@ class DeepseekV2DecoderLayer(nn.Module):
...
@@ -318,6 +332,9 @@ class DeepseekV2DecoderLayer(nn.Module):
rope_scaling
=
getattr
(
config
,
"rope_scaling"
,
None
)
rope_scaling
=
getattr
(
config
,
"rope_scaling"
,
None
)
max_position_embeddings
=
getattr
(
config
,
"max_position_embeddings"
,
max_position_embeddings
=
getattr
(
config
,
"max_position_embeddings"
,
8192
)
8192
)
# DecoderLayers are created with `make_layers` which passes the prefix
# with the layer's index.
layer_idx
=
int
(
prefix
.
split
(
sep
=
'.'
)[
-
1
])
self
.
self_attn
=
DeepseekV2Attention
(
self
.
self_attn
=
DeepseekV2Attention
(
config
=
config
,
config
=
config
,
hidden_size
=
self
.
hidden_size
,
hidden_size
=
self
.
hidden_size
,
...
@@ -333,18 +350,23 @@ class DeepseekV2DecoderLayer(nn.Module):
...
@@ -333,18 +350,23 @@ class DeepseekV2DecoderLayer(nn.Module):
max_position_embeddings
=
max_position_embeddings
,
max_position_embeddings
=
max_position_embeddings
,
cache_config
=
cache_config
,
cache_config
=
cache_config
,
quant_config
=
quant_config
,
quant_config
=
quant_config
,
layer_idx
=
layer_idx
,
prefix
=
f
"
{
prefix
}
.self_attn"
,
)
)
if
(
config
.
n_routed_experts
is
not
None
if
(
config
.
n_routed_experts
is
not
None
and
layer_idx
>=
config
.
first_k_dense_replace
and
layer_idx
>=
config
.
first_k_dense_replace
and
layer_idx
%
config
.
moe_layer_freq
==
0
):
and
layer_idx
%
config
.
moe_layer_freq
==
0
):
self
.
mlp
=
DeepseekV2MoE
(
config
=
config
,
quant_config
=
quant_config
)
self
.
mlp
=
DeepseekV2MoE
(
config
=
config
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.mlp"
,
)
else
:
else
:
self
.
mlp
=
DeepseekV2MLP
(
self
.
mlp
=
DeepseekV2MLP
(
hidden_size
=
config
.
hidden_size
,
hidden_size
=
config
.
hidden_size
,
intermediate_size
=
config
.
intermediate_size
,
intermediate_size
=
config
.
intermediate_size
,
hidden_act
=
config
.
hidden_act
,
hidden_act
=
config
.
hidden_act
,
quant_config
=
quant_config
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.mlp"
,
)
)
self
.
input_layernorm
=
RMSNorm
(
config
.
hidden_size
,
self
.
input_layernorm
=
RMSNorm
(
config
.
hidden_size
,
eps
=
config
.
rms_norm_eps
)
eps
=
config
.
rms_norm_eps
)
...
@@ -389,23 +411,34 @@ class DeepseekV2Model(nn.Module):
...
@@ -389,23 +411,34 @@ class DeepseekV2Model(nn.Module):
config
:
PretrainedConfig
,
config
:
PretrainedConfig
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
prefix
:
str
=
""
,
)
->
None
:
)
->
None
:
super
().
__init__
()
super
().
__init__
()
self
.
padding_idx
=
config
.
pad_token_id
self
.
padding_idx
=
config
.
pad_token_id
self
.
vocab_size
=
config
.
vocab_size
self
.
vocab_size
=
config
.
vocab_size
self
.
embed_tokens
=
VocabParallelEmbedding
(
if
get_pp_group
().
is_first_rank
:
config
.
vocab_size
,
self
.
embed_tokens
=
VocabParallelEmbedding
(
config
.
hidden_size
,
config
.
vocab_size
,
)
config
.
hidden_size
,
self
.
layers
=
nn
.
ModuleList
([
)
DeepseekV2DecoderLayer
(
config
,
else
:
layer_idx
,
self
.
embed_tokens
=
PPMissingLayer
()
cache_config
=
cache_config
,
quant_config
=
quant_config
)
self
.
start_layer
,
self
.
end_layer
,
self
.
layers
=
make_layers
(
for
layer_idx
in
range
(
config
.
num_hidden_layers
)
config
.
num_hidden_layers
,
])
lambda
prefix
:
DeepseekV2DecoderLayer
(
self
.
norm
=
RMSNorm
(
config
.
hidden_size
,
eps
=
config
.
rms_norm_eps
)
config
,
prefix
,
cache_config
=
cache_config
,
quant_config
=
quant_config
,
),
prefix
=
f
"
{
prefix
}
.layers"
)
if
get_pp_group
().
is_last_rank
:
self
.
norm
=
RMSNorm
(
config
.
hidden_size
,
eps
=
config
.
rms_norm_eps
)
else
:
self
.
norm
=
PPMissingLayer
()
def
forward
(
def
forward
(
self
,
self
,
...
@@ -413,14 +446,28 @@ class DeepseekV2Model(nn.Module):
...
@@ -413,14 +446,28 @@ class DeepseekV2Model(nn.Module):
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
kv_caches
:
List
[
torch
.
Tensor
],
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
],
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
hidden_states
=
self
.
embed_tokens
(
input_ids
)
if
get_pp_group
().
is_first_rank
:
residual
=
None
hidden_states
=
self
.
embed_tokens
(
input_ids
)
for
i
in
range
(
len
(
self
.
layers
)):
residual
=
None
else
:
assert
intermediate_tensors
is
not
None
hidden_states
=
intermediate_tensors
[
"hidden_states"
]
residual
=
intermediate_tensors
[
"residual"
]
for
i
in
range
(
self
.
start_layer
,
self
.
end_layer
):
layer
=
self
.
layers
[
i
]
layer
=
self
.
layers
[
i
]
hidden_states
,
residual
=
layer
(
positions
,
hidden_states
,
hidden_states
,
residual
=
layer
(
positions
,
hidden_states
,
kv_caches
[
i
],
attn_metadata
,
kv_caches
[
i
-
self
.
start_layer
],
residual
)
attn_metadata
,
residual
)
if
not
get_pp_group
().
is_last_rank
:
return
IntermediateTensors
({
"hidden_states"
:
hidden_states
,
"residual"
:
residual
})
hidden_states
,
_
=
self
.
norm
(
hidden_states
,
residual
)
hidden_states
,
_
=
self
.
norm
(
hidden_states
,
residual
)
return
hidden_states
return
hidden_states
...
@@ -436,7 +483,10 @@ class DeepseekV2ForCausalLM(nn.Module):
...
@@ -436,7 +483,10 @@ class DeepseekV2ForCausalLM(nn.Module):
super
().
__init__
()
super
().
__init__
()
self
.
config
=
config
self
.
config
=
config
self
.
quant_config
=
quant_config
self
.
quant_config
=
quant_config
self
.
model
=
DeepseekV2Model
(
config
,
cache_config
,
quant_config
)
self
.
model
=
DeepseekV2Model
(
config
,
cache_config
,
quant_config
,
prefix
=
"model"
)
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
self
.
lm_head
=
ParallelLMHead
(
config
.
vocab_size
,
config
.
hidden_size
,
config
.
hidden_size
,
quant_config
=
quant_config
)
quant_config
=
quant_config
)
...
@@ -452,7 +502,7 @@ class DeepseekV2ForCausalLM(nn.Module):
...
@@ -452,7 +502,7 @@ class DeepseekV2ForCausalLM(nn.Module):
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
hidden_states
=
self
.
model
(
input_ids
,
positions
,
kv_caches
,
hidden_states
=
self
.
model
(
input_ids
,
positions
,
kv_caches
,
attn_metadata
)
attn_metadata
,
intermediate_tensors
)
return
hidden_states
return
hidden_states
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
...
@@ -469,6 +519,20 @@ class DeepseekV2ForCausalLM(nn.Module):
...
@@ -469,6 +519,20 @@ class DeepseekV2ForCausalLM(nn.Module):
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
next_tokens
=
self
.
sampler
(
logits
,
sampling_metadata
)
return
next_tokens
return
next_tokens
def
make_empty_intermediate_tensors
(
self
,
batch_size
:
int
,
dtype
:
torch
.
dtype
,
device
:
torch
.
device
)
->
IntermediateTensors
:
return
IntermediateTensors
({
"hidden_states"
:
torch
.
zeros
((
batch_size
,
self
.
config
.
hidden_size
),
dtype
=
dtype
,
device
=
device
),
"residual"
:
torch
.
zeros
((
batch_size
,
self
.
config
.
hidden_size
),
dtype
=
dtype
,
device
=
device
),
})
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]]):
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]]):
stacked_params_mapping
=
[
stacked_params_mapping
=
[
# (param_name, shard_name, shard_id)
# (param_name, shard_name, shard_id)
...
@@ -504,6 +568,10 @@ class DeepseekV2ForCausalLM(nn.Module):
...
@@ -504,6 +568,10 @@ class DeepseekV2ForCausalLM(nn.Module):
# Skip loading extra bias for GPTQ models.
# Skip loading extra bias for GPTQ models.
if
name
.
endswith
(
".bias"
)
and
name
not
in
params_dict
:
if
name
.
endswith
(
".bias"
)
and
name
not
in
params_dict
:
continue
continue
if
is_pp_missing_parameter
(
name
,
self
):
continue
param
=
params_dict
[
name
]
param
=
params_dict
[
name
]
weight_loader
=
param
.
weight_loader
weight_loader
=
param
.
weight_loader
weight_loader
(
param
,
loaded_weight
,
shard_id
)
weight_loader
(
param
,
loaded_weight
,
shard_id
)
...
@@ -514,6 +582,10 @@ class DeepseekV2ForCausalLM(nn.Module):
...
@@ -514,6 +582,10 @@ class DeepseekV2ForCausalLM(nn.Module):
if
weight_name
not
in
name
:
if
weight_name
not
in
name
:
continue
continue
name
=
name
.
replace
(
weight_name
,
param_name
)
name
=
name
.
replace
(
weight_name
,
param_name
)
if
is_pp_missing_parameter
(
name
,
self
):
continue
param
=
params_dict
[
name
]
param
=
params_dict
[
name
]
weight_loader
=
param
.
weight_loader
weight_loader
=
param
.
weight_loader
weight_loader
(
param
,
weight_loader
(
param
,
...
@@ -527,6 +599,9 @@ class DeepseekV2ForCausalLM(nn.Module):
...
@@ -527,6 +599,9 @@ class DeepseekV2ForCausalLM(nn.Module):
if
name
.
endswith
(
".bias"
)
and
name
not
in
params_dict
:
if
name
.
endswith
(
".bias"
)
and
name
not
in
params_dict
:
continue
continue
if
is_pp_missing_parameter
(
name
,
self
):
continue
param
=
params_dict
[
name
]
param
=
params_dict
[
name
]
weight_loader
=
getattr
(
param
,
"weight_loader"
,
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
default_weight_loader
)
...
...
vllm/model_executor/models/fuyu.py
View file @
e661d594
...
@@ -169,7 +169,7 @@ def input_processor_for_fuyu(ctx: InputContext, llm_inputs: LLMInputs):
...
@@ -169,7 +169,7 @@ def input_processor_for_fuyu(ctx: InputContext, llm_inputs: LLMInputs):
raise
TypeError
(
f
"Invalid image type:
{
type
(
image_data
)
}
"
)
raise
TypeError
(
f
"Invalid image type:
{
type
(
image_data
)
}
"
)
# process prompts
# process prompts
prompt
=
llm_inputs
[
"prompt"
]
prompt
=
llm_inputs
.
get
(
"prompt"
)
prompt_token_ids
=
llm_inputs
[
"prompt_token_ids"
]
prompt_token_ids
=
llm_inputs
[
"prompt_token_ids"
]
tokenizer
=
cached_get_tokenizer
(
model_config
.
model
)
tokenizer
=
cached_get_tokenizer
(
model_config
.
model
)
# dim0 is batch_size, dim1 is subseq_size which will always be 1
# dim0 is batch_size, dim1 is subseq_size which will always be 1
...
...
vllm/model_executor/models/gemma.py
View file @
e661d594
...
@@ -404,6 +404,6 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA):
...
@@ -404,6 +404,6 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA):
loaded_params
.
add
(
name
)
loaded_params
.
add
(
name
)
unloaded_params
=
params_dict
.
keys
()
-
loaded_params
unloaded_params
=
params_dict
.
keys
()
-
loaded_params
if
unloaded_params
:
if
unloaded_params
:
raise
RuntimeError
(
logger
.
warning
(
"Some weights are not initialized from checkpoints:
"
"Some weights are not initialized from checkpoints:
%s"
,
f
"
{
unloaded_params
}
"
)
unloaded_params
)
vllm/model_executor/models/gemma2.py
View file @
e661d594
...
@@ -23,6 +23,7 @@ from transformers import Gemma2Config
...
@@ -23,6 +23,7 @@ from transformers import Gemma2Config
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.config
import
CacheConfig
,
LoRAConfig
from
vllm.config
import
CacheConfig
,
LoRAConfig
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.activation
import
GeluAndMul
from
vllm.model_executor.layers.activation
import
GeluAndMul
from
vllm.model_executor.layers.layernorm
import
GemmaRMSNorm
from
vllm.model_executor.layers.layernorm
import
GemmaRMSNorm
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
...
@@ -41,6 +42,8 @@ from vllm.sequence import IntermediateTensors, SamplerOutput
...
@@ -41,6 +42,8 @@ from vllm.sequence import IntermediateTensors, SamplerOutput
from
.interfaces
import
SupportsLoRA
from
.interfaces
import
SupportsLoRA
logger
=
init_logger
(
__name__
)
class
Gemma2MLP
(
nn
.
Module
):
class
Gemma2MLP
(
nn
.
Module
):
...
@@ -87,7 +90,8 @@ class Gemma2Attention(nn.Module):
...
@@ -87,7 +90,8 @@ class Gemma2Attention(nn.Module):
max_position_embeddings
:
int
,
max_position_embeddings
:
int
,
rope_theta
:
float
,
rope_theta
:
float
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
cache_config
:
Optional
[
CacheConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
)
->
None
:
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
attn_logits_soft_cap
:
Optional
[
float
]
=
None
)
->
None
:
super
().
__init__
()
super
().
__init__
()
self
.
layer_idx
=
layer_idx
self
.
layer_idx
=
layer_idx
self
.
config
=
config
self
.
config
=
config
...
@@ -147,7 +151,8 @@ class Gemma2Attention(nn.Module):
...
@@ -147,7 +151,8 @@ class Gemma2Attention(nn.Module):
self
.
scaling
,
self
.
scaling
,
num_kv_heads
=
self
.
num_kv_heads
,
num_kv_heads
=
self
.
num_kv_heads
,
cache_config
=
cache_config
,
cache_config
=
cache_config
,
quant_config
=
quant_config
)
quant_config
=
quant_config
,
logits_soft_cap
=
attn_logits_soft_cap
)
def
forward
(
def
forward
(
self
,
self
,
...
@@ -186,6 +191,7 @@ class Gemma2DecoderLayer(nn.Module):
...
@@ -186,6 +191,7 @@ class Gemma2DecoderLayer(nn.Module):
rope_theta
=
config
.
rope_theta
,
rope_theta
=
config
.
rope_theta
,
cache_config
=
cache_config
,
cache_config
=
cache_config
,
quant_config
=
quant_config
,
quant_config
=
quant_config
,
attn_logits_soft_cap
=
config
.
attn_logit_softcapping
,
)
)
self
.
hidden_size
=
config
.
hidden_size
self
.
hidden_size
=
config
.
hidden_size
self
.
mlp
=
Gemma2MLP
(
self
.
mlp
=
Gemma2MLP
(
...
@@ -390,6 +396,6 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA):
...
@@ -390,6 +396,6 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA):
unloaded_params
=
params_dict
.
keys
()
-
loaded_params
unloaded_params
=
params_dict
.
keys
()
-
loaded_params
if
unloaded_params
:
if
unloaded_params
:
raise
RuntimeError
(
logger
.
warning
(
"Some weights are not initialized from checkpoints:
"
"Some weights are not initialized from checkpoints:
%s"
,
f
"
{
unloaded_params
}
"
)
unloaded_params
)
vllm/model_executor/models/idefics2_vision_model.py
0 → 100644
View file @
e661d594
# coding=utf-8
# adapted from https://github.com/huggingface/transformers/blob/v4.43.2/src/transformers/models/idefics2/modeling_idefics2.py
# Copyright 2024 The vLLM team.
# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Idefics2 model."""
from
typing
import
Optional
import
torch
from
torch
import
nn
from
transformers.models.idefics2.configuration_idefics2
import
(
Idefics2Config
,
Idefics2VisionConfig
)
from
xformers
import
ops
as
xops
from
vllm.distributed
import
divide
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
QKVParallelLinear
,
RowParallelLinear
)
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
class Idefics2VisionEmbeddings(nn.Module):
    """
    Patch + position embeddings for images of variable resolution.

    This is a modified version of
    `siglip.modeling_siglip.SiglipVisionEmbeddings`. The modifications are
    adapted from [Patch n' Pack: NaViT, a Vision Transformer for any Aspect
    Ratio and Resolution](https://arxiv.org/abs/2307.06304), which allows
    treating images in their native aspect ratio without resizing them to a
    common fixed size. In particular, we start from the original pre-trained
    SigLIP model (which uses fixed-size square images) and adapt it by
    training on images of variable resolutions.
    """

    def __init__(self, config: Idefics2VisionConfig):
        super().__init__()
        embed_dim = config.hidden_size
        patch_size = config.patch_size

        self.embed_dim = embed_dim
        self.image_size = config.image_size
        self.patch_size = patch_size
        # Non-overlapping patches: kernel and stride both equal patch_size.
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
            padding="valid",
        )

        per_side = self.image_size // patch_size
        self.num_patches_per_side = per_side
        self.num_patches = per_side**2
        self.num_positions = self.num_patches
        self.position_embedding = nn.Embedding(self.num_positions, embed_dim)

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        patch_attention_mask: torch.BoolTensor,
    ) -> torch.Tensor:
        """Embed patches and add bucketised position embeddings.

        Args:
            pixel_values: 4-D tensor unpacked as (batch, _, height, width).
            patch_attention_mask: one boolean mask per batch element; its
                first column / first row are summed to get the number of
                valid patches along each axis (presumably shaped
                (batch, patches_h, patches_w) -- confirm with the caller).

        Returns:
            The flattened patch embeddings plus position embeddings.
        """
        batch_size, _, max_im_h, max_im_w = pixel_values.shape
        embeddings = self.patch_embedding(pixel_values).flatten(2).transpose(
            1, 2)

        max_nb_patches_h = max_im_h // self.patch_size
        max_nb_patches_w = max_im_w // self.patch_size
        grid = self.num_patches_per_side
        boundaries = torch.arange(1 / grid, 1.0, 1 / grid)
        position_ids = torch.full(
            size=(batch_size, max_nb_patches_h * max_nb_patches_w),
            fill_value=0)

        def bucket_coords(nb_patches):
            # Evenly spaced fractional coordinates in [0, 1), mapped onto
            # the position grid the embedding table was sized for.
            fractional = torch.arange(0, 1 - 1e-6, 1 / nb_patches)
            return torch.bucketize(fractional, boundaries, right=True)

        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
            rows = bucket_coords(p_attn_mask[:, 0].sum())
            cols = bucket_coords(p_attn_mask[0].sum())
            pos_ids = (rows[:, None] * grid + cols).flatten()
            # Only the valid (unmasked) patch slots receive a position id;
            # the rest keep the fill value 0.
            sample_ids = position_ids[batch_idx]
            sample_ids[p_attn_mask.view(-1).cpu()] = pos_ids

        position_ids = position_ids.to(self.position_embedding.weight.device)
        return embeddings + self.position_embedding(position_ids)
class Idefics2VisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        config: Idefics2Config,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.embed_dim % self.num_heads != 0:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"  # noqa: E501
                f" {self.num_heads}).")
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        # Fused query/key/value projection followed by the output projection.
        self.qkv_proj = QKVParallelLinear(
            self.embed_dim,
            self.head_dim,
            self.num_heads,
            quant_config=quant_config,
        )
        self.out_proj = RowParallelLinear(
            self.embed_dim,
            self.embed_dim,
            bias=True,
            quant_config=quant_config,
        )

        # Heads are split evenly across the tensor-parallel world size.
        self.tp_size = get_tensor_model_parallel_world_size()
        self.num_heads_per_partition = divide(self.num_heads, self.tp_size)
        self.is_causal = False

    def forward(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """Apply attention to ``hidden_states`` (unpacked as 3 dimensions:
        batch, sequence length, feature) and return the projected output."""
        batch_size, q_len, _ = hidden_states.size()

        # batch_size, q_len, 3 * num_heads_per_partition * head_dim
        qkv, _ = self.qkv_proj(hidden_states)
        per_head_shape = (batch_size, q_len, self.num_heads_per_partition,
                          self.head_dim)
        query_states, key_states, value_states = (
            part.view(*per_head_shape) for part in qkv.chunk(3, dim=-1))

        # see: https://facebookresearch.github.io/xformers/components/ops.html
        attended = xops.memory_efficient_attention_forward(
            query_states,
            key_states,
            value_states,
            p=self.dropout,
            scale=self.scale,
        )
        attn_output, _ = self.out_proj(attended.view(batch_size, q_len, -1))
        return attn_output
class Idefics2VisionMLP(nn.Module):
    """Two-layer feed-forward block: fc1 -> activation -> fc2."""

    def __init__(
        self,
        config: Idefics2Config,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        """Create both projections.

        Args:
            config: Supplies `hidden_size`, `intermediate_size` and the
                activation name `hidden_act`.
            quant_config: Forwarded unchanged to both linear layers.
        """
        super().__init__()
        self.config = config
        self.activation_fn = get_act_fn(config.hidden_act)

        hidden, inner = config.hidden_size, config.intermediate_size
        self.fc1 = ColumnParallelLinear(
            hidden,
            inner,
            bias=True,
            quant_config=quant_config,
        )
        self.fc2 = RowParallelLinear(
            inner,
            hidden,
            bias=True,
            quant_config=quant_config,
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply fc1, the activation and fc2 in sequence."""
        # Both linear layers return a pair; only the first element is used.
        expanded, _ = self.fc1(hidden_states)
        activated = self.activation_fn(expanded)
        projected, _ = self.fc2(activated)
        return projected
class Idefics2EncoderLayer(nn.Module):
    """One encoder block: norm -> attention -> add, norm -> MLP -> add."""

    def __init__(self, config: Idefics2Config):
        super().__init__()
        width = config.hidden_size
        self.embed_dim = width
        self.self_attn = Idefics2VisionAttention(config)
        self.layer_norm1 = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.mlp = Idefics2VisionMLP(config)
        self.layer_norm2 = nn.LayerNorm(width, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
        """
        # Attention sub-block with a residual connection around it.
        attended = self.self_attn(self.layer_norm1(hidden_states))
        hidden_states = hidden_states + attended

        # Feed-forward sub-block with a residual connection around it.
        transformed = self.mlp(self.layer_norm2(hidden_states))
        return hidden_states + transformed
class Idefics2Encoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention
    layers. Each layer is a
    [`Idefics2EncoderLayer`].

    Args:
        config: Idefics2Config
    """

    def __init__(self, config: Idefics2Config):
        super().__init__()
        self.config = config
        blocks = [
            Idefics2EncoderLayer(config)
            for _ in range(config.num_hidden_layers)
        ]
        self.layers = nn.ModuleList(blocks)

    def forward(
        self,
        inputs_embeds: torch.Tensor,
    ) -> torch.Tensor:
        r"""
        Args:
            inputs_embeds (torch.Tensor):
                Embedded input representation that is passed through every
                layer in order.

        Returns:
            The output of the last layer.
        """
        hidden_states = inputs_embeds
        for block in self.layers:
            hidden_states = block(hidden_states)
        return hidden_states
class Idefics2VisionTransformer(nn.Module):
    """Vision tower: patch embeddings -> encoder -> final LayerNorm."""

    def __init__(self, config: Idefics2VisionConfig):
        super().__init__()
        embed_dim = config.hidden_size
        self.config = config
        self.embeddings = Idefics2VisionEmbeddings(config)
        self.encoder = Idefics2Encoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim,
                                           eps=config.layer_norm_eps)

    def get_input_embeddings(self):
        """Return the patch/position embedding module."""
        return self.embeddings

    def forward(
        self,
        pixel_values,
        patch_attention_mask: Optional[torch.BoolTensor] = None,
    ) -> torch.Tensor:
        """Encode a batch of images.

        Args:
            pixel_values: 4-D image batch passed to the embedding module.
            patch_attention_mask: Optional boolean mask with one entry per
                patch marking which patches are real. When omitted, every
                patch of every image is treated as valid.

        Returns:
            The encoder output after the final LayerNorm.
        """
        if patch_attention_mask is None:
            # The embedding module iterates over the mask, so `None` cannot
            # be forwarded as-is; build an all-valid mask with one entry per
            # patch of the (padded) input instead.
            patch_size = self.embeddings.patch_size
            patch_attention_mask = torch.ones(
                (
                    pixel_values.size(0),
                    pixel_values.size(2) // patch_size,
                    pixel_values.size(3) // patch_size,
                ),
                dtype=torch.bool,
                device=pixel_values.device,
            )

        hidden_states = self.embeddings(
            pixel_values=pixel_values,
            patch_attention_mask=patch_attention_mask)
        encoder_outputs = self.encoder(hidden_states)
        last_hidden_state = self.post_layernorm(encoder_outputs)
        return last_hidden_state
vllm/model_executor/models/intern_vit.py
0 → 100644
View file @
e661d594
# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from
typing
import
Iterable
,
Optional
,
Tuple
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
transformers
import
PretrainedConfig
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
RowParallelLinear
)
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
# Maps the `norm_type` string read from the vision config to the
# normalization layer class instantiated by InternVisionEncoderLayer.
NORM2FN = {
    'rms_norm': RMSNorm,
    'layer_norm': nn.LayerNorm,
}
class InternVisionEmbeddings(nn.Module):
    """Patch + class-token embeddings with resizable position embeddings."""

    def __init__(self, config: PretrainedConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))

        # Non-overlapping patches: kernel size and stride are both the
        # patch size.
        self.patch_embedding = nn.Conv2d(in_channels=3,
                                         out_channels=self.embed_dim,
                                         kernel_size=self.patch_size,
                                         stride=self.patch_size)

        grid_side = self.image_size // self.patch_size
        self.num_patches = grid_side**2
        # One extra position for the class token.
        self.num_positions = self.num_patches + 1

        self.position_embedding = nn.Parameter(
            torch.randn(1, self.num_positions, self.embed_dim))

    def _get_pos_embed(self, pos_embed, H, W):
        """Bicubically resize the patch position table to an H x W grid.

        `pos_embed` holds the patch positions only (no class token). The
        interpolation runs in float32 and the result is cast back to the
        dtype of the input.
        """
        source_dtype = pos_embed.dtype
        grid_side = self.image_size // self.patch_size
        as_grid = pos_embed.float().reshape(1, grid_side, grid_side,
                                            -1).permute(0, 3, 1, 2)
        resized = F.interpolate(as_grid,
                                size=(H, W),
                                mode='bicubic',
                                align_corners=False)
        return resized.reshape(1, -1, H * W).permute(0, 2,
                                                     1).to(source_dtype)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        """Return class token + patch tokens with position embeddings added."""
        target_dtype = self.patch_embedding.weight.dtype
        feature_map = self.patch_embedding(pixel_values.to(target_dtype))
        batch_size, _, height, width = feature_map.shape

        patch_tokens = feature_map.flatten(2).transpose(1, 2)
        cls_token = self.class_embedding.expand(batch_size, 1,
                                                -1).to(target_dtype)
        tokens = torch.cat([cls_token, patch_tokens], dim=1)

        # The class-token position is used as-is; the patch positions are
        # resized to the grid actually produced for this input.
        cls_position = self.position_embedding[:, :1, :]
        patch_positions = self._get_pos_embed(
            self.position_embedding[:, 1:, :], height, width)
        positions = torch.cat([cls_position, patch_positions], dim=1)

        return tokens + positions.to(target_dtype)
class
InternAttention
(
nn
.
Module
):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def
__init__
(
self
,
config
:
PretrainedConfig
):
super
().
__init__
()
self
.
config
=
config
self
.
embed_dim
=
config
.
hidden_size
self
.
num_heads
=
config
.
num_attention_heads
self
.
head_dim
=
self
.
embed_dim
//
self
.
num_heads
if
self
.
head_dim
*
self
.
num_heads
!=
self
.
embed_dim
:
raise
ValueError
(
f
'embed_dim must be divisible by num_heads '
f
'(got `embed_dim`:
{
self
.
embed_dim
}
and `num_heads`:'
f
'
{
self
.
num_heads
}
).'
)
self
.
scale
=
self
.
head_dim
**-
0.5
self
.
qkv
=
nn
.
Linear
(
self
.
embed_dim
,
3
*
self
.
embed_dim
,
bias
=
config
.
qkv_bias
)
self
.
qk_normalization
=
config
.
qk_normalization
if
self
.
qk_normalization
:
self
.
q_norm
=
RMSNorm
(
self
.
embed_dim
,
eps
=
config
.
layer_norm_eps
)
self
.
k_norm
=
RMSNorm
(
self
.
embed_dim
,
eps
=
config
.
layer_norm_eps
)
self
.
proj
=
nn
.
Linear
(
self
.
embed_dim
,
self
.
embed_dim
)
def
forward
(
self
,
x
):
B
,
N
,
C
=
x
.
shape
qkv
=
self
.
qkv
(
x
).
reshape
(
B
,
N
,
3
,
self
.
num_heads
,
C
//
self
.
num_heads
).
permute
(
2
,
0
,
3
,
1
,
4
)
q
,
k
,
v
=
qkv
.
unbind
(
0
)
if
self
.
qk_normalization
:
B_
,
H_
,
N_
,
D_
=
q
.
shape
q
=
self
.
q_norm
.
forward_native
(
q
.
transpose
(
1
,
2
).
flatten
(
-
2
,
-
1
)).
view
(
B_
,
N_
,
H_
,
D_
).
transpose
(
1
,
2
)
k
=
self
.
k_norm
.
forward_native
(
k
.
transpose
(
1
,
2
).
flatten
(
-
2
,
-
1
)).
view
(
B_
,
N_
,
H_
,
D_
).
transpose
(
1
,
2
)
x
=
F
.
scaled_dot_product_attention
(
q
,
k
,
v
,
scale
=
self
.
scale
)
x
=
x
.
transpose
(
1
,
2
).
reshape
(
B
,
N
,
C
)
x
=
self
.
proj
(
x
)
return
x
class InternMLP(nn.Module):
    """Two-layer feed-forward block: fc1 -> activation -> fc2."""

    def __init__(self,
                 config: PretrainedConfig,
                 quant_config: Optional[QuantizationConfig] = None):
        """Create both projections.

        Args:
            config: Supplies `hidden_size`, `intermediate_size` and the
                activation name `hidden_act`.
            quant_config: Forwarded unchanged to both linear layers.
        """
        super().__init__()
        self.config = config
        self.activation_fn = get_act_fn(config.hidden_act)

        hidden, inner = config.hidden_size, config.intermediate_size
        self.fc1 = ColumnParallelLinear(hidden,
                                        inner,
                                        bias=True,
                                        quant_config=quant_config)
        self.fc2 = RowParallelLinear(inner,
                                     hidden,
                                     bias=True,
                                     quant_config=quant_config)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply fc1, the activation and fc2 in sequence."""
        # Both linear layers return a pair; only the first element is used.
        expanded, _ = self.fc1(hidden_states)
        activated = self.activation_fn(expanded)
        projected, _ = self.fc2(activated)
        return projected
class InternVisionEncoderLayer(nn.Module):
    """Pre-norm encoder block whose two residual branches are each scaled
    by a learned per-channel vector (`ls1`, `ls2`)."""

    def __init__(self,
                 config: PretrainedConfig,
                 quant_config: Optional[QuantizationConfig] = None):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.norm_type = config.norm_type

        self.attn = InternAttention(config)
        self.mlp = InternMLP(config, quant_config=quant_config)

        # The normalization class is selected by name from NORM2FN.
        norm_cls = NORM2FN[self.norm_type]
        self.norm1 = norm_cls(self.embed_dim, eps=config.layer_norm_eps)
        self.norm2 = norm_cls(self.embed_dim, eps=config.layer_norm_eps)

        # Branch scales, initialised to `initializer_factor` everywhere.
        self.ls1 = nn.Parameter(config.initializer_factor *
                                torch.ones(self.embed_dim))
        self.ls2 = nn.Parameter(config.initializer_factor *
                                torch.ones(self.embed_dim))

    def forward(
        self,
        hidden_states: torch.Tensor,
    ):
        """Apply the attention branch, then the MLP branch, each added back
        onto the running hidden states after scaling."""
        attn_branch = self.attn(self.norm1(hidden_states)) * self.ls1
        hidden_states = hidden_states + attn_branch

        mlp_branch = self.mlp(self.norm2(hidden_states)) * self.ls2
        hidden_states = hidden_states + mlp_branch

        return hidden_states
class InternVisionEncoder(nn.Module):
    """Stack of `InternVisionEncoderLayer` modules applied in order."""

    def __init__(self,
                 config: PretrainedConfig,
                 quant_config: Optional[QuantizationConfig] = None,
                 num_hidden_layers_override: Optional[int] = None):
        """
        Args:
            config: Vision config; `num_hidden_layers` gives the default
                depth.
            quant_config: Forwarded to every layer.
            num_hidden_layers_override: When not None, the number of layers
                to build instead of `config.num_hidden_layers`.
        """
        super().__init__()
        self.config = config

        depth = (config.num_hidden_layers if num_hidden_layers_override is None
                 else num_hidden_layers_override)

        blocks = [
            InternVisionEncoderLayer(config=config, quant_config=quant_config)
            for _ in range(depth)
        ]
        self.layers = nn.ModuleList(blocks)

    def forward(self, inputs_embeds: torch.Tensor):
        """Pass `inputs_embeds` through every layer and return the result."""
        hidden_states = inputs_embeds
        for block in self.layers:
            hidden_states = block(hidden_states)
        return hidden_states
class InternVisionModel(nn.Module):
    """InternViT vision tower: embeddings followed by the encoder stack."""

    def __init__(self,
                 config: PretrainedConfig,
                 quant_config: Optional[QuantizationConfig] = None,
                 num_hidden_layers_override: Optional[int] = None):
        """
        Args:
            config: Vision config handed to the embeddings and the encoder.
            quant_config: Forwarded to the encoder.
            num_hidden_layers_override: When not None, number of encoder
                layers to build instead of `config.num_hidden_layers`.
        """
        super().__init__()
        self.config = config

        self.embeddings = InternVisionEmbeddings(config)
        self.encoder = InternVisionEncoder(
            config=config,
            quant_config=quant_config,
            num_hidden_layers_override=num_hidden_layers_override)

    def resize_pos_embeddings(self, old_size, new_size, patch_size):
        """Replace the position table with one resized for `new_size`.

        The class-token position is kept unchanged; the patch positions are
        laid out on an (old_size // patch_size)-sided square grid and
        bicubically interpolated to a (new_size // patch_size)-sided one.
        """
        pos_emb = self.embeddings.position_embedding
        embed_dim = pos_emb.shape[-1]
        cls_emb = pos_emb[:, :1, :]
        pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size,
                                            old_size // patch_size,
                                            -1).permute(0, 3, 1, 2)
        # Interpolate in float32, then cast back to the stored dtype.
        pos_emb = F.interpolate(pos_emb.float(),
                                size=new_size // patch_size,
                                mode='bicubic',
                                align_corners=False)
        pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim,
                                                    -1).permute(0, 2, 1)
        pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
        self.embeddings.position_embedding = nn.Parameter(pos_emb)
        self.embeddings.image_size = new_size

    def get_input_embeddings(self):
        """Return the embedding module."""
        return self.embeddings

    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_embeds: Optional[torch.Tensor] = None,
    ) -> torch.FloatTensor:
        """Encode images, or already-embedded inputs.

        Args:
            pixel_values: 4-D image batch; embedded before encoding.
            pixel_embeds: Pre-computed embeddings; when given they are fed
                to the encoder directly and `pixel_values` is ignored.

        Raises:
            ValueError: If both inputs are None, or `pixel_values` is not
                4-D.
        """
        if pixel_values is None and pixel_embeds is None:
            raise ValueError(
                'You have to specify pixel_values or pixel_embeds')

        if pixel_embeds is not None:
            hidden_states = pixel_embeds
        elif pixel_values is not None:
            if pixel_values.ndim == 4:
                hidden_states = self.embeddings(pixel_values)
            else:
                raise ValueError(
                    f'wrong pixel_values size: {pixel_values.shape}')
        encoder_outputs = self.encoder(inputs_embeds=hidden_states)

        return encoder_outputs

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        """Copy checkpoint tensors into the matching parameters.

        Weights that belong to encoder layers which were not built (because
        `num_hidden_layers_override` truncated the stack) are skipped; any
        other unknown name still raises `KeyError`.
        """
        params_dict = dict(self.named_parameters())
        num_layers = len(self.encoder.layers)
        layer_prefix = "encoder.layers."
        for name, loaded_weight in weights:
            if name.startswith(layer_prefix):
                # Name layout: encoder.layers.<index>.<rest>
                layer_idx = int(name.split(".")[2])
                if layer_idx >= num_layers:
                    continue
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)
vllm/model_executor/models/internlm2.py
View file @
e661d594
...
@@ -219,14 +219,22 @@ class InternLM2Model(nn.Module):
...
@@ -219,14 +219,22 @@ class InternLM2Model(nn.Module):
])
])
self
.
norm
=
RMSNorm
(
config
.
hidden_size
,
eps
=
config
.
rms_norm_eps
)
self
.
norm
=
RMSNorm
(
config
.
hidden_size
,
eps
=
config
.
rms_norm_eps
)
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
self
.
tok_embeddings
(
input_ids
)
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
kv_caches
:
List
[
torch
.
Tensor
],
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
IntermediateTensors
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
hidden_states
=
self
.
tok_embeddings
(
input_ids
)
if
inputs_embeds
is
not
None
:
hidden_states
=
inputs_embeds
else
:
hidden_states
=
self
.
tok_embeddings
(
input_ids
)
residual
=
None
residual
=
None
for
i
in
range
(
len
(
self
.
layers
)):
for
i
in
range
(
len
(
self
.
layers
)):
layer
=
self
.
layers
[
i
]
layer
=
self
.
layers
[
i
]
...
...
vllm/model_executor/models/internvl.py
0 → 100644
View file @
e661d594
# adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_internvl_chat.py
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import
itertools
from
typing
import
Iterable
,
List
,
Literal
,
Optional
,
Tuple
,
TypedDict
,
Union
import
torch
import
torch.nn
as
nn
import
torchvision.transforms
as
T
from
PIL
import
Image
from
transformers
import
PretrainedConfig
from
vllm.attention
import
AttentionMetadata
from
vllm.config
import
CacheConfig
,
MultiModalConfig
from
vllm.inputs
import
INPUT_REGISTRY
,
InputContext
,
LLMInputs
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models
import
ModelRegistry
from
vllm.model_executor.models.intern_vit
import
InternVisionModel
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.base
import
MultiModalInputs
from
vllm.multimodal.image
import
cached_get_tokenizer
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
.clip
import
(
dummy_image_for_clip
,
dummy_seq_data_for_clip
,
get_clip_num_patches
)
from
.interfaces
import
SupportsVision
from
.utils
import
merge_vision_embeddings
# Marker strings used by input_processor_for_internvl to build the image
# section of the prompt: IMG_START, a run of IMG_CONTEXT, then IMG_END.
IMG_START = '<img>'
IMG_END = '</img>'
IMG_CONTEXT = '<IMG_CONTEXT>'

# Per-channel mean / std handed to T.Normalize in build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

# Width / height overrides of the dummy image created by
# dummy_data_for_internvl.
MAX_IMAGE_FEATURE_SIZE_WIDTH = 3000
MAX_IMAGE_FEATURE_SIZE_HEIGHT = 500
class InternVLImagePixelInputs(TypedDict):
    """Validated image input handed from input parsing to `forward`."""

    # Discriminator tag; always the literal "pixel_values".
    type: Literal["pixel_values"]

    # Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
    #
    # Note that `num_patches` may be different for each batch, in which case
    # the data is passed as a list instead of a batched tensor.
    data: Union[torch.Tensor, List[torch.Tensor]]
# copied from https://huggingface.co/OpenGVLab/InternVL2-1B
def build_transform(input_size):
    """Build the per-tile preprocessing pipeline.

    The pipeline converts the image to RGB if needed, resizes it to
    `input_size` x `input_size` with bicubic interpolation, converts it to a
    tensor and normalizes it with IMAGENET_MEAN / IMAGENET_STD.
    """

    def _ensure_rgb(img):
        # Leave images that are already RGB untouched.
        if img.mode != 'RGB':
            return img.convert('RGB')
        return img

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size),
                 interpolation=T.InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
# copied from https://huggingface.co/OpenGVLab/InternVL2-1B
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height,
                              image_size):
    """Pick the candidate grid whose aspect ratio is closest to the image's.

    Args:
        aspect_ratio: Width / height of the source image.
        target_ratios: Iterable of `(columns, rows)` candidates, visited in
            order.
        width: Source image width.
        height: Source image height.
        image_size: Side length of one square tile.

    Returns:
        The chosen candidate, or `(1, 1)` when `target_ratios` is empty.
        On an exact tie the later candidate replaces the current choice only
        if the image area exceeds half the area that candidate's tiles would
        cover.
    """
    image_area = width * height
    half_tile_area = 0.5 * image_size * image_size

    chosen = (1, 1)
    smallest_gap = float('inf')
    for candidate in target_ratios:
        columns, rows = candidate[0], candidate[1]
        gap = abs(aspect_ratio - columns / rows)
        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
        elif gap == smallest_gap and image_area > half_tile_area * columns * rows:
            chosen = candidate
    return chosen
def calculate_num_blocks(orig_width: int,
                         orig_height: int,
                         min_num=1,
                         max_num=6,
                         image_size=448):
    """Choose a tile grid for an image and report the resulting sizes.

    Args:
        orig_width: Source image width.
        orig_height: Source image height.
        min_num: Minimum number of tiles in a candidate grid.
        max_num: Maximum number of tiles in a candidate grid.
        image_size: Side length of one square tile.

    Returns:
        `(blocks, target_width, target_height)`: the number of tiles in the
        chosen grid and the size the image has to be resized to so that it
        splits exactly into those tiles.
    """
    # Aspect ratio of the image as given.
    source_ratio = orig_width / orig_height

    # Every (columns, rows) grid whose tile count lies in [min_num, max_num].
    candidates = {(i, j)
                  for n in range(min_num, max_num + 1)
                  for i in range(1, n + 1)
                  for j in range(1, n + 1)
                  if min_num <= i * j <= max_num}
    # Visit grids with fewer tiles first.
    ordered = sorted(candidates, key=lambda grid: grid[0] * grid[1])

    columns, rows = find_closest_aspect_ratio(source_ratio, ordered,
                                              orig_width, orig_height,
                                              image_size)

    return columns * rows, image_size * columns, image_size * rows
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def dynamic_preprocess(image,
                       min_num=1,
                       max_num=6,
                       image_size=448,
                       use_thumbnail=False):
    """Resize an image to a tile grid and cut it into square tiles.

    Args:
        image: Image exposing `size`, `resize` and `crop`.
        min_num: Minimum number of tiles.
        max_num: Maximum number of tiles.
        image_size: Side length of each tile.
        use_thumbnail: When True and more than one tile was produced, a
            copy of the whole image resized to a single tile is appended.

    Returns:
        List of tiles in row-major order, optionally followed by the
        thumbnail.
    """
    orig_width, orig_height = image.size

    blocks, target_width, target_height = calculate_num_blocks(
        orig_width, orig_height, min_num, max_num, image_size)

    resized_img = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size

    processed_images = []
    for index in range(blocks):
        row, column = divmod(index, tiles_per_row)
        left = column * image_size
        upper = row * image_size
        # Crop box is (left, upper, right, lower).
        tile = resized_img.crop(
            (left, upper, left + image_size, upper + image_size))
        processed_images.append(tile)
    assert len(processed_images) == blocks

    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
def image_to_pixel_values(image: Image.Image, input_size=448, max_num=6):
    """Tile an image and convert the tiles to one stacked tensor.

    The image is split by `dynamic_preprocess` (thumbnail enabled), every
    resulting tile is run through `build_transform(input_size)`, and the
    per-tile tensors are stacked along a new leading dimension.
    """
    transform = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(image,
                               image_size=input_size,
                               use_thumbnail=True,
                               max_num=max_num)
    return torch.stack([transform(tile) for tile in tiles])
def get_internvl_num_patches(image_size: int, patch_size: int,
                             downsample_ratio: float):
    """Return the CLIP patch count scaled by `downsample_ratio` squared,
    truncated to an int."""
    clip_patches = get_clip_num_patches(image_size=image_size,
                                        patch_size=patch_size)
    return int(clip_patches * (downsample_ratio**2))
def get_max_internvl_image_tokens(ctx: InputContext):
    """Return the largest number of image tokens one image can occupy."""
    hf_config = ctx.get_hf_config(PretrainedConfig)
    vision_config = hf_config.vision_config

    tokens_per_tile = get_internvl_num_patches(vision_config.image_size,
                                               vision_config.patch_size,
                                               hf_config.downsample_ratio)
    # NOTE(review): 7 presumably stands for the default maximum of 6 tiles
    # plus one thumbnail tile - confirm if max_num ever becomes configurable.
    return tokens_per_tile * 7
def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs):
    """Expand the `<image>` placeholder into the right number of image tokens.

    Args:
        ctx: Gives access to the model config and the HF config.
        llm_inputs: Incoming request inputs. Returned unchanged when they
            carry no `"image"` entry in `multi_modal_data`.

    Returns:
        New `LLMInputs` whose `prompt_token_ids` encode the prompt with the
        first `<image>` occurrence replaced by
        `IMG_START + IMG_CONTEXT * n + IMG_END`, where `n` is the number of
        tiles produced for the image times the tokens per tile.

    Raises:
        NotImplementedError: If the image is given as a tensor.
        TypeError: If the image is neither a PIL image nor a tensor.
    """
    multi_modal_data = llm_inputs.get("multi_modal_data")
    if multi_modal_data is None or "image" not in multi_modal_data:
        return llm_inputs

    model_config = ctx.model_config
    hf_config = ctx.get_hf_config(PretrainedConfig)
    vision_config = hf_config.vision_config

    image_data = multi_modal_data["image"]
    if isinstance(image_data, Image.Image):
        width, height = image_data.size
        num_blocks, _, _ = calculate_num_blocks(width, height)
        # image_to_pixel_values() tiles with use_thumbnail=True, and
        # dynamic_preprocess() only appends the thumbnail when more than one
        # tile was produced. Mirror that here so that the number of
        # placeholder tokens matches the number of tiles actually encoded.
        num_tiles = num_blocks + 1 if num_blocks > 1 else num_blocks
    elif isinstance(image_data, torch.Tensor):
        raise NotImplementedError("Embeddings input is not supported yet")
    else:
        raise TypeError(f"Invalid image type: {type(image_data)}")

    image_size = vision_config.image_size
    patch_size = vision_config.patch_size
    downsample_ratio = hf_config.downsample_ratio
    num_patches = get_internvl_num_patches(image_size, patch_size,
                                           downsample_ratio)

    tokenizer = cached_get_tokenizer(model_config.tokenizer,
                                     trust_remote_code=True)

    prompt = llm_inputs.get("prompt")
    prompt_token_ids = llm_inputs["prompt_token_ids"]
    if prompt is None:
        # Only token ids were supplied; recover text so the placeholder can
        # be substituted.
        prompt = tokenizer.decode(prompt_token_ids)
    image_prompt = IMG_START + IMG_CONTEXT * num_tiles * num_patches + IMG_END
    # Only the first placeholder is expanded.
    new_prompt = prompt.replace('<image>', image_prompt, 1)
    new_prompt_token_ids = tokenizer.encode(new_prompt)

    return LLMInputs(prompt=prompt,
                     prompt_token_ids=new_prompt_token_ids,
                     multi_modal_data=multi_modal_data)
def input_mapper_for_internvl(ctx: InputContext, data: object):
    """Turn raw image data into the keyword inputs consumed by the model.

    A PIL image is converted to stacked tile tensors; any other value is
    passed through unchanged. The encoded id of the IMG_CONTEXT token is
    included alongside the pixel values.
    """
    if isinstance(data, Image.Image):
        data = image_to_pixel_values(data)

    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer,
                                     trust_remote_code=True)
    # Take the first row of the batched encoding.
    encoded_context = tokenizer.encode(IMG_CONTEXT,
                                       add_special_tokens=False,
                                       return_tensors="pt")
    image_token_id = encoded_context[0]

    return MultiModalInputs({
        "pixel_values": data,
        "image_token_id": image_token_id
    })
def dummy_data_for_internvl(ctx: InputContext, seq_len: int):
    """Create dummy sequence data and a dummy image.

    The sequence data is built for the maximum image token count returned by
    `get_max_internvl_image_tokens`; the image uses the
    MAX_IMAGE_FEATURE_SIZE_* dimensions.

    Returns:
        `(seq_data, mm_data)`.
    """
    image_feature_size = get_max_internvl_image_tokens(ctx)

    vision_config = ctx.get_hf_config(PretrainedConfig).vision_config
    tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer,
                                     trust_remote_code=True)
    context_token_id = tokenizer.encode(IMG_CONTEXT,
                                        add_special_tokens=False)[0]

    seq_data = dummy_seq_data_for_clip(
        vision_config,
        seq_len,
        image_token_id=context_token_id,
        image_feature_size_override=image_feature_size,
    )
    mm_data = dummy_image_for_clip(
        vision_config,
        image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH,
        image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT,
    )

    return seq_data, mm_data
@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_internvl)
@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens)
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_internvl)
@INPUT_REGISTRY.register_input_processor(input_processor_for_internvl)
class InternVLChatModel(nn.Module, SupportsVision):
    """InternVL chat model: vision tower, MLP projector and language model."""

    def __init__(self,
                 config: PretrainedConfig,
                 multimodal_config: MultiModalConfig,
                 cache_config: Optional[CacheConfig] = None,
                 quant_config: Optional[QuantizationConfig] = None) -> None:
        super().__init__()

        self.config = config
        self.multimodal_config = multimodal_config

        vision_config = config.vision_config
        image_size = config.force_image_size or vision_config.image_size
        self.patch_size = vision_config.patch_size
        self.select_layer = config.select_layer
        self.num_image_token = int(
            (image_size // self.patch_size)**2 * (config.downsample_ratio**2))
        self.downsample_ratio = config.downsample_ratio
        self.ps_version = config.ps_version

        # Build only as many vision layers as are needed to reach the
        # selected feature layer; negative indices count from the end.
        feature_layer = self.select_layer
        depth = feature_layer + 1
        if feature_layer < 0:
            depth += vision_config.num_hidden_layers
        self.vision_model = InternVisionModel(
            vision_config, num_hidden_layers_override=depth)

        text_config = config.text_config
        llm_class = ModelRegistry.load_model_cls(text_config.architectures[0])
        self.language_model = llm_class(text_config, cache_config,
                                        quant_config)

        vit_hidden_size = vision_config.hidden_size
        llm_hidden_size = text_config.hidden_size

        # Channel width after pixel_shuffle folds spatial positions into
        # channels.
        projector_in = vit_hidden_size * int(1 / self.downsample_ratio)**2
        self.mlp1 = nn.Sequential(
            nn.LayerNorm(projector_in),
            nn.Linear(projector_in, llm_hidden_size),
            nn.GELU(),
            nn.Linear(llm_hidden_size, llm_hidden_size),
        )

        self.img_context_token_id = None

    def pixel_shuffle(self, x, scale_factor=0.5):
        """Trade spatial resolution for channels on a 4-D (N, W, H, C) input.

        Both spatial axes are multiplied by `scale_factor` and the channel
        axis is divided by `scale_factor` squared.
        """
        n, w, h, c = x.size()
        scaled_h = int(h * scale_factor)
        scaled_w = int(w * scale_factor)

        # N, W, H, C --> N, W, H * scale, C // scale
        x = x.view(n, w, scaled_h, int(c / scale_factor))
        # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(n, scaled_h, scaled_w,
                   int(c / (scale_factor * scale_factor)))
        # Every version other than 'v1' swaps the two spatial axes back.
        if self.ps_version != 'v1':
            x = x.permute(0, 2, 1, 3).contiguous()
        return x

    def extract_feature(self, pixel_values):
        """Encode image tiles and project them for the language model."""
        vit_embeds = self.vision_model(pixel_values=pixel_values)
        # Drop the first token of every sequence.
        vit_embeds = vit_embeds[:, 1:, :]

        # Lay the remaining tokens out on a square grid.
        side = int(vit_embeds.shape[1]**0.5)
        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], side, side, -1)
        vit_embeds = self.pixel_shuffle(vit_embeds,
                                        scale_factor=self.downsample_ratio)
        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1,
                                        vit_embeds.shape[-1])
        return self.mlp1(vit_embeds)

    def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor:
        """Check that every dimension after the first equals `[2]`."""
        if list(data.shape[1:]) != [2]:
            raise ValueError(
                f"The expected image sizes shape is batch dimension plus "
                f"{[2]}. You supplied {data.shape}.")

        return data

    def _validate_pixel_values(
        self, data: Union[torch.Tensor, List[torch.Tensor]]
    ) -> Union[torch.Tensor, List[torch.Tensor]]:
        """Check that every element of `data` has shape `(3, size, size)`,
        where `size` is the vision config's `image_size`.

        Raises:
            ValueError: On the first element with a different shape.
        """
        side = self.config.vision_config.image_size
        expected_dims = (3, side, side)

        for element in data:
            if tuple(element.shape) != expected_dims:
                expected_expr = ("num_patches", *map(str, expected_dims))
                raise ValueError(
                    "The expected shape of pixel values in each batch element "
                    f"is {expected_expr}. You supplied {tuple(element.shape)}.")

        return data

    def _parse_and_validate_image_input(
            self, **kwargs: object) -> Optional[InternVLImagePixelInputs]:
        """Extract image inputs from the keyword arguments.

        Returns None when no `pixel_values` were passed. Otherwise records
        the image context token id on the instance and returns the validated
        pixel values.
        """
        pixel_values = kwargs.pop("pixel_values", None)
        image_token_id = kwargs.pop("image_token_id", None)

        if pixel_values is None:
            return None

        self.img_context_token_id = image_token_id[0]

        if not isinstance(pixel_values, (torch.Tensor, list)):
            raise ValueError("Incorrect type of pixel values. "
                             f"Got type: {type(pixel_values)}")

        return InternVLImagePixelInputs(
            type="pixel_values",
            data=self._validate_pixel_values(pixel_values),
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        **kwargs: object,
    ) -> SamplerOutput:
        """Run the language model, merging image embeddings when present.

        With image input, the text embeddings are computed first, the
        positions holding the image context token are filled with the
        projected vision features, and the language model is called with
        embeddings instead of token ids.
        """
        image_input = self._parse_and_validate_image_input(**kwargs)

        inputs_embeds = None
        if image_input is not None:
            text_embeds = self.language_model.model.get_input_embeddings(
                input_ids)
            vit_embeds = self.extract_feature(image_input["data"])
            inputs_embeds = merge_vision_embeddings(input_ids, text_embeds,
                                                    vit_embeds,
                                                    self.img_context_token_id)
            # The embeddings now carry everything; token ids are not passed.
            input_ids = None

        hidden_states = self.language_model.model(input_ids,
                                                  positions,
                                                  kv_caches,
                                                  attn_metadata,
                                                  None,
                                                  inputs_embeds=inputs_embeds)
        return hidden_states

    def compute_logits(self, hidden_states: torch.Tensor,
                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
        """Delegate to the language model."""
        return self.language_model.compute_logits(hidden_states,
                                                  sampling_metadata)

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        """Delegate to the language model."""
        return self.language_model.sample(logits, sampling_metadata)

    def _filter_weights(self, weights: Iterable[Tuple[str, torch.Tensor]],
                        prefix: str):
        """Yield the weights whose first dotted name component is `prefix`,
        with that component removed from the name."""
        for full_name, loaded_weight in weights:
            head, _, remainder = full_name.partition(".")
            if head == prefix:
                yield remainder, loaded_weight

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        """Route checkpoint weights to the three sub-components by prefix."""
        # One independent iterator per component.
        vit_stream, mlp_stream, llm_stream = itertools.tee(weights, 3)

        # load vision encoder
        self.vision_model.load_weights(
            self._filter_weights(vit_stream, "vision_model"))

        # load mlp projector
        mlp_params_dict = dict(self.mlp1.named_parameters())
        for name, loaded_weight in self._filter_weights(mlp_stream, "mlp1"):
            param = mlp_params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)

        # load llm backbone
        self.language_model.load_weights(
            self._filter_weights(llm_stream, "language_model"))
vllm/model_executor/models/jamba.py
View file @
e661d594
# coding=utf-8
# coding=utf-8
"""Inference-only J
urassic
model."""
"""Inference-only J
amba
model."""
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
Dict
,
Iterable
,
List
,
Optional
,
Tuple
from
typing
import
Dict
,
Iterable
,
List
,
Optional
,
Tuple
...
@@ -15,10 +15,9 @@ from vllm.attention.backends.abstract import AttentionMetadata
...
@@ -15,10 +15,9 @@ from vllm.attention.backends.abstract import AttentionMetadata
from
vllm.attention.layer
import
Attention
from
vllm.attention.layer
import
Attention
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
SchedulerConfig
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
SchedulerConfig
from
vllm.distributed
import
(
get_tensor_model_parallel_rank
,
from
vllm.distributed
import
(
get_tensor_model_parallel_rank
,
get_tensor_model_parallel_world_size
,
get_tensor_model_parallel_world_size
)
tensor_model_parallel_all_reduce
)
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.fused_moe
import
f
used
_moe
from
vllm.model_executor.layers.fused_moe
import
F
used
MoE
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
MergedColumnParallelLinear
,
MergedColumnParallelLinear
,
...
@@ -282,108 +281,50 @@ class JambaMLP(nn.Module):
...
@@ -282,108 +281,50 @@ class JambaMLP(nn.Module):
class
JambaMoE
(
nn
.
Module
):
class
JambaMoE
(
nn
.
Module
):
"""A tensor-parallel MoE implementation for Mixtral that shards each expert
across all ranks.
Each expert's weights are sharded across all ranks and a fused MoE
def
__init__
(
self
,
kernel is used for the forward pass, and finally we reduce the outputs
config
:
JambaConfig
,
across ranks.
num_experts
:
Optional
[
int
]
=
None
,
"""
top_k
:
Optional
[
int
]
=
None
,
params_dtype
:
Optional
[
torch
.
dtype
]
=
None
,
def
__init__
(
tp_size
:
Optional
[
int
]
=
None
,
self
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
):
config
:
JambaConfig
,
params_dtype
:
Optional
[
torch
.
dtype
]
=
None
,
tp_size
:
Optional
[
int
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
):
super
().
__init__
()
super
().
__init__
()
self
.
tp_size
=
tp_size
or
get_tensor_model_parallel_world_size
()
self
.
num_total_experts
=
num_experts
or
config
.
num_experts
self
.
num_total_experts
=
config
.
num_experts
self
.
top_k
=
top_k
or
config
.
num_experts_per_tok
self
.
top_k
=
config
.
num_experts_per_tok
self
.
hidden_size
=
config
.
hidden_size
self
.
hidden_size
=
config
.
hidden_size
self
.
intermediate_size
=
config
.
intermediate_size
//
self
.
tp_size
self
.
intermediate_size
=
config
.
intermediate_size
if
params_dtype
is
None
:
params_dtype
=
torch
.
get_default_dtype
()
self
.
params_dtype
=
params_dtype
self
.
router
=
ReplicatedLinear
(
self
.
hidden_size
,
if
self
.
num_total_experts
>
1
:
self
.
num_total_experts
,
self
.
router
=
ReplicatedLinear
(
self
.
hidden_size
,
bias
=
False
,
self
.
num_total_experts
,
params_dtype
=
self
.
params_dtype
)
bias
=
False
,
quant_config
=
None
,
self
.
ws
=
nn
.
Parameter
(
params_dtype
=
params_dtype
)
torch
.
empty
(
self
.
num_total_experts
,
self
.
experts
=
FusedMoE
(
self
.
num_total_experts
,
2
*
self
.
intermediate_size
,
self
.
top_k
,
self
.
hidden_size
,
self
.
hidden_size
,
device
=
"cuda"
,
self
.
intermediate_size
,
dtype
=
self
.
params_dtype
,
tp_size
=
tp_size
,
))
params_dtype
=
params_dtype
,
self
.
w2s
=
nn
.
Parameter
(
reduce_results
=
True
,
torch
.
empty
(
renormalize
=
False
,
self
.
num_total_experts
,
use_grouped_topk
=
False
,
self
.
hidden_size
,
quant_config
=
quant_config
)
self
.
intermediate_size
,
device
=
"cuda"
,
dtype
=
self
.
params_dtype
,
))
set_weight_attrs
(
self
.
ws
,
{
"weight_loader"
:
self
.
weight_loader
,
},
)
set_weight_attrs
(
self
.
w2s
,
{
"weight_loader"
:
self
.
weight_loader
,
},
)
def
weight_loader
(
self
,
param
:
nn
.
Parameter
,
loaded_weight
:
torch
.
Tensor
,
weight_name
:
str
,
expert_id
:
int
,
):
tp_rank
=
get_tensor_model_parallel_rank
()
param_data
=
param
.
data
shard_size
=
self
.
intermediate_size
shard
=
slice
(
tp_rank
*
shard_size
,
(
tp_rank
+
1
)
*
shard_size
)
if
weight_name
.
endswith
(
"gate_proj.weight"
):
param_data
[
expert_id
,
0
:
shard_size
,
:]
=
loaded_weight
[
shard
,
:]
if
weight_name
.
endswith
(
"up_proj.weight"
):
param_data
[
expert_id
,
shard_size
:
2
*
shard_size
,
:]
=
loaded_weight
[
shard
,
:]
if
weight_name
.
endswith
(
"down_proj.weight"
):
param_data
[
expert_id
,
:,
:]
=
loaded_weight
[:,
shard
]
def
forward
(
self
,
hidden_states
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
forward
(
self
,
hidden_states
:
torch
.
Tensor
)
->
torch
.
Tensor
:
num_tokens
,
hidden_siz
e
=
hidden_states
.
shape
orig_shap
e
=
hidden_states
.
shape
hidden_states
=
hidden_states
.
view
(
-
1
,
self
.
hidden_size
)
hidden_states
=
hidden_states
.
view
(
-
1
,
self
.
hidden_size
)
# router_logits: (batch * sequence_length, n_experts)
# router_logits: (batch * sequence_length, n_experts)
router_logits
,
_
=
self
.
router
(
hidden_states
)
if
self
.
num_total_experts
>
1
:
router_logits
,
_
=
self
.
router
(
hidden_states
)
final_hidden_states
=
fused_moe
(
else
:
hidden_states
,
router_logits
=
torch
.
ones
((
hidden_states
.
shape
[
0
],
1
),
self
.
ws
,
device
=
hidden_states
.
device
,
self
.
w2s
,
dtype
=
hidden_states
.
dtype
)
router_logits
,
hidden_states
=
self
.
experts
(
hidden_states
,
router_logits
)
self
.
top_k
,
return
hidden_states
.
view
(
orig_shape
)
renormalize
=
False
,
# Mixtral normalize the expert probs to 1. We don't!
inplace
=
True
,
)
if
self
.
tp_size
>
1
:
final_hidden_states
=
tensor_model_parallel_all_reduce
(
final_hidden_states
)
return
final_hidden_states
.
view
(
num_tokens
,
hidden_size
)
class
JambaMambaDecoderLayer
(
nn
.
Module
):
class
JambaMambaDecoderLayer
(
nn
.
Module
):
...
@@ -644,6 +585,11 @@ class JambaForCausalLM(nn.Module, HasInnerState):
...
@@ -644,6 +585,11 @@ class JambaForCausalLM(nn.Module, HasInnerState):
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
lora_config
:
Optional
[
LoRAConfig
]
=
None
,
scheduler_config
:
Optional
[
SchedulerConfig
]
=
None
,
scheduler_config
:
Optional
[
SchedulerConfig
]
=
None
,
)
->
None
:
)
->
None
:
assert
not
scheduler_config
.
chunked_prefill_enabled
,
\
"Jamba currently does not support chunked prefill"
assert
not
cache_config
.
enable_prefix_caching
,
\
"Jamba currently does not support prefix caching"
super
().
__init__
()
super
().
__init__
()
self
.
config
=
config
self
.
config
=
config
self
.
scheduler_config
=
scheduler_config
self
.
scheduler_config
=
scheduler_config
...
@@ -912,15 +858,13 @@ class JambaForCausalLM(nn.Module, HasInnerState):
...
@@ -912,15 +858,13 @@ class JambaForCausalLM(nn.Module, HasInnerState):
(
"gate_up_proj"
,
"up_proj"
,
1
),
(
"gate_up_proj"
,
"up_proj"
,
1
),
]
]
expert_params_mapping
=
[
# Params for weights, fp8 weight scales, fp8 activation scales
# (param_name, weight_name, expert_id)
# (param_name, weight_name, expert_id, shard_id)
(
expert_params_mapping
=
FusedMoE
.
make_expert_params_mapping
(
"ws"
if
weight_name
in
[
"gate_proj"
,
"up_proj"
]
else
"w2s"
,
ckpt_gate_proj_name
=
"gate_proj"
,
f
"experts.
{
expert_id
}
.
{
weight_name
}
.weight"
,
ckpt_down_proj_name
=
"down_proj"
,
expert_id
,
ckpt_up_proj_name
=
"up_proj"
,
)
for
expert_id
in
range
(
self
.
config
.
num_experts
)
num_experts
=
self
.
config
.
num_experts
)
for
weight_name
in
[
"down_proj"
,
"up_proj"
,
"gate_proj"
]
]
params_dict
=
dict
(
self
.
named_parameters
())
params_dict
=
dict
(
self
.
named_parameters
())
for
name
,
loaded_weight
in
weights
:
for
name
,
loaded_weight
in
weights
:
...
@@ -947,7 +891,8 @@ class JambaForCausalLM(nn.Module, HasInnerState):
...
@@ -947,7 +891,8 @@ class JambaForCausalLM(nn.Module, HasInnerState):
weight_loader
(
param
,
loaded_weight
,
shard_id
)
weight_loader
(
param
,
loaded_weight
,
shard_id
)
break
break
else
:
else
:
for
param_name
,
weight_name
,
expert_id
in
expert_params_mapping
:
for
mapping
in
expert_params_mapping
:
param_name
,
weight_name
,
expert_id
,
shard_id
=
mapping
if
weight_name
not
in
name
:
if
weight_name
not
in
name
:
continue
continue
name
=
name
.
replace
(
weight_name
,
param_name
)
name
=
name
.
replace
(
weight_name
,
param_name
)
...
@@ -956,6 +901,7 @@ class JambaForCausalLM(nn.Module, HasInnerState):
...
@@ -956,6 +901,7 @@ class JambaForCausalLM(nn.Module, HasInnerState):
weight_loader
(
param
,
weight_loader
(
param
,
loaded_weight
,
loaded_weight
,
weight_name
,
weight_name
,
shard_id
=
shard_id
,
expert_id
=
expert_id
)
expert_id
=
expert_id
)
break
break
else
:
else
:
...
...
vllm/model_executor/models/llama.py
View file @
e661d594
...
@@ -485,6 +485,11 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
...
@@ -485,6 +485,11 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA):
# Models trained using ColossalAI may include these tensors in
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
# the checkpoint. Skip them.
continue
continue
# With tie_word_embeddings, we can skip lm_head.weight
# The weight might appear unnecessarily in the files if the model is
# processed with quantization, LoRA, fine-tuning, etc.
if
self
.
config
.
tie_word_embeddings
and
"lm_head.weight"
in
name
:
continue
if
scale_name
:
=
get_compressed_tensors_cache_scale
(
name
):
if
scale_name
:
=
get_compressed_tensors_cache_scale
(
name
):
# Loading kv cache scales for compressed-tensors quantization
# Loading kv cache scales for compressed-tensors quantization
param
=
params_dict
[
scale_name
]
param
=
params_dict
[
scale_name
]
...
...
vllm/model_executor/models/llava_next.py
View file @
e661d594
...
@@ -21,7 +21,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
...
@@ -21,7 +21,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from
vllm.model_executor.models.clip
import
CLIPVisionModel
from
vllm.model_executor.models.clip
import
CLIPVisionModel
from
vllm.model_executor.models.llama
import
LlamaModel
from
vllm.model_executor.models.llama
import
LlamaModel
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
BatchedTensors
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
.clip
import
(
dummy_image_for_clip
,
dummy_seq_data_for_clip
,
from
.clip
import
(
dummy_image_for_clip
,
dummy_seq_data_for_clip
,
...
@@ -43,7 +43,7 @@ MAX_IMAGE_FEATURE_SIZE_HEIGHT = MAX_IMAGE_FEATURE_SIZE_WIDTH = 448
...
@@ -43,7 +43,7 @@ MAX_IMAGE_FEATURE_SIZE_HEIGHT = MAX_IMAGE_FEATURE_SIZE_WIDTH = 448
class
LlavaNextImagePixelInputs
(
TypedDict
):
class
LlavaNextImagePixelInputs
(
TypedDict
):
type
:
Literal
[
"pixel_values"
]
type
:
Literal
[
"pixel_values"
]
data
:
Batched
Tensor
s
data
:
Union
[
torch
.
Tensor
,
List
[
torch
.
Tensor
]]
"""
"""
Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
...
@@ -62,31 +62,26 @@ class LlavaNextImagePixelInputs(TypedDict):
...
@@ -62,31 +62,26 @@ class LlavaNextImagePixelInputs(TypedDict):
LlavaNextImageInputs
=
LlavaNextImagePixelInputs
LlavaNextImageInputs
=
LlavaNextImagePixelInputs
# Taken from: https://github.com/huggingface/text-generation-inference/blob/v2.0.4/server/text_generation_server/models/vlm_causal_lm.py#L91
# Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L79
# NOTE: new_height and new_width are further incremented to properly invert the
# floordiv operation: https://github.com/huggingface/transformers/blob/v4.42.2/src/transformers/models/llava_next/modeling_llava_next.py#L133
def
_get_llava_next_num_unpadded_features
(
def
_get_llava_next_num_unpadded_features
(
height
:
int
,
original_
height
:
int
,
width
:
int
,
original_
width
:
int
,
npatches
:
int
,
npatches
:
int
,
num_patch_height
:
int
,
num_patch_height
:
int
,
num_patch_width
:
int
,
num_patch_width
:
int
,
)
->
Tuple
[
int
,
int
]:
)
->
Tuple
[
int
,
int
]:
current_height
=
npatches
*
num_patch_height
current_height
=
npatches
*
num_patch_height
current_width
=
npatches
*
num_patch_width
current_width
=
npatches
*
num_patch_width
current_height
=
torch
.
tensor
(
current_height
).
to
(
"cuda"
)
current_width
=
torch
.
tensor
(
current_width
).
to
(
"cuda"
)
aspect_ratio
:
float
=
width
/
height
aspect_ratio
=
original_width
/
original_height
current_aspect_ratio
:
float
=
current_width
/
current_height
current_aspect_ratio
=
current_width
/
current_height
if
aspect_ratio
>
current_aspect_ratio
:
if
aspect_ratio
>
current_aspect_ratio
:
scale_factor
=
current_width
/
width
new_height
=
(
original_height
*
current_width
)
//
original_width
new_height
=
int
(
height
*
scale_factor
)
padding
=
(
current_height
-
new_height
)
//
2
padding
=
(
current_height
-
new_height
)
//
2
current_height
-=
padding
*
2
current_height
-=
padding
*
2
else
:
else
:
scale_factor
=
current_height
/
height
new_width
=
(
original_width
*
current_height
)
//
original_height
new_width
=
int
(
width
*
scale_factor
)
padding
=
(
current_width
-
new_width
)
//
2
padding
=
(
current_width
-
new_width
)
//
2
current_width
-=
padding
*
2
current_width
-=
padding
*
2
...
@@ -95,7 +90,7 @@ def _get_llava_next_num_unpadded_features(
...
@@ -95,7 +90,7 @@ def _get_llava_next_num_unpadded_features(
return
(
unpadded_features
,
newline_features
)
return
(
unpadded_features
,
newline_features
)
# Based on: https://github.com/huggingface/text-generation-inference/blob/v2.
0.4
/server/text_generation_server/models/vlm_causal_lm.py#L1
11
# Based on: https://github.com/huggingface/text-generation-inference/blob/v2.
2.0
/server/text_generation_server/models/vlm_causal_lm.py#L1
06
def
get_llava_next_image_feature_size
(
def
get_llava_next_image_feature_size
(
hf_config
:
LlavaNextConfig
,
hf_config
:
LlavaNextConfig
,
*
,
*
,
...
@@ -111,9 +106,7 @@ def get_llava_next_image_feature_size(
...
@@ -111,9 +106,7 @@ def get_llava_next_image_feature_size(
)
)
base_feature_size
=
num_patches
*
num_patches
base_feature_size
=
num_patches
*
num_patches
# Note: We follow the "wrong" width/height order
num_patch_height
,
num_patch_width
=
get_anyres_image_grid_shape
(
# [ref: PR huggingface/transformers#31588]
num_patch_width
,
num_patch_height
=
get_anyres_image_grid_shape
(
image_size
=
(
input_height
,
input_width
),
image_size
=
(
input_height
,
input_width
),
grid_pinpoints
=
hf_config
.
image_grid_pinpoints
,
grid_pinpoints
=
hf_config
.
image_grid_pinpoints
,
patch_size
=
vision_config
.
image_size
,
patch_size
=
vision_config
.
image_size
,
...
@@ -349,11 +342,12 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
...
@@ -349,11 +342,12 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
if
patch_embeddings
.
shape
[
0
]
>
1
:
if
patch_embeddings
.
shape
[
0
]
>
1
:
other_patch_embeds
=
patch_embeddings
[
1
:]
other_patch_embeds
=
patch_embeddings
[
1
:]
# Move to CPU to avoid floating-point errors
orig_height
,
orig_width
=
image_size
.
tolist
()
# image_aspect_ratio == "anyres"
# image_aspect_ratio == "anyres"
# Note: We follow the "wrong" width/height order
num_patch_height
,
num_patch_width
=
get_anyres_image_grid_shape
(
# [ref: PR huggingface/transformers#31588]
(
orig_height
,
orig_width
),
num_patch_width
,
num_patch_height
=
get_anyres_image_grid_shape
(
image_size
,
self
.
config
.
image_grid_pinpoints
,
self
.
config
.
image_grid_pinpoints
,
self
.
config
.
vision_config
.
image_size
,
self
.
config
.
vision_config
.
image_size
,
)
)
...
@@ -365,7 +359,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
...
@@ -365,7 +359,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
.
permute
(
4
,
0
,
2
,
1
,
3
).
contiguous
()
\
.
permute
(
4
,
0
,
2
,
1
,
3
).
contiguous
()
\
.
flatten
(
1
,
2
).
flatten
(
2
,
3
)
.
flatten
(
1
,
2
).
flatten
(
2
,
3
)
other_patch_embeds
=
unpad_image
(
other_patch_embeds
,
other_patch_embeds
=
unpad_image
(
other_patch_embeds
,
image_size
)
(
orig_height
,
orig_width
)
)
other_patch_embeds
=
torch
.
cat
((
other_patch_embeds
=
torch
.
cat
((
other_patch_embeds
,
other_patch_embeds
,
self
.
image_newline
[:,
None
,
None
]
\
self
.
image_newline
[:,
None
,
None
]
\
...
@@ -398,7 +392,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
...
@@ -398,7 +392,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
def
_process_image_pixels
(
def
_process_image_pixels
(
self
,
self
,
inputs
:
LlavaNextImagePixelInputs
,
inputs
:
LlavaNextImagePixelInputs
,
)
->
Batched
Tensor
s
:
)
->
Union
[
torch
.
Tensor
,
List
[
torch
.
Tensor
]]
:
assert
self
.
vision_tower
is
not
None
assert
self
.
vision_tower
is
not
None
pixel_values
=
inputs
[
"data"
]
pixel_values
=
inputs
[
"data"
]
...
@@ -425,7 +419,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
...
@@ -425,7 +419,9 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsVision):
]
]
def
_process_image_input
(
def
_process_image_input
(
self
,
image_input
:
LlavaNextImageInputs
)
->
BatchedTensors
:
self
,
image_input
:
LlavaNextImageInputs
,
)
->
Union
[
torch
.
Tensor
,
List
[
torch
.
Tensor
]]:
patch_embeddings
=
self
.
_process_image_pixels
(
image_input
)
patch_embeddings
=
self
.
_process_image_pixels
(
image_input
)
image_sizes
=
image_input
.
get
(
"image_sizes"
)
image_sizes
=
image_input
.
get
(
"image_sizes"
)
...
...
vllm/model_executor/models/minicpm.py
View file @
e661d594
...
@@ -370,6 +370,7 @@ class MiniCPMModel(nn.Module):
...
@@ -370,6 +370,7 @@ class MiniCPMModel(nn.Module):
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
kv_caches
:
List
[
torch
.
Tensor
],
kv_caches
:
List
[
torch
.
Tensor
],
attn_metadata
:
AttentionMetadata
,
attn_metadata
:
AttentionMetadata
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
inputs_embeds
:
Optional
[
torch
.
Tensor
]
=
None
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
if
inputs_embeds
is
not
None
:
if
inputs_embeds
is
not
None
:
...
@@ -466,7 +467,7 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA):
...
@@ -466,7 +467,7 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA):
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
intermediate_tensors
:
Optional
[
IntermediateTensors
]
=
None
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
hidden_states
=
self
.
model
(
input_ids
,
positions
,
kv_caches
,
hidden_states
=
self
.
model
(
input_ids
,
positions
,
kv_caches
,
attn_metadata
)
attn_metadata
,
intermediate_tensors
)
return
hidden_states
return
hidden_states
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
def
compute_logits
(
self
,
hidden_states
:
torch
.
Tensor
,
...
@@ -513,7 +514,11 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA):
...
@@ -513,7 +514,11 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA):
# Models trained using ColossalAI may include these tensors in
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
# the checkpoint. Skip them.
continue
continue
# With tie_word_embeddings, we can skip lm_head.weight
# The weight might appear unnecessarily in the files if the model is
# processed with quantization, LoRA, fine-tuning, etc.
if
self
.
config
.
tie_word_embeddings
and
"lm_head.weight"
in
name
:
continue
for
(
param_name
,
weight_name
,
shard_id
)
in
stacked_params_mapping
:
for
(
param_name
,
weight_name
,
shard_id
)
in
stacked_params_mapping
:
if
weight_name
not
in
name
:
if
weight_name
not
in
name
:
continue
continue
...
...
vllm/model_executor/models/minicpmv.py
0 → 100644
View file @
e661d594
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only MiniCPM-V model compatible with HuggingFace weights."""
import
math
import
re
from
functools
import
partial
from
typing
import
(
Any
,
Callable
,
Iterable
,
List
,
Optional
,
Tuple
,
TypedDict
,
Union
)
import
numpy
as
np
import
torch
import
torch.nn.functional
as
F
import
torch.types
from
PIL
import
Image
from
torch
import
nn
from
torch.nn.init
import
trunc_normal_
from
transformers.configuration_utils
import
PretrainedConfig
from
vllm.attention
import
AttentionMetadata
from
vllm.config
import
CacheConfig
,
MultiModalConfig
from
vllm.inputs
import
INPUT_REGISTRY
,
InputContext
,
LLMInputs
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.linear
import
ReplicatedLinear
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.model_loader.utils
import
set_default_torch_dtype
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.interfaces
import
SupportsVision
from
vllm.model_executor.models.llama
import
LlamaModel
from
vllm.model_executor.models.minicpm
import
MiniCPMModel
from
vllm.model_executor.models.qwen2
import
Qwen2Model
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
(
cached_get_image_processor
,
cached_get_tokenizer
)
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
,
SequenceData
from
.idefics2_vision_model
import
Idefics2VisionTransformer
# Module-level logger, named after this module.
logger = init_logger(__name__)

# Maps a name fragment to its replacement ("llm.lm_head" -> "lm_head",
# "llm.model" -> "llm").
# NOTE(review): presumably applied to checkpoint parameter names while
# loading weights -- the consumer is not visible here, confirm before
# relying on it.
_KEYS_TO_MODIFY_MAPPING = {
    "llm.lm_head": "lm_head",
    "llm.model": "llm",
}
class MiniCPMVImagePixelInputs(TypedDict):
    """Typed dictionary describing the image inputs of MiniCPM-V."""

    # Shape: `(batch_size * num_images, num_channels, height, width)`
    #
    # Note that the image size may vary, so we pass it as a list
    # instead of a batched tensor.
    pixel_values: List[torch.Tensor]

    # Shape: `(batch_size * num_images, 2)`
    #
    # This should be in `(start, stop)` format.
    image_bounds: torch.Tensor

    # Shape: `(batch_size * num_images, 2)`
    #
    # This should be in `(height, width)` format.
    tgt_sizes: torch.Tensor
# Alias for the accepted image-input type; it is bound directly to the
# pixel-value variant defined above.
MiniCPMVImageInputs = MiniCPMVImagePixelInputs

# Default normalization-layer factory for the resamplers: ``nn.LayerNorm``
# with ``eps`` fixed to 1e-6.
DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6)
def get_abs_pos(abs_pos: torch.Tensor, tgt_size: torch.Tensor):
    """Resize a square table of position embeddings to a target grid.

    The ``(L, C)`` table is viewed as a ``sqrt(L) x sqrt(L)`` grid with ``C``
    channels, resampled to ``(H, W)`` with bicubic interpolation, and
    flattened again.

    Args:
        abs_pos: Position embeddings of shape ``(L, C)``; ``L`` must be a
            perfect square for the reshape to succeed.
        tgt_size: Target grid size, indexable as ``(H, W)``.

    Returns:
        Tensor of shape ``(H * W, C)`` in the dtype of ``abs_pos``.
    """
    side = int(math.sqrt(abs_pos.size(0)))
    original_dtype = abs_pos.dtype

    # Interpolate in float32 on a (1, C, side, side) layout.
    grid = abs_pos.float().reshape(1, side, side, -1).permute(0, 3, 1, 2)
    resized = F.interpolate(
        grid,
        size=(tgt_size[0], tgt_size[1]),
        mode="bicubic",
        align_corners=False,
    )
    # Back to channel-last, then collapse (1, H, W) into a single axis.
    flattened = resized.permute(0, 2, 3, 1).flatten(0, 2)
    return flattened.to(dtype=original_dtype)
# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20
def get_2d_sincos_pos_embed(
    embed_dim: int,
    grid_size: Union[int, Tuple[int, int]],
    cls_token: bool = False,
    version: Tuple[int, int] = (2, 0),
):
    """Build 2D sine/cosine position embeddings for a grid.

    Args:
        embed_dim: Embedding dimension of the result.
        grid_size: Either a single int (square grid) or ``(height, width)``.
        cls_token: If true, and ``version == (2, 0)``, prepend one all-zero
            row. The flag is not consulted for other versions.
        version: ``(2, 0)`` selects the flattened layout; any other value
            selects the grid-shaped layout.

    Returns:
        pos_embed: for ``version == (2, 0)``,
        ``[grid_h*grid_w, embed_dim]`` or ``[1+grid_h*grid_w, embed_dim]``
        (w/o or w/ cls_token); otherwise ``[grid_h, grid_w, embed_dim]``.
    """
    if isinstance(grid_size, int):
        height = width = grid_size
    else:
        height, width = grid_size[0], grid_size[1]

    rows = np.arange(height, dtype=np.float32)
    cols = np.arange(width, dtype=np.float32)
    # meshgrid receives the width axis first.
    grid = np.stack(np.meshgrid(cols, rows), axis=0)

    flat_layout = version == (2, 0)
    if flat_layout:
        grid = grid.reshape([2, 1, height, width])

    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version)

    if flat_layout and cls_token:
        cls_row = np.zeros([1, embed_dim])
        pos_embed = np.concatenate([cls_row, pos_embed], axis=0)
    return pos_embed
def get_2d_sincos_pos_embed_from_grid(
        embed_dim: int,
        grid: np.ndarray,
        version: Tuple[int, int] = (2, 0)):
    """Encode the two planes of a coordinate grid with sin/cos embeddings.

    Each of ``grid[0]`` and ``grid[1]`` is encoded with ``embed_dim // 2``
    channels and the two encodings are concatenated (``grid[0]`` first).

    Args:
        embed_dim: Total embedding dimension; must be even.
        grid: Array whose first axis indexes the two coordinate planes.
        version: ``(2, 0)`` yields ``(H*W, D)``; otherwise ``(H, W, D)``.

    Returns:
        The concatenated embedding array.
    """
    assert embed_dim % 2 == 0

    # Half of the channels go to each coordinate plane.
    half_dim = embed_dim // 2
    emb_first = get_1d_sincos_pos_embed_from_grid(half_dim, grid[0], version)
    emb_second = get_1d_sincos_pos_embed_from_grid(half_dim, grid[1], version)

    # (H*W, D) for the flattened layout, (H, W, D) otherwise.
    concat_axis = 1 if version == (2, 0) else -1
    return np.concatenate([emb_first, emb_second], axis=concat_axis)
def get_1d_sincos_pos_embed_from_grid(
        embed_dim: int,
        pos: np.ndarray,
        version: Tuple[int, int] = (2, 0)):
    """Encode positions with sine/cosine features.

    Args:
        embed_dim: Output dimension for each position; must be even.
        pos: Positions to encode. Flattened to ``(M,)`` when
            ``version == (2, 0)``; otherwise expected to be ``(H, W)``.
        version: Selects the output layout.

    Returns:
        ``(M, D)`` for ``version == (2, 0)``, else ``(H, W, D)``. The first
        ``D/2`` channels hold the sines, the last ``D/2`` the cosines.
    """
    assert embed_dim % 2 == 0

    # Frequencies 1 / 10000^(2i/D), shape (D/2,).
    omega = np.arange(embed_dim // 2, dtype=np.float32)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000**omega

    if version == (2, 0):
        pos = pos.reshape(-1)  # (M,)
        subscripts = "m,d->md"
    else:
        subscripts = "hw,d->hwd"

    # Outer product of positions and frequencies.
    angles = np.einsum(subscripts, pos, omega)
    return np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)
class BaseResampler(nn.Module):
    """Shared building blocks of the 2D perceiver-resampler variants.

    Holds ``num_queries`` learnable queries, an optional key/value
    projection, one ``nn.MultiheadAttention`` layer, three normalization
    layers and a square output projection matrix. ``forward`` is not
    defined here.
    """

    def __init__(
        self,
        num_queries: int,
        embed_dim: int,
        num_heads: int,
        kv_dim: Optional[int] = None,
        norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
    ) -> None:
        """Create the queries, projections, attention and norm layers.

        Args:
            num_queries: Number of learnable query vectors.
            embed_dim: Width of queries and attention.
            num_heads: Number of attention heads.
            kv_dim: Width of the incoming features; a projection to
                ``embed_dim`` is created only when it differs.
            norm_layer: Factory called with ``embed_dim`` to build each
                normalization layer.
        """
        super().__init__()

        self.num_queries = num_queries
        self.embed_dim = embed_dim
        self.num_heads = num_heads

        # Learnable queries, truncated-normal initialized.
        self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
        trunc_normal_(self.query, std=0.02)

        needs_projection = kv_dim is not None and kv_dim != embed_dim
        if needs_projection:
            self.kv_proj = ReplicatedLinear(kv_dim, embed_dim, bias=False)
        else:
            # Maintain the same return value with ReplicatedLinear.forward
            self.kv_proj = lambda *args, **kwargs: (
                nn.Identity()(*args, **kwargs),
                None,
            )

        self.attn = nn.MultiheadAttention(embed_dim, num_heads)
        self.ln_q = norm_layer(embed_dim)
        self.ln_kv = norm_layer(embed_dim)
        self.ln_post = norm_layer(embed_dim)

        scale = embed_dim**-0.5
        self.proj = nn.Parameter(scale * torch.randn(embed_dim, embed_dim))

    def _init_weights(self, m: nn.Module) -> None:
        """Initialize ``nn.Linear`` and ``nn.LayerNorm`` submodules.

        Intended for use with ``nn.Module.apply``; other module types are
        left untouched.
        """
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
            return
        if not isinstance(m, nn.Linear):
            return
        trunc_normal_(m.weight, std=0.02)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

    def _repeat(self, query, N: int):
        """Insert a new axis at position 1 and tile it ``N`` times."""
        expanded = query.unsqueeze(1)
        return expanded.repeat(1, N, 1)
class Resampler2(BaseResampler):
    """Resampler with ``grid_size**2`` queries and a fixed sin/cos table.

    The position table is built with ``version=(2, 0)`` and stored as a
    non-trainable parameter.
    """

    def __init__(
        self,
        grid_size: int,
        embed_dim: int,
        num_heads: int,
        kv_dim: Optional[int] = None,
        norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
        adaptive: bool = False,
    ) -> None:
        """Build the base layers and the position-embedding table.

        Args:
            grid_size: Side of the query grid; ``grid_size**2`` queries.
            embed_dim: Width of queries and attention.
            num_heads: Number of attention heads.
            kv_dim: Width of the incoming features.
            norm_layer: Factory for the normalization layers.
            adaptive: If true, ``forward`` recomputes the key position
                embeddings from ``tgt_sizes`` instead of interpolating the
                stored table.
        """
        super().__init__(grid_size**2, embed_dim, num_heads, kv_dim,
                         norm_layer)

        self.adaptive = adaptive

        table = get_2d_sincos_pos_embed(embed_dim, grid_size, version=(2, 0))
        table_param = nn.Parameter(torch.from_numpy(table).float())
        self.pos_embed = table_param.requires_grad_(False)

        self.apply(self._init_weights)

    def forward(
        self,
        x: torch.Tensor,
        tgt_sizes: torch.Tensor,
        attn_mask: Optional[torch.Tensor] = None,
    ):
        """Cross-attend the learnable queries over the input features.

        Args:
            x: Input features; permuted with ``(1, 0, 2)`` after
                projection and normalization, so three axes are expected
                (assumed batch-first -- TODO confirm).
            tgt_sizes: Target grid size used to build the key position
                embeddings.
            attn_mask: Optional mask forwarded to the attention layer.

        Returns:
            The attended queries after post-norm and output projection.
        """
        if self.adaptive:
            table = get_2d_sincos_pos_embed(self.embed_dim,
                                            tgt_sizes,
                                            version=(2, 0))
            key_pos = torch.from_numpy(table).to(device=x.device,
                                                 dtype=x.dtype)
        else:
            key_pos = get_abs_pos(self.pos_embed, tgt_sizes)

        features, _ = self.kv_proj(x)
        features = self.ln_kv(features).permute(1, 0, 2)

        N = features.shape[1]
        normed_query = self.ln_q(self.query)

        # Queries use the stored table; keys use the (possibly resized) one.
        queries = self._repeat(normed_query, N) + self.pos_embed.unsqueeze(1)
        keys = features + key_pos.unsqueeze(1)
        attended = self.attn(queries, keys, features, attn_mask=attn_mask)[0]

        out = self.ln_post(attended.permute(1, 0, 2))
        return out @ self.proj
class Resampler2_5(BaseResampler):
    """Resampler that slices per-image position embeddings from a cache.

    A ``(max_h, max_w, D)`` sin/cos table (``version=(2, 5)``) is kept as a
    non-persistent buffer and grown on demand when a larger target size
    arrives.
    """

    def __init__(
        self,
        num_queries: int,
        embed_dim: int,
        num_heads: int,
        kv_dim: Optional[int] = None,
        norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN,
        max_size: Tuple[int, int] = (70, 70),
    ) -> None:
        """Build the base layers and the initial position cache.

        Args:
            num_queries: Number of learnable query vectors.
            embed_dim: Width of queries and attention.
            num_heads: Number of attention heads.
            kv_dim: Width of the incoming features.
            norm_layer: Factory for the normalization layers.
            max_size: Initial ``(height, width)`` of the position cache.
        """
        super().__init__(num_queries, embed_dim, num_heads, kv_dim,
                         norm_layer)

        self.max_size = max_size
        self._set_2d_pos_cache(self.max_size)

        self.apply(self._init_weights)

    def _set_2d_pos_cache(self,
                          max_size: Tuple[int, int],
                          device: torch.types.Device = "cpu") -> None:
        """(Re)build the ``pos_embed`` buffer for ``max_size`` on ``device``."""
        table = get_2d_sincos_pos_embed(self.embed_dim,
                                        max_size,
                                        version=(2, 5))
        cache = torch.from_numpy(table).float().to(device)
        self.register_buffer("pos_embed", cache, persistent=False)

    def _adjust_pos_cache(self, tgt_sizes: torch.Tensor,
                          device: torch.types.Device) -> None:
        """Grow the position cache if any target size exceeds it."""
        max_h = tgt_sizes[:, 0].max().item()
        max_w = tgt_sizes[:, 1].max().item()
        assert isinstance(max_h, int) and isinstance(max_w, int)

        fits = max_h <= self.max_size[0] and max_w <= self.max_size[1]
        if fits:
            return

        self.max_size = (
            max(max_h, self.max_size[0]),
            max(max_w, self.max_size[1]),
        )
        self._set_2d_pos_cache(self.max_size, device)

    def forward(self, x: torch.Tensor,
                tgt_sizes: torch.Tensor) -> torch.Tensor:
        """Cross-attend the learnable queries over padded patch features.

        Args:
            x: Patch features, ``B * L * D`` after the kv projection.
            tgt_sizes: Per-image ``(height, width)`` patch grid sizes, one
                row per batch element.

        Returns:
            Tensor laid out as ``B * Q * D`` after post-norm and projection.
        """
        assert x.shape[0] == tgt_sizes.shape[0]
        batch = x.shape[0]
        device, dtype = x.device, x.dtype

        patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]

        self._adjust_pos_cache(tgt_sizes, device=device)

        max_patch_len = patch_len.max().item()
        assert isinstance(max_patch_len, int)

        # True marks padded key positions beyond each image's patch count.
        key_padding_mask = torch.zeros((batch, max_patch_len),
                                       dtype=torch.bool,
                                       device=device)

        per_image_pos = []
        for idx in range(batch):
            rows, cols = tgt_sizes[idx].tolist()
            cached = self.pos_embed[:rows, :cols, :]
            # patches * D
            per_image_pos.append(
                cached.reshape((rows * cols, -1)).to(dtype))
            key_padding_mask[idx, patch_len[idx]:] = True

        padded_pos = torch.nn.utils.rnn.pad_sequence(per_image_pos,
                                                     batch_first=True,
                                                     padding_value=0.0)
        pos_embed = padded_pos.permute(1, 0, 2)  # BLD => L * B * D

        features, _ = self.kv_proj(x)  # B * L * D
        features = self.ln_kv(features).permute(1, 0, 2)  # L * B * D

        normed_query = self.ln_q(self.query)  # Q * D

        attended = self.attn(
            self._repeat(normed_query, batch),  # Q * B * D
            features + pos_embed,  # L * B * D + L * B * D
            features,
            key_padding_mask=key_padding_mask,
        )[0]
        # attended: Q * B * D
        out = attended.permute(1, 0, 2)  # B * Q * D

        out = self.ln_post(out)
        return out @ self.proj
def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]:
    """Return the MiniCPM-V version of ``config`` as a tuple of ints.

    When ``config.version`` is set, its string form is split on ``"."``
    and each part converted to ``int``. Otherwise the version is guessed
    from ``hidden_size`` and ``query_num``.
    """
    raw_version = getattr(config, "version", None)
    if raw_version is not None:
        return tuple(int(part) for part in str(raw_version).split("."))

    # The old configs do not include version number
    # TODO: Remove this after the HF repos are updated
    looks_like_v2_0 = config.hidden_size == 2304 and config.query_num == 64
    return (2, 0) if looks_like_v2_0 else (2, 5)
def get_max_minicpmv_image_tokens(ctx: InputContext):
    """Return the config's ``query_num``, or 64 when it is not set."""
    return getattr(ctx.get_hf_config(PretrainedConfig), "query_num", 64)
def dummy_seq_data_for_minicpmv(seq_len: int):
    """Build placeholder sequence data made of ``seq_len`` zero token ids."""
    return SequenceData([0] * seq_len)
def dummy_image_for_minicpmv(hf_config: PretrainedConfig):
    """Return ``{"image": img}`` where ``img`` is a square RGB image filled
    with colour 0 whose side is ``hf_config.image_size``."""
    side = hf_config.image_size
    blank = Image.new("RGB", (side, side), color=0)
    return {"image": blank}
def dummy_data_for_minicpmv(ctx: InputContext, seq_len: int):
    """Return the ``(sequence data, multi-modal data)`` dummy pair built by
    the two dummy helpers for a prompt of ``seq_len`` tokens."""
    hf_config = ctx.get_hf_config(PretrainedConfig)
    return (dummy_seq_data_for_minicpmv(seq_len),
            dummy_image_for_minicpmv(hf_config))
def input_processor_for_minicpmv(ctx: InputContext, llm_inputs: LLMInputs):
    """Expand image tags in the prompt into the image processor's
    slice placeholders.

    Inputs without ``multi_modal_data["image"]`` are returned untouched.
    Otherwise each ``(<image>./</image>)`` tag in the prompt is replaced,
    in order, by the placeholder string for the corresponding image and
    the prompt is re-tokenized. A prompt without any tag keeps its text
    and its original token ids.

    Returns:
        A new ``LLMInputs`` carrying the (possibly rewritten) prompt, its
        token ids and the original multi-modal data.
    """
    multi_modal_data = llm_inputs.get("multi_modal_data")
    if multi_modal_data is None or "image" not in multi_modal_data:
        return llm_inputs

    model_config = ctx.model_config
    version = get_version_by_config(model_config.hf_config)
    tokenizer = cached_get_tokenizer(model_config.tokenizer,
                                     trust_remote_code=True)
    image_processor = cached_get_image_processor(model_config.tokenizer)

    def get_placeholder(image_size: Tuple[int, int], num_image: int):
        # Versions 2.0 and 2.5 call the placeholder helper with the image
        # size only; other versions also pass the image index.
        if version == (2, 0) or version == (2, 5):
            return image_processor. \
                get_slice_image_placeholder(image_size)
        return image_processor. \
            get_slice_image_placeholder(image_size, num_image)

    # Read the token ids unconditionally: they are needed both to decode a
    # missing prompt and to pass through unchanged when the prompt has no
    # image tag. Binding them only when the prompt was missing left the
    # name undefined for text prompts without tags.
    token_ids = llm_inputs.get("prompt_token_ids")
    prompt = llm_inputs.get("prompt")
    if prompt is None:
        prompt = tokenizer.decode(token_ids)

    pattern = "(<image>./</image>)"
    images = multi_modal_data["image"]
    if isinstance(images, Image.Image):
        images = [images]
    image_tags = re.findall(pattern, prompt)

    if len(image_tags) == 0:
        new_token_ids = token_ids
        new_prompt = prompt
    else:
        text_chunks = prompt.split(pattern)
        new_prompt_chunks: List[str] = []
        for i, image in enumerate(images):
            new_prompt_chunks += [
                text_chunks[i],
                get_placeholder(image.size, i)
            ]
        new_prompt_chunks.append(text_chunks[-1])
        new_prompt = "".join(new_prompt_chunks)
        new_token_ids = tokenizer.encode(new_prompt)

    llm_inputs = LLMInputs(
        prompt_token_ids=new_token_ids,
        prompt=new_prompt,
        multi_modal_data=multi_modal_data,
    )
    return llm_inputs
class MiniCPMVBaseModel(nn.Module, SupportsVision):
    """
    Shared scaffolding of the MiniCPM-V models.

    The ``init_*``, ``get_vision_*`` and ``is_default_weight_loading`` hooks
    only raise ``NotImplementedError`` here, so this class is meant to be
    subclassed rather than instantiated directly.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        multimodal_config: MultiModalConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        super().__init__()
        self.config = config
        self.multimodal_config = multimodal_config

        self.version = get_version_by_config(self.config)
        self.llm = self.init_llm(config, cache_config, quant_config)
        self.vpm = self.init_vision_module()
        param_dtype = torch.get_default_dtype()
        self.vpm.to(dtype=param_dtype)

        # Version 2.0 reads the embedding width from the vision module
        # itself; other versions read it from its ``embeddings`` submodule.
        if self.version == (2, 0):
            self.vision_dim = self.vpm.embed_dim
        else:
            self.vision_dim = self.vpm.embeddings.embed_dim
        self.embed_dim = self.config.hidden_size

        self.resampler = self.init_resampler(self.embed_dim, self.vision_dim)
        self.resampler.to(device="cuda", dtype=param_dtype)

        self.lm_head = ParallelLMHead(config.vocab_size,
                                      config.hidden_size,
                                      quant_config=quant_config)
        self.logits_processor = LogitsProcessor(config.vocab_size)
        self.sampler = Sampler()

    def get_embedding(
        self,
        input_ids: torch.Tensor,
        image_inputs: Optional[MiniCPMVImageInputs],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Embed ``input_ids`` and write the vision features into the rows
        listed by ``image_inputs["image_bounds"]``.

        Returns the text embedding (modified in place) together with the
        vision hidden states; the latter is an empty tensor when there is
        no image input.
        """
        vlm_embedding: torch.Tensor = self.llm.embed_tokens(input_ids)
        if hasattr(self.config, "scale_emb"):
            vlm_embedding *= self.config.scale_emb

        if image_inputs is None:  # No image
            empty = torch.tensor([], device=input_ids.device)
            return vlm_embedding, empty

        vision_hidden_states = self.get_vision_hidden_states(image_inputs)

        # See NOTE in _parse_and_validate_inputs
        image_bounds = image_inputs["image_bounds"]
        if len(image_bounds) > 0:
            spans = [
                torch.arange(start, end, dtype=torch.long)
                for start, end in image_bounds.tolist()
            ]
            image_indices = torch.stack(spans).to(vlm_embedding.device)

            hidden_width = vlm_embedding.shape[-1]
            row_index = image_indices.view(-1, 1).repeat(1, hidden_width)
            flat_vision = vision_hidden_states.view(
                -1, vision_hidden_states.shape[-1])
            vlm_embedding.scatter_(0, row_index, flat_vision)

        return vlm_embedding, vision_hidden_states

    def _get_image_bounds(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Pair up image/slice start and end token positions.

        Each returned row is ``(start + 1, end)``: the position right after
        a start token and the position of the matching end token. Returns a
        ``(0, 2)`` tensor when no such tokens are present.
        """
        tokenizer = cached_get_tokenizer(self.config._name_or_path,
                                         trust_remote_code=True)
        is_start = input_ids == tokenizer.im_start_id
        is_end = input_ids == tokenizer.im_end_id
        if hasattr(tokenizer, "slice_start_id"):
            is_start = is_start | (input_ids == tokenizer.slice_start_id)
            is_end = is_end | (input_ids == tokenizer.slice_end_id)

        (start_positions, ) = torch.where(is_start)
        start_positions = start_positions + 1
        (end_positions, ) = torch.where(is_end)

        num_spans = max(len(start_positions), len(end_positions))
        if num_spans == 0:
            return torch.zeros((0, 2), device=input_ids.device)

        starts = start_positions[:num_spans].unsqueeze(-1)
        ends = end_positions[:num_spans].unsqueeze(-1)
        return torch.hstack([starts, ends])

    def _parse_and_validate_inputs(
        self,
        input_ids: torch.Tensor,
        **kwargs: object,
    ) -> Optional[MiniCPMVImageInputs]:
        """Pop and validate ``pixel_values`` / ``tgt_sizes`` from ``kwargs``.

        Both are flattened by one level. Returns ``None`` when no image
        data is left after flattening.

        Raises:
            ValueError: on a wrong container type or mismatching lengths.
        """
        pixel_values = kwargs.pop("pixel_values", [])
        tgt_sizes = kwargs.pop("tgt_sizes", [])

        accepted = (torch.Tensor, list)
        if not isinstance(pixel_values, accepted):
            raise ValueError("Incorrect type of pixel values. "
                             f"Got type: {type(pixel_values)}")

        if not isinstance(tgt_sizes, accepted):
            raise ValueError("Incorrect type of target sizes. "
                             f"Got type: {type(tgt_sizes)}")

        if len(pixel_values) != len(tgt_sizes):
            raise ValueError("Inconsistent batch lengths, found: "
                             f"{len(pixel_values)} vs. {len(tgt_sizes)}")

        pixel_values_flat: List[torch.Tensor] = []
        tgt_sizes_flat: List[torch.Tensor] = []
        for request_pixels, request_sizes in zip(pixel_values, tgt_sizes):
            pixel_values_flat.extend(request_pixels)
            tgt_sizes_flat.extend(request_sizes)

        # NOTE: Input IDs does not contain image tokens during memory
        # profiling, so we allow it to be empty
        if len(pixel_values_flat) != len(tgt_sizes_flat):
            raise ValueError("Inconsistent flattened lengths, found: "
                             f"{len(pixel_values_flat)} vs. "
                             f"{len(tgt_sizes_flat)}")

        if len(pixel_values_flat) == 0:
            return None

        return MiniCPMVImageInputs(
            image_bounds=self._get_image_bounds(input_ids),
            pixel_values=pixel_values_flat,
            tgt_sizes=torch.stack(tgt_sizes_flat),
        )

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        **kwargs: Any,
    ) -> torch.Tensor:
        """Run the language model on embeddings that already contain the
        vision features."""
        image_inputs = self._parse_and_validate_inputs(input_ids, **kwargs)
        vlm_embeddings, _ = self.get_embedding(input_ids, image_inputs)

        return self.llm(
            input_ids=None,
            positions=positions,
            kv_caches=kv_caches,
            attn_metadata=attn_metadata,
            intermediate_tensors=intermediate_tensors,
            inputs_embeds=vlm_embeddings,
        )

    def compute_logits(self, hidden_states: torch.Tensor,
                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
        """Apply the logits processor with the LM head to the hidden
        states."""
        return self.logits_processor(self.lm_head, hidden_states,
                                     sampling_metadata)

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        """Run the sampler on ``logits``."""
        return self.sampler(logits, sampling_metadata)

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        """Load checkpoint tensors into this module's parameters.

        Names are first rewritten through ``_KEYS_TO_MODIFY_MAPPING``.
        Weights matching a stacked-parameter rule are loaded shard by shard
        unless ``is_default_weight_loading`` claims the name; everything
        else goes through the parameter's own loader or the default one.
        """
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]
        # Rotary caches are skipped. Models trained using ColossalAI may
        # include the cos/sin tensors in the checkpoint.
        skipped_markers = (
            "rotary_emb.inv_freq",
            "rotary_emb.cos_cached",
            "rotary_emb.sin_cached",
        )
        params_dict = dict(self.named_parameters())

        for name, loaded_weight in weights:
            for old_key, new_key in _KEYS_TO_MODIFY_MAPPING.items():
                if old_key in name:
                    name = name.replace(old_key, new_key)

            if any(marker in name for marker in skipped_markers):
                continue

            shard_rule = None
            if not self.is_default_weight_loading(name):
                # First rule whose shard name occurs in the weight name.
                shard_rule = next(
                    (rule
                     for rule in stacked_params_mapping if rule[1] in name),
                    None)

            if shard_rule is not None:
                param_name, weight_name, shard_id = shard_rule
                param = params_dict[name.replace(weight_name, param_name)]
                param.weight_loader(param, loaded_weight, shard_id)
                continue

            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)

    def init_llm(
        self,
        config: PretrainedConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> nn.Module:
        """Hook: build the language model. Must be overridden."""
        raise NotImplementedError

    def init_vision_module(self) -> nn.Module:
        """Hook: build the vision module. Must be overridden."""
        raise NotImplementedError

    def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module:
        """Hook: build the resampler. Must be overridden."""
        raise NotImplementedError

    def get_vision_embedding(
        self,
        pixel_values: List[torch.Tensor],
        patch_attn_mask: Optional[torch.Tensor] = None,
        tgt_sizes: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Hook: compute vision embeddings. Must be overridden."""
        raise NotImplementedError

    def get_vision_hidden_states(self,
                                 data: MiniCPMVImageInputs) -> torch.Tensor:
        """Hook: turn parsed image inputs into vision hidden states. Must
        be overridden."""
        raise NotImplementedError

    def is_default_weight_loading(self, name: str) -> bool:
        """Hook: tell ``load_weights`` whether ``name`` bypasses the
        stacked-parameter rules. Must be overridden."""
        raise NotImplementedError
class MiniCPMV2(MiniCPMVBaseModel):
    """MiniCPM-V variant for version ``(2, 0)``: a ``MiniCPMModel``
    language model, a timm vision transformer and a ``Resampler2``."""

    def __init__(
        self,
        config: PretrainedConfig,
        multimodal_config: MultiModalConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        super().__init__(config, multimodal_config, cache_config, quant_config)
        assert self.version == (2, 0)

    def init_llm(
        self,
        config: PretrainedConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> nn.Module:
        """Build the ``MiniCPMModel`` language model."""
        return MiniCPMModel(config,
                            cache_config=cache_config,
                            quant_config=quant_config)

    def init_vision_module(self) -> nn.Module:
        """Create the timm vision transformer (no pretrained weights).

        Raises:
            ImportError: if ``timm`` is not installed.
        """
        # TODO :refactor this vision model
        try:
            import timm
        except ImportError as exc:
            # Chain from the caught exception (not the ImportError class)
            # so the original failure reason stays in the traceback.
            raise ImportError("Please install timm==0.9.10") from exc
        with set_default_torch_dtype(torch.float16):
            model = timm.create_model(
                "vit_so400m_patch14_siglip_384.webli",
                pretrained=False,
                num_classes=0,
                dynamic_img_size=True,
                dynamic_img_pad=True,
            )

        # Replace the attention-pooling head, if any, with a pass-through.
        if (isinstance(model, timm.models.VisionTransformer)
                and model.attn_pool is not None):
            model.attn_pool = torch.nn.Identity()

        if self.config.drop_vision_last_layer:
            model.blocks = model.blocks[:-1]

        return model

    def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module:
        """Build the ``Resampler2`` under a float16 default dtype."""
        with set_default_torch_dtype(torch.float16):
            resampler = Resampler2(
                embed_dim=embed_dim,
                num_heads=embed_dim // 128,
                grid_size=int(math.sqrt(self.config.query_num)),
                kv_dim=vision_dim,
                adaptive=True,
            )

        return resampler

    def get_vision_embedding(
        self,
        pixel_values: List[torch.Tensor],
        patch_attn_mask: Optional[torch.Tensor] = None,
        tgt_sizes: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Encode each image separately and resample it.

        ``patch_attn_mask`` and ``tgt_sizes`` are not used by this variant;
        the target grid is derived from each image's height and width.
        """
        res = []
        dtype = self.vpm.pos_embed.data.dtype
        for pixel_value in pixel_values:
            H, W = pixel_value[0].shape[-2:]
            tgt_size = (
                math.ceil(H / self.vpm.patch_embed.patch_size[0]),
                math.ceil(W / self.vpm.patch_embed.patch_size[0]),
            )
            vision_embedding = self.vpm.forward_features(
                pixel_value.unsqueeze(0).type(dtype))
            # Drop prefix tokens so only patch tokens reach the resampler.
            if (hasattr(self.vpm, "num_prefix_tokens")
                    and self.vpm.num_prefix_tokens > 0):
                vision_embedding = vision_embedding[:, self.vpm.
                                                    num_prefix_tokens:]
            res.append(self.resampler(vision_embedding, tgt_size))
        return torch.vstack(res)

    def get_vision_hidden_states(self,
                                 data: MiniCPMVImageInputs) -> torch.Tensor:
        """Return the resampled vision features for ``data``'s images."""
        pixel_values = data["pixel_values"]

        return self.get_vision_embedding(pixel_values)

    def is_default_weight_loading(self, name: str) -> bool:
        """Resampler and vision-module weights bypass the stacked-parameter
        rules."""
        return "resampler" in name or "vpm" in name
class MiniCPMV2_5(MiniCPMVBaseModel):
    """MiniCPM-V variant for version ``(2, 5)``: a ``LlamaModel`` language
    model, an ``Idefics2VisionTransformer`` and a ``Resampler2_5``."""

    def __init__(
        self,
        config: PretrainedConfig,
        multimodal_config: MultiModalConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        super().__init__(config, multimodal_config, cache_config, quant_config)
        assert self.version == (2, 5)

    def init_llm(
        self,
        config: PretrainedConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> nn.Module:
        """Build the ``LlamaModel`` language model."""
        return LlamaModel(config,
                          cache_config=cache_config,
                          quant_config=quant_config)

    def init_vision_module(self) -> nn.Module:
        """Build the vision transformer, optionally dropping its last
        encoder layer."""
        model = Idefics2VisionTransformer(self.config.vision_config)
        if self.config.drop_vision_last_layer:
            model.encoder.layers = model.encoder.layers[:-1]
        return model

    def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module:
        """Build the ``Resampler2_5`` under a float16 default dtype."""
        with set_default_torch_dtype(torch.float16):
            resampler = Resampler2_5(
                num_queries=self.config.query_num,
                embed_dim=embed_dim,
                num_heads=embed_dim // 128,
                kv_dim=vision_dim,
            )
        return resampler

    def get_vision_embedding(
        self,
        pixel_values: List[torch.Tensor],
        patch_attn_mask: Optional[torch.Tensor] = None,
        tgt_sizes: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Run the vision module, then the resampler, on a padded batch."""
        vision_embedding = self.vpm(pixel_values,
                                    patch_attention_mask=patch_attn_mask)
        vision_embedding = self.resampler(vision_embedding, tgt_sizes)
        return vision_embedding

    def get_vision_hidden_states(self,
                                 data: MiniCPMVImageInputs) -> torch.Tensor:
        """Pad the per-image patch sequences into one batch, build the
        patch validity mask and return the resampled vision features."""
        pixel_values = data["pixel_values"]
        tgt_sizes = data["tgt_sizes"]

        device = self.vpm.embeddings.position_embedding.weight.device
        dtype = self.vpm.embeddings.position_embedding.weight.dtype
        all_pixel_values_lst = [
            i.flatten(end_dim=1).permute(1, 0) for i in pixel_values
        ]

        max_patches = (tgt_sizes[:, 0] * tgt_sizes[:, 1]).max().item()
        assert isinstance(max_patches, int)

        all_pixel_values = torch.nn.utils.rnn.pad_sequence(
            all_pixel_values_lst, batch_first=True, padding_value=0.0)
        B, L, _ = all_pixel_values.shape
        all_pixel_values = all_pixel_values.permute(0, 2,
                                                    1).reshape(B, 3, -1, L)

        patch_attn_mask = torch.zeros((B, 1, max_patches),
                                      dtype=torch.bool,
                                      device=device)
        for i in range(B):
            # Index the singleton middle dimension explicitly. Slicing it
            # (``mask[i, :n]``) would mark every patch, padding included,
            # as valid whenever n >= 1.
            patch_attn_mask[i, 0, :tgt_sizes[i][0] * tgt_sizes[i][1]] = True

        return self.get_vision_embedding(all_pixel_values.type(dtype),
                                         patch_attn_mask, tgt_sizes)

    def is_default_weight_loading(self, name: str) -> bool:
        """Only resampler weights bypass the stacked-parameter rules."""
        return "resampler" in name
# NOTE: Currently, information about this model is unavailable. We are
# temporarily using `MiniCPMVQwen2` as its name. The name may need
# to be modified in the future.
class MiniCPMVQwen2(MiniCPMVBaseModel):
    """MiniCPM-V variant built from a ``Qwen2Model`` language model, the
    custom ``SiglipVisionTransformer`` and a ``Resampler2_5``."""

    def __init__(
        self,
        config: PretrainedConfig,
        multimodal_config: MultiModalConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        super().__init__(config, multimodal_config, cache_config, quant_config)

    def init_llm(
        self,
        config: PretrainedConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> nn.Module:
        """Build the ``Qwen2Model`` language model."""
        return Qwen2Model(config,
                          cache_config=cache_config,
                          quant_config=quant_config)

    def init_vision_module(self) -> nn.Module:
        """Build the vision transformer, optionally dropping its last
        encoder layer. Also writes the chosen attention implementation
        into ``config.vision_config``."""
        # A custom version of SiglipVisionTransformer, won't work with TP
        from vllm.model_executor.models.na_vit import SiglipVisionTransformer

        # Anything other than flash attention 2 falls back to eager
        # (sdpa is not supported).
        wants_flash = (
            self.config._attn_implementation == "flash_attention_2")
        self.config.vision_config._attn_implementation = (
            "flash_attention_2" if wants_flash else "eager")

        vision_model = SiglipVisionTransformer(self.config.vision_config)
        if self.config.drop_vision_last_layer:
            vision_model.encoder.layers = vision_model.encoder.layers[:-1]
        return vision_model

    def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module:
        """Build the ``Resampler2_5`` under a float16 default dtype."""
        with set_default_torch_dtype(torch.float16):
            return Resampler2_5(
                num_queries=self.config.query_num,
                embed_dim=embed_dim,
                num_heads=embed_dim // 128,
                kv_dim=vision_dim,
            )

    def get_vision_embedding(
        self,
        pixel_values: List[torch.Tensor],
        patch_attn_mask: Optional[torch.Tensor] = None,
        tgt_sizes: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Return the vision module's ``last_hidden_state`` (the resampler
        is not applied here)."""
        vision_output = self.vpm(
            pixel_values,
            patch_attention_mask=patch_attn_mask,
            tgt_sizes=tgt_sizes,
        )
        return vision_output.last_hidden_state

    def get_vision_hidden_states(self,
                                 data: MiniCPMVImageInputs) -> torch.Tensor:
        """Pad the per-image patch sequences into one batch, mask the
        padding, encode and resample."""
        pixel_values = data["pixel_values"]
        tgt_sizes = data["tgt_sizes"]

        reference_weight = self.vpm.embeddings.position_embedding.weight
        device = reference_weight.device
        dtype = reference_weight.dtype

        patch_sequences = [
            image.flatten(end_dim=1).permute(1, 0) for image in pixel_values
        ]

        patch_counts = tgt_sizes[:, 0] * tgt_sizes[:, 1]
        max_patches = patch_counts.max().item()
        assert isinstance(max_patches, int)

        padded = torch.nn.utils.rnn.pad_sequence(patch_sequences,
                                                 batch_first=True,
                                                 padding_value=0.0)
        B, L, _ = padded.shape
        all_pixel_values = padded.permute(0, 2, 1).reshape(B, 3, -1, L)

        patch_attn_mask = torch.zeros((B, 1, max_patches),
                                      dtype=torch.bool,
                                      device=device)
        # Mark the leading, real patches of each of the B images as valid.
        for row in range(B):
            patch_attn_mask[row, 0, :patch_counts[row]] = True

        vision_embedding = self.get_vision_embedding(
            all_pixel_values.type(dtype), patch_attn_mask, tgt_sizes)

        return self.resampler(vision_embedding, tgt_sizes)

    def is_default_weight_loading(self, name: str) -> bool:
        """Resampler and vision-module weights bypass the stacked-parameter
        rules."""
        return "resampler" in name or "vpm" in name
@MULTIMODAL_REGISTRY.register_image_input_mapper()
@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_minicpmv_image_tokens)
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_minicpmv)
@INPUT_REGISTRY.register_input_processor(input_processor_for_minicpmv)
class MiniCPMV(MiniCPMVBaseModel):
    """
    Different versions of MiniCPMV use different visual encoders and LLMs,
    which is not conducive to the current integration logic of LoRA and
    bitsandbytes in vLLM. Therefore, it is necessary to separate them.
    """

    def __new__(
        cls,
        config: PretrainedConfig,
        multimodal_config: MultiModalConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        """Create and return the concrete model class for ``config``.

        The version comes from ``get_version_by_config`` so that dispatch
        and the rest of the module agree on it, including for configs
        whose ``version`` attribute is present but ``None``. Versions
        other than ``(2, 0)`` and ``(2, 5)`` map to ``MiniCPMVQwen2``.
        """
        version = get_version_by_config(config)

        # Dispatch class based on version
        if version == (2, 0):
            instance_class = MiniCPMV2
        elif version == (2, 5):
            instance_class = MiniCPMV2_5
        else:
            instance_class = MiniCPMVQwen2
        return instance_class(config, multimodal_config, cache_config,
                              quant_config)
vllm/model_executor/models/na_vit.py
0 → 100644
View file @
e661d594
import
logging
import
math
import
os
import
warnings
from
typing
import
Optional
,
Tuple
,
Union
import
numpy
as
np
import
torch
import
torch.nn.functional
as
F
from
torch
import
nn
from
torch.nn.init
import
_calculate_fan_in_and_fan_out
from
transformers.activations
import
ACT2FN
from
transformers.configuration_utils
import
PretrainedConfig
from
transformers.modeling_attn_mask_utils
import
_prepare_4d_attention_mask
from
transformers.modeling_outputs
import
(
BaseModelOutput
,
BaseModelOutputWithPooling
)
from
transformers.modeling_utils
import
PreTrainedModel
from
transformers.utils
import
(
ModelOutput
,
is_flash_attn_2_available
,
replace_return_docstrings
)
# Module logger, looked up under the fixed name "vllm" (not ``__name__``).
logger = logging.getLogger("vllm")
# For Siglip: copied from
# HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit, with tgt_sizes added.
# Hints are removed because this code is unlikely to change.
class SiglipVisionConfig(PretrainedConfig):
    """Configuration for the SigLIP vision model.

    Stores the constructor arguments as attributes of the same name; any
    extra keyword arguments are forwarded to ``PretrainedConfig``.
    """

    model_type = "siglip_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=16,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str,
                                                                  os.PathLike],
                        **kwargs) -> "PretrainedConfig":
        """Load a vision config from a pretrained model name or path.

        When the loaded dict has ``model_type == "siglip"``, its
        ``"vision_config"`` entry is used instead. A warning is logged if
        the resulting ``model_type`` differs from this class's.
        """
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(
            pretrained_model_name_or_path, **kwargs)

        # get the vision config dict if we are loading from SiglipConfig
        if config_dict.get("model_type") == "siglip":
            config_dict = config_dict["vision_config"]

        if "model_type" in config_dict and hasattr(
                cls,
                "model_type") and config_dict["model_type"] != cls.model_type:
            # A space was missing between "configurations" and "of", which
            # made the logged text read "configurationsof".
            logger.warning(
                "You are using a model of type %s to "
                "instantiate a model of type %s. "
                "This is not supported for all configurations "
                "of models and can yield errors.", config_dict['model_type'],
                cls.model_type)

        return cls.from_dict(config_dict, **kwargs)
# Checkpoint identifier string; no use of it appears in the visible part of
# this file.
_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"

# List of pretrained SigLIP checkpoint names (a single entry here).
SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/siglip-base-patch16-224",
    # See all SigLIP models at https://huggingface.co/models?filter=siglip
]
if
is_flash_attn_2_available
():
from
flash_attn
import
flash_attn_func
,
flash_attn_varlen_func
from
flash_attn.bert_padding
import
pad_input
# noqa
from
flash_attn.bert_padding
import
index_first_axis
,
unpad_input
# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
    """Summarize a padding mask for variable-length attention.

    Returns a tuple of: the flat indices of the non-zero mask entries, the
    cumulative per-row lengths with a leading zero (int32), and the largest
    per-row length as a Python number.
    """
    row_lengths = attention_mask.sum(dim=-1, dtype=torch.int32)
    flat_mask = attention_mask.flatten()
    kept_positions = torch.nonzero(flat_mask, as_tuple=False).flatten()
    longest_row = row_lengths.max().item()
    running_total = torch.cumsum(row_lengths, dim=0, dtype=torch.int32)
    # Prepend a zero so entry i is the offset at which row i starts.
    cu_seqlens = F.pad(running_total, (1, 0))
    return kept_positions, cu_seqlens, longest_row
def _trunc_normal_(tensor, mean, std, a, b):
    """Fill ``tensor`` in place with samples from a normal distribution
    ``N(mean, std**2)`` truncated to ``[a, b]``.

    For float16/bfloat16 tensors the sampling is done on a float32 working
    copy and written back with ``copy_``, so the caller's tensor receives
    the finished values. Rebinding the local name to converted copies left
    such tensors holding only the intermediate uniform samples.

    Returns ``None``; the result is written into ``tensor``.
    """

    def norm_cdf(x):
        # Computes standard normal cumulative distribution function
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2,
        )

    # Values are generated by using a truncated uniform distribution and
    # then using the inverse CDF for the normal distribution.
    # Get upper and lower cdf values
    l_ = norm_cdf((a - mean) / std)
    u = norm_cdf((b - mean) / std)

    # The `erfinv_` / `clamp_` ops are not defined for every half-precision
    # dtype/device combination, so those dtypes go through float32.
    needs_upcast = tensor.dtype in [torch.float16, torch.bfloat16]
    work = tensor.to(torch.float32) if needs_upcast else tensor

    # Uniformly fill tensor with values from [l, u], then translate to
    # [2l-1, 2u-1].
    work.uniform_(2 * l_ - 1, 2 * u - 1)

    # Use inverse cdf transform for normal distribution to get truncated
    # standard normal
    work.erfinv_()

    # Transform to proper mean, std
    work.mul_(std * math.sqrt(2.0))
    work.add_(mean)

    # Clamp to ensure it's in the proper range
    work.clamp_(min=a, max=b)

    if needs_upcast:
        # Write the finished values back into the caller's tensor.
        tensor.copy_(work)
def trunc_normal_tf_(tensor: torch.Tensor,
                     mean: float = 0.0,
                     std: float = 1.0,
                     a: float = -2.0,
                     b: float = 2.0) -> torch.Tensor:
    """Fill ``tensor`` in place from a truncated normal distribution.

    Samples are drawn from a standard normal truncated to ``[a, b]`` and
    only afterwards scaled by ``std`` and shifted by ``mean``, so the
    bounds apply before scaling and shifting.

    Returns:
        The same ``tensor`` object, matching the declared return type
        (the function used to fall off the end and return ``None``).
    """
    with torch.no_grad():
        _trunc_normal_(tensor, 0, 1.0, a, b)
        tensor.mul_(std).add_(mean)
    return tensor
def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
    """Initialize ``tensor`` in place with variance ``scale / denom``.

    Args:
        tensor: tensor to fill; its fan-in/fan-out are computed with
            ``_calculate_fan_in_and_fan_out``.
        scale: numerator of the target variance.
        mode: ``"fan_in"``, ``"fan_out"`` or ``"fan_avg"``; selects the
            denominator.
        distribution: ``"truncated_normal"``, ``"normal"`` or ``"uniform"``.

    Raises:
        ValueError: if ``mode`` or ``distribution`` is not one of the
            supported values.
    """
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    if mode == "fan_in":
        denom = fan_in
    elif mode == "fan_out":
        denom = fan_out
    elif mode == "fan_avg":
        denom = (fan_in + fan_out) / 2
    else:
        # Without this branch an unknown mode left ``denom`` unbound and
        # failed below with an unrelated UnboundLocalError.
        raise ValueError(f"invalid mode {mode}")

    variance = scale / denom

    if distribution == "truncated_normal":
        # constant is stddev of standard normal truncated to (-2, 2)
        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
    elif distribution == "normal":
        with torch.no_grad():
            tensor.normal_(std=math.sqrt(variance))
    elif distribution == "uniform":
        # A uniform distribution on [-bound, bound] has variance bound^2 / 3.
        bound = math.sqrt(3 * variance)
        with torch.no_grad():
            tensor.uniform_(-bound, bound)
    else:
        raise ValueError(f"invalid distribution {distribution}")
def lecun_normal_(tensor):
    """Initialize ``tensor`` in place via ``variance_scaling_`` using
    fan-in scaling and a truncated normal distribution."""
    variance_scaling_(tensor,
                      distribution="truncated_normal",
                      mode="fan_in")
def default_flax_embed_init(tensor):
    """Initialize ``tensor`` in place via ``variance_scaling_`` using
    fan-in scaling and a normal distribution."""
    variance_scaling_(tensor, distribution="normal", mode="fan_in")
class SiglipVisionModelOutput(ModelOutput):
    """Output container for the SigLIP vision model.

    All four fields default to ``None``.
    """
    # Image embeddings — NOTE(review): presumably the pooled output; confirm
    # against the code that fills this container.
    image_embeds: Optional[torch.FloatTensor] = None
    # Hidden states of the last layer (name-based; confirm against producer).
    last_hidden_state: torch.FloatTensor = None
    # Optional tuple of per-layer hidden states.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Optional tuple of per-layer attention tensors.
    attentions: Optional[Tuple[torch.FloatTensor]] = None
class SiglipVisionEmbeddings(nn.Module):
    """Patch embedding plus learned position embedding.

    Position ids for each image are obtained by bucketing the fractional
    coordinates of its patch grid into the ``num_patches_per_side`` x
    ``num_patches_per_side`` grid of learned positions.
    """

    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # Non-overlapping patches: stride equals the kernel size.
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )

        self.num_patches_per_side = self.image_size // self.patch_size
        self.num_patches = self.num_patches_per_side**2
        self.num_positions = self.num_patches
        self.position_embedding = nn.Embedding(self.num_positions,
                                               self.embed_dim)

    def forward(self,
                pixel_values: torch.FloatTensor,
                patch_attention_mask: torch.BoolTensor,
                tgt_sizes: Optional[torch.IntTensor] = None) -> torch.Tensor:
        """Embed ``pixel_values`` and add the position embeddings.

        The patch grid of each image is taken from ``tgt_sizes`` when
        given, otherwise counted from ``patch_attention_mask``. Positions
        not selected by the mask keep position id 0.
        """
        batch_size = pixel_values.size(0)

        patch_embeds = self.patch_embedding(pixel_values)
        embeddings = patch_embeds.flatten(2).transpose(1, 2)

        max_nb_patches_h = pixel_values.size(2) // self.patch_size
        max_nb_patches_w = pixel_values.size(3) // self.patch_size

        side = self.num_patches_per_side
        boundaries = torch.arange(1 / side, 1.0, 1 / side)

        def bucketed_coords(count):
            # Evenly spaced fractions in [0, 1) mapped to position buckets.
            fractional = torch.arange(0, 1 - 1e-6, 1 / count)
            return torch.bucketize(fractional, boundaries, right=True)

        position_ids = torch.full(
            size=(batch_size, max_nb_patches_h * max_nb_patches_w),
            fill_value=0,
        )

        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
            if tgt_sizes is None:
                nb_patches_h = p_attn_mask[:, 0].sum()
                nb_patches_w = p_attn_mask[0].sum()
            else:
                nb_patches_h = tgt_sizes[batch_idx][0]
                nb_patches_w = tgt_sizes[batch_idx][1]

            rows = bucketed_coords(nb_patches_h)
            cols = bucketed_coords(nb_patches_w)

            # Row-major index into the side x side position table.
            pos_ids = (rows[:, None] * side + cols).flatten()
            valid = p_attn_mask.view(-1).cpu()
            position_ids[batch_idx][valid] = pos_ids

        position_ids = position_ids.to(self.position_embedding.weight.device)

        return embeddings + self.position_embedding(position_ids)
class SiglipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper.

    Eager (matmul + softmax) self-attention. The hidden size is split evenly
    across ``config.num_attention_heads`` heads and queries are scaled by
    ``head_dim ** -0.5``.
    """

    # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
    def __init__(self, config):
        """Build the q/k/v/out projections from ``config``.

        Reads ``config.hidden_size``, ``config.num_attention_heads`` and
        ``config.attention_dropout``.

        Raises:
            ValueError: if ``hidden_size`` is not divisible by
                ``num_attention_heads``.
        """
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                "embed_dim must be divisible by num_heads (got `embed_dim`: "
                f"{self.embed_dim} and `num_heads`:"
                f"{self.num_heads}).")
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel

        Args:
            hidden_states: tensor of shape ``(batch, q_len, embed_dim)``.
            attention_mask: optional additive mask of shape
                ``(batch, 1, q_len, q_len)``; it is added to the raw
                attention scores before the softmax.
            output_attentions: accepted for interface compatibility; the
                attention weights are returned regardless of its value.

        Returns:
            ``(attn_output, attn_weights)`` where ``attn_output`` has shape
            ``(batch, q_len, embed_dim)`` and ``attn_weights`` has shape
            ``(batch, num_heads, q_len, q_len)``.

        Raises:
            ValueError: if the scores, the mask or the attention output do
                not have the expected shape.
        """
        batch_size, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # (batch, q_len, embed_dim) -> (batch, num_heads, q_len, head_dim)
        query_states = query_states.view(batch_size, q_len, self.num_heads,
                                         self.head_dim).transpose(1, 2)
        key_states = key_states.view(batch_size, q_len, self.num_heads,
                                     self.head_dim).transpose(1, 2)
        value_states = value_states.view(batch_size, q_len, self.num_heads,
                                         self.head_dim).transpose(1, 2)

        k_v_seq_len = key_states.shape[-2]
        attn_weights = torch.matmul(query_states, key_states.transpose(
            2, 3)) * self.scale

        expected_weights = (batch_size, self.num_heads, q_len, k_v_seq_len)
        if attn_weights.size() != expected_weights:
            raise ValueError("Attention weights should be of size "
                             f"{expected_weights}, but is "
                             f"{attn_weights.size()}")

        if attention_mask is not None:
            expected_mask = (batch_size, 1, q_len, k_v_seq_len)
            if attention_mask.size() != expected_mask:
                # A single message string: a stray comma here would turn the
                # two fragments into separate exception args.
                raise ValueError("Attention mask should be of size "
                                 f"{expected_mask}, "
                                 f"but is {attention_mask.size()}")
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights,
                                             dim=-1,
                                             dtype=torch.float32).to(
                                                 query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights,
                                             p=self.dropout,
                                             training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        expected_output = (batch_size, self.num_heads, q_len, self.head_dim)
        if attn_output.size() != expected_output:
            raise ValueError("`attn_output` should be of size "
                             f"{expected_output}, "
                             f"but is {attn_output.size()}")

        # (batch, num_heads, q_len, head_dim) -> (batch, q_len, embed_dim)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights
class SiglipFlashAttention2(SiglipAttention):
    """Variant of ``SiglipAttention`` that dispatches to flash-attention.

    Reuses the projections created by the parent ``__init__`` and replaces
    the attention computation with ``flash_attn_func`` (no mask) or
    ``flash_attn_varlen_func`` (mask given, padding removed first).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.is_causal = False  # Hack to make sure we don't use a causal mask

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor],
               Optional[Tuple[torch.Tensor]]]:
        """Run self-attention over ``hidden_states`` with flash-attention.

        ``position_ids``, ``use_cache`` and ``**kwargs`` are accepted but not
        read in this method. ``output_attentions`` is overwritten with
        ``False``, so the second returned element is always ``None``.

        Returns:
            ``(attn_output, None)`` with ``attn_output`` reshaped to
            ``(bsz, q_len, embed_dim)`` and passed through ``out_proj``.
        """
        # Attention weights are never produced on this path.
        output_attentions = False

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # (bsz, q_len, embed_dim) -> (bsz, num_heads, q_len, head_dim)
        query_states = query_states.view(bsz, q_len, self.num_heads,
                                         self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_heads,
                                     self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_heads,
                                         self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            # NOTE(review): `self.layer_idx` is not assigned by the
            # `__init__` methods visible here; this branch presumably is
            # never taken for the vision tower - confirm.
            # `kv_seq_len` is not read again after this point.
            kv_seq_len += past_key_value.get_usable_length(
                kv_seq_len, self.layer_idx)

        # Transpose back to (bsz, q_len, num_heads, head_dim) before the
        # flash-attention call.
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        # Dropout is applied only in training mode.
        dropout_rate = self.dropout if self.training else 0.0

        # If the inputs ended up in float32, cast them to a target dtype
        # chosen from autocast, the pre-quantization dtype, or the dtype of
        # the q_proj weights, in that order.
        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning(
                "The input hidden states seems to be "
                "silently casted in float32, "
                "this might be related to the fact "
                "you have upcasted embedding or layer norm layers in float32. "
                "We will cast back the input in"
                " %s.", target_dtype)

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output = self._flash_attention_forward(query_states,
                                                    key_states,
                                                    value_states,
                                                    attention_mask,
                                                    q_len,
                                                    dropout=dropout_rate)

        attn_output = attn_output.reshape(bsz, q_len,
                                          self.embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        # Always true here because `output_attentions` was forced to False.
        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights

    def _flash_attention_forward(self,
                                 query_states,
                                 key_states,
                                 value_states,
                                 attention_mask,
                                 query_length,
                                 dropout=0.0,
                                 softmax_scale=None):
        """Call the flash-attention kernel, unpadding first if masked.

        With ``attention_mask`` set, the inputs are unpadded through
        ``_upad_input``, run through ``flash_attn_varlen_func`` and re-padded
        with ``pad_input``. Without a mask ``flash_attn_func`` is called
        directly on the inputs.
        """
        # `is_causal` is set to False in `__init__`, so this is False here.
        causal = self.is_causal and query_length != 1

        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            batch_size = query_states.shape[0]
            (query_states, key_states, value_states, indices_q, cu_seq_lens,
             max_seq_lens) = self._upad_input(query_states, key_states,
                                              value_states, attention_mask,
                                              query_length)

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

            attn_output = pad_input(attn_output_unpad, indices_q, batch_size,
                                    query_length)
        else:
            attn_output = flash_attn_func(query_states,
                                          key_states,
                                          value_states,
                                          dropout,
                                          softmax_scale=softmax_scale,
                                          causal=causal)

        return attn_output

    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask,
                    query_length):
        """Strip padded positions from q/k/v according to ``attention_mask``.

        Returns:
            ``(query_layer, key_layer, value_layer, indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k))``.
        """
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(
            attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        # Flatten batch and sequence dims, then keep the rows in `indices_k`.
        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads,
                              head_dim), indices_k)
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads,
                                head_dim), indices_k)
        if query_length == kv_seq_len:
            # Queries cover the same positions as keys: reuse the key-side
            # indices and sequence-length metadata.
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, self.num_heads,
                                    head_dim), indices_k)
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            # One query token per sequence.
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            (query_layer, indices_q, cu_seqlens_q,
             max_seqlen_in_batch_q) = unpad_input(query_layer, attention_mask)

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )
# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
class SiglipMLP(nn.Module):
    """Two-layer feed-forward block: ``fc1`` -> activation -> ``fc2``."""

    def __init__(self, config):
        """Create both linear layers and look up the activation by name.

        Reads ``config.hidden_act``, ``config.hidden_size`` and
        ``config.intermediate_size``.
        """
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        model_width = config.hidden_size
        inner_width = config.intermediate_size
        # Expand to the intermediate width, then project back.
        self.fc1 = nn.Linear(model_width, inner_width)
        self.fc2 = nn.Linear(inner_width, model_width)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply ``fc2(activation_fn(fc1(hidden_states)))``."""
        expanded = self.fc1(hidden_states)
        activated = self.activation_fn(expanded)
        return self.fc2(activated)
# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer
# with CLIP->Siglip
class SiglipEncoderLayer(nn.Module):
    """Pre-norm transformer block: attention and MLP, each with a residual."""

    def __init__(self, config: SiglipVisionConfig):
        """Build the attention module, the MLP and both layer norms.

        The attention class is picked from ``config._attn_implementation``:
        ``SiglipFlashAttention2`` for ``"flash_attention_2"``, otherwise
        ``SiglipAttention``.
        """
        super().__init__()
        self.embed_dim = config.hidden_size
        self._use_flash_attention_2 = (
            config._attn_implementation == "flash_attention_2")
        if self._use_flash_attention_2:
            self.self_attn = SiglipFlashAttention2(config)
        else:
            self.self_attn = SiglipAttention(config)
        norm_eps = config.layer_norm_eps
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=norm_eps)
        self.mlp = SiglipMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """Run one encoder block.

        Returns:
            ``(hidden_states,)``, with the attention weights appended when
            ``output_attentions`` is truthy.
        """
        # Attention sub-block: norm -> attention -> residual add.
        attn_input = self.layer_norm1(hidden_states)
        attn_out, attn_weights = self.self_attn(
            hidden_states=attn_input,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        after_attn = hidden_states + attn_out

        # Feed-forward sub-block: norm -> MLP -> residual add.
        mlp_out = self.mlp(self.layer_norm2(after_attn))
        block_out = after_attn + mlp_out

        if output_attentions:
            return (block_out, attn_weights)
        return (block_out, )
class SiglipPreTrainedModel(PreTrainedModel):
    """Base class that supplies weight initialisation for Siglip modules."""

    config_class = SiglipVisionConfig
    base_model_prefix = "siglip"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        # The checks run in order and the first matching type wins.
        if isinstance(module, SiglipVisionEmbeddings):
            hidden_width = self.config.hidden_size
            nn.init.normal_(module.position_embedding.weight,
                            std=1 / np.sqrt(hidden_width))
            return

        if isinstance(module, nn.Embedding):
            default_flax_embed_init(module.weight)
            return

        if isinstance(module, SiglipAttention):
            projections = (module.q_proj, module.k_proj, module.v_proj,
                           module.out_proj)
            # Weights are drawn first, in q/k/v/out order, then biases are
            # zeroed.
            for proj in projections:
                nn.init.normal_(proj.weight)
            for proj in projections:
                nn.init.zeros_(proj.bias)
            return

        if isinstance(module, SiglipMLP):
            for layer in (module.fc1, module.fc2):
                nn.init.normal_(layer.weight)
            for layer in (module.fc1, module.fc2):
                nn.init.normal_(layer.bias, std=1e-6)
            return

        if isinstance(module, (nn.Linear, nn.Conv2d)):
            lecun_normal_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
            return

        if isinstance(module, nn.LayerNorm):
            # Bias zeroed, scale set to one.
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
# Copied from transformers.models.clip.modeling_clip.CLIPEncoder
# with CLIP->Siglip
class SiglipEncoder(nn.Module):
    """Stack of ``config.num_hidden_layers`` ``SiglipEncoderLayer`` blocks."""

    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.config = config
        blocks = [
            SiglipEncoderLayer(config)
            for _ in range(config.num_hidden_layers)
        ]
        self.layers = nn.ModuleList(blocks)
        self.gradient_checkpointing = False

    # Ignore copy
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        """Feed ``inputs_embeds`` through every layer in order.

        Flags left as ``None`` fall back to the corresponding value on
        ``self.config``.

        Returns:
            A ``BaseModelOutput`` when ``return_dict`` is truthy; otherwise a
            tuple of the non-``None`` values among the last hidden state,
            the collected hidden states and the collected attentions.
        """
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            # Record the input of each layer when hidden states are requested.
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states, )

            checkpointed = self.gradient_checkpointing and self.training
            if checkpointed:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1], )

        # The output of the final layer is recorded as well.
        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states, )

        if return_dict:
            return BaseModelOutput(last_hidden_state=hidden_states,
                                   hidden_states=encoder_states,
                                   attentions=all_attentions)

        candidates = [hidden_states, encoder_states, all_attentions]
        return tuple(item for item in candidates if item is not None)
class SiglipVisionTransformer(SiglipPreTrainedModel):
    """Vision tower: patch embeddings, encoder stack and a final layer norm.

    No pooling head is applied; ``pooler_output`` is always ``None``.
    """

    config_class = SiglipVisionConfig
    main_input_name = "pixel_values"
    _supports_flash_attn_2 = True

    def __init__(self, config: SiglipVisionConfig):
        super().__init__(config)
        self.config = config

        self.embeddings = SiglipVisionEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.post_layernorm = nn.LayerNorm(config.hidden_size,
                                           eps=config.layer_norm_eps)
        self._use_flash_attention_2 = (
            config._attn_implementation == "flash_attention_2")

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        """Return the patch-embedding module of the embeddings layer."""
        return self.embeddings.patch_embedding

    @replace_return_docstrings(output_type=BaseModelOutputWithPooling,
                               config_class=SiglipVisionConfig)
    def forward(
        self,
        pixel_values,
        patch_attention_mask: Optional[torch.BoolTensor] = None,
        tgt_sizes: Optional[torch.IntTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:
        """
        # Flags left as None fall back to the values stored on the config.
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        batch_size = pixel_values.size(0)
        if patch_attention_mask is None:
            # Default: every patch of every image is attended to.
            patch_size = self.config.patch_size
            mask_shape = (
                batch_size,
                pixel_values.size(2) // patch_size,
                pixel_values.size(3) // patch_size,
            )
            patch_attention_mask = torch.ones(
                size=mask_shape,
                dtype=torch.bool,
                device=pixel_values.device,
            )

        hidden_states = self.embeddings(
            pixel_values=pixel_values,
            patch_attention_mask=patch_attention_mask,
            tgt_sizes=tgt_sizes)

        # Flatten the per-image patch grid into one sequence axis.
        patch_attention_mask = patch_attention_mask.view(batch_size, -1)

        # The call to `_upad_input` in `_flash_attention_forward` is
        # expensive, so when the `patch_attention_mask` is full of 1s
        # (i.e. attending to the whole sequence) no mask is passed at all,
        # which is equivalent to attending to the full sequence.
        if torch.any(~patch_attention_mask):
            if self._use_flash_attention_2:
                attention_mask = patch_attention_mask
            else:
                attention_mask = _prepare_4d_attention_mask(
                    patch_attention_mask, hidden_states.dtype)
        else:
            attention_mask = None

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = self.post_layernorm(encoder_outputs[0])

        if return_dict:
            return BaseModelOutputWithPooling(
                last_hidden_state=last_hidden_state,
                pooler_output=None,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple form: the pooled-output slot is filled with None.
        return (last_hidden_state, None) + encoder_outputs[1:]
vllm/model_executor/models/nemotron.py
0 → 100644
View file @
e661d594
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only Nemotron model compatible with HuggingFace weights."""
from
typing
import
Any
,
Dict
,
Iterable
,
List
,
Optional
,
Tuple
,
Union
import
torch
from
torch
import
nn
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.config
import
CacheConfig
,
LoRAConfig
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
QKVParallelLinear
,
RowParallelLinear
)
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.transformers_utils.configs
import
NemotronConfig
from
.interfaces
import
SupportsLoRA
from
.utils
import
PPMissingLayer
,
is_pp_missing_parameter
,
make_layers
# The architecture is pretty similar to Llama, with these changes:
# - There is no gate_proj, just up_proj
# - Normal LayerNorm (with a +1 to the weights) instead of RMSNorm
# - Squared ReLU instead of SwiGLU
# - Adds a rotary_percent to RoPE
def
_cast_if_autocast_enabled
(
*
args
):
if
not
torch
.
is_autocast_enabled
():
return
args
else
:
return
torch
.
cuda
.
amp
.
autocast_mode
.
_cast
(
args
,
torch
.
get_autocast_gpu_dtype
())
class NemotronLayerNorm1P(nn.LayerNorm):
    """LayerNorm that normalises with ``weight + 1`` as the scale.

    ``forward`` can also take a residual, which is added to the input before
    normalisation and returned alongside the result.
    """

    def __init__(self,
                 normalized_shape: Union[int, List[int], torch.Size],
                 eps: float = 1e-5,
                 elementwise_affine: bool = True,
                 bias: bool = True,
                 device=None,
                 dtype=None):
        super().__init__(normalized_shape,
                         eps=eps,
                         elementwise_affine=elementwise_affine,
                         bias=bias,
                         device=device,
                         dtype=dtype)

    def forward(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Normalise ``x`` (plus ``residual`` when one is given).

        Returns:
            The normalised tensor when ``residual`` is ``None``; otherwise
            ``(normalised, x + residual)``.
        """
        with_residual = residual is not None
        if with_residual:
            # The pre-norm sum becomes the residual that is handed back.
            x = x + residual
            residual = x

        norm_args = _cast_if_autocast_enabled(x, self.normalized_shape,
                                              self.weight + 1, self.bias,
                                              self.eps)
        # Autocast is switched off around the functional layer_norm call;
        # the arguments were already cast above.
        with torch.cuda.amp.autocast(enabled=False):
            normed = torch.nn.functional.layer_norm(*norm_args)

        if with_residual:
            return (normed, residual)
        return normed
class NemotronMLP(nn.Module):
    """Feed-forward block with an up projection only (no gate projection)."""

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
        quant_config: Optional[QuantizationConfig] = None,
        bias: bool = False,
        prefix: str = "",
    ) -> None:
        """Create ``up_proj``, ``down_proj`` and the activation function.

        ``bias`` and ``quant_config`` are forwarded to both projections;
        ``prefix`` is used to name them (``<prefix>.up_proj`` and
        ``<prefix>.down_proj``).
        """
        super().__init__()
        up_prefix = f"{prefix}.up_proj"
        down_prefix = f"{prefix}.down_proj"
        self.up_proj = ColumnParallelLinear(input_size=hidden_size,
                                            output_size=intermediate_size,
                                            bias=bias,
                                            quant_config=quant_config,
                                            prefix=up_prefix)
        self.down_proj = RowParallelLinear(input_size=intermediate_size,
                                           output_size=hidden_size,
                                           bias=bias,
                                           quant_config=quant_config,
                                           prefix=down_prefix)
        self.act_fn = get_act_fn(hidden_act)

    def forward(self, x):
        """Return ``down_proj(act_fn(up_proj(x)))``.

        Each projection returns a pair; only the first element is used.
        """
        projected, _ = self.up_proj(x)
        activated = self.act_fn(projected)
        result, _ = self.down_proj(activated)
        return result
class NemotronAttention(nn.Module):
    """Tensor-parallel self-attention with rotary position embeddings."""

    def __init__(
        self,
        config: NemotronConfig,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
        rope_theta: float = 10000,
        rope_scaling: Optional[Dict[str, Any]] = None,
        max_position_embeddings: int = 8192,
        quant_config: Optional[QuantizationConfig] = None,
        bias: bool = False,
        cache_config: Optional[CacheConfig] = None,
        prefix: str = "",
    ) -> None:
        """Set up head partitioning, projections, RoPE and the attention op.

        Query heads are divided evenly across the tensor-parallel ranks.
        KV heads are either divided across ranks or, when there are fewer KV
        heads than ranks, kept at one per rank.
        """
        super().__init__()
        self.hidden_size = hidden_size
        world_size = get_tensor_model_parallel_world_size()

        self.total_num_heads = num_heads
        assert self.total_num_heads % world_size == 0
        self.num_heads = self.total_num_heads // world_size

        self.total_num_kv_heads = num_kv_heads
        if self.total_num_kv_heads < world_size:
            # Number of KV heads is less than TP size, so we replicate
            # the KV heads across multiple tensor parallel GPUs.
            assert world_size % self.total_num_kv_heads == 0
        else:
            # Number of KV heads is at least the TP size, so we partition
            # the KV heads across multiple tensor parallel GPUs.
            assert self.total_num_kv_heads % world_size == 0
        self.num_kv_heads = max(1, self.total_num_kv_heads // world_size)

        # Use the config's `head_dim` when it has one, otherwise derive it
        # from the hidden size and the total head count.
        self.head_dim = getattr(config, "head_dim",
                                self.hidden_size // self.total_num_heads)
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5
        self.rope_theta = rope_theta
        self.rotary_percent = config.rope_percent
        self.max_position_embeddings = max_position_embeddings

        self.qkv_proj = QKVParallelLinear(
            hidden_size=hidden_size,
            head_size=self.head_dim,
            total_num_heads=self.total_num_heads,
            total_num_kv_heads=self.total_num_kv_heads,
            bias=bias,
            quant_config=quant_config,
            prefix=f"{prefix}.qkv_proj",
        )
        self.o_proj = RowParallelLinear(
            input_size=self.total_num_heads * self.head_dim,
            output_size=hidden_size,
            bias=bias,
            quant_config=quant_config,
            prefix=f"{prefix}.o_proj",
        )

        self.rotary_emb = get_rope(
            self.head_dim,
            rotary_dim=self.head_dim,
            max_position=max_position_embeddings,
            base=rope_theta,
            rope_scaling=rope_scaling,
            rotary_percent=self.rotary_percent,
        )
        self.attn = Attention(self.num_heads,
                              self.head_dim,
                              self.scaling,
                              num_kv_heads=self.num_kv_heads,
                              cache_config=cache_config,
                              quant_config=quant_config)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor:
        """Project to q/k/v, apply RoPE, attend and project the output."""
        fused_qkv, _ = self.qkv_proj(hidden_states)
        # The fused projection is laid out as [q | k | v] on the last axis.
        split_sizes = [self.q_size, self.kv_size, self.kv_size]
        q, k, v = fused_qkv.split(split_sizes, dim=-1)
        q, k = self.rotary_emb(positions, q, k)
        context = self.attn(q, k, v, kv_cache, attn_metadata)
        projected, _ = self.o_proj(context)
        return projected
class NemotronDecoderLayer(nn.Module):
    """One decoder block: self-attention followed by the MLP.

    Residual additions are done inside the ``NemotronLayerNorm1P`` modules,
    so ``forward`` passes the residual stream along explicitly.
    """

    def __init__(
        self,
        config: NemotronConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.hidden_size = config.hidden_size

        rope_theta = getattr(config, "rope_theta", 10000)
        rope_scaling = getattr(config, "rope_scaling", None)
        original_max_pos = getattr(config,
                                   "original_max_position_embeddings", None)
        if rope_scaling is not None and original_max_pos:
            # Copy the original context length into the rope-scaling dict.
            rope_scaling["original_max_position_embeddings"] = (
                config.original_max_position_embeddings)
        max_position_embeddings = getattr(config, "max_position_embeddings",
                                          8192)

        # Attention bias may be declared as `attention_bias` or as `bias`.
        attention_bias = getattr(config, "attention_bias", False) or getattr(
            config, "bias", False)
        kv_head_count = getattr(config, "num_key_value_heads",
                                config.num_attention_heads)

        self.self_attn = NemotronAttention(
            config=config,
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            num_kv_heads=kv_head_count,
            rope_theta=rope_theta,
            rope_scaling=rope_scaling,
            max_position_embeddings=max_position_embeddings,
            quant_config=quant_config,
            bias=attention_bias,
            cache_config=cache_config,
            prefix=f"{prefix}.self_attn",
        )
        self.mlp = NemotronMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
            quant_config=quant_config,
            bias=getattr(config, "mlp_bias", False),
            prefix=f"{prefix}.mlp",
        )
        self.input_layernorm = NemotronLayerNorm1P(config.hidden_size,
                                                   eps=config.norm_eps)
        self.post_attention_layernorm = NemotronLayerNorm1P(
            config.hidden_size, eps=config.norm_eps)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: AttentionMetadata,
        residual: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run the block and return ``(hidden_states, residual)``."""
        # Self Attention
        if residual is not None:
            hidden_states, residual = self.input_layernorm(
                hidden_states, residual)
        else:
            # No residual was passed in: the raw input starts the residual
            # stream.
            residual = hidden_states
            hidden_states = self.input_layernorm(hidden_states)

        attn_out = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,
            kv_cache=kv_cache,
            attn_metadata=attn_metadata,
        )

        # Fully Connected
        normed, residual = self.post_attention_layernorm(attn_out, residual)
        mlp_out = self.mlp(normed)
        return mlp_out, residual
class NemotronModel(nn.Module):
    """Decoder stack with pipeline-parallel placement of its parts.

    The token embedding is built on the first pipeline rank (and on the last
    rank when word embeddings are tied). The final norm is built on the last
    rank. Other ranks get ``PPMissingLayer`` placeholders.
    """

    def __init__(
        self,
        config: NemotronConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        lora_config: Optional[LoRAConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.config = config
        self.padding_idx = config.pad_token_id

        # Extra vocabulary entries reserved for LoRA adapters.
        if lora_config:
            lora_vocab = (lora_config.lora_extra_vocab_size *
                          (lora_config.max_loras or 1))
        else:
            lora_vocab = 0
        self.vocab_size = config.vocab_size + lora_vocab
        self.org_vocab_size = config.vocab_size

        needs_embedding = get_pp_group().is_first_rank or (
            config.tie_word_embeddings and get_pp_group().is_last_rank)
        if needs_embedding:
            self.embed_tokens = VocabParallelEmbedding(
                self.vocab_size,
                config.hidden_size,
                org_num_embeddings=config.vocab_size,
            )
        else:
            self.embed_tokens = PPMissingLayer()

        def build_layer(prefix):
            return NemotronDecoderLayer(config=config,
                                        cache_config=cache_config,
                                        quant_config=quant_config,
                                        prefix=prefix)

        self.start_layer, self.end_layer, self.layers = make_layers(
            config.num_hidden_layers,
            build_layer,
            prefix=f"{prefix}.layers")

        if get_pp_group().is_last_rank:
            self.norm = NemotronLayerNorm1P(config.hidden_size,
                                            eps=config.norm_eps)
        else:
            self.norm = PPMissingLayer()

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Look up the embeddings for ``input_ids``."""
        return self.embed_tokens(input_ids)

    def forward(
        self,
        input_ids: Optional[torch.Tensor],
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors],
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run this rank's slice of the decoder stack.

        Returns:
            ``IntermediateTensors`` holding ``hidden_states`` and
            ``residual`` on every rank but the last; the normalised hidden
            states on the last rank.
        """
        if not get_pp_group().is_first_rank:
            # Later ranks pick up where the previous rank stopped.
            assert intermediate_tensors is not None
            hidden_states = intermediate_tensors["hidden_states"]
            residual = intermediate_tensors["residual"]
        else:
            if inputs_embeds is None:
                hidden_states = self.get_input_embeddings(input_ids)
            else:
                hidden_states = inputs_embeds
            residual = None

        for layer_idx in range(self.start_layer, self.end_layer):
            # kv_caches is indexed relative to this rank's first layer.
            cache = kv_caches[layer_idx - self.start_layer]
            hidden_states, residual = self.layers[layer_idx](
                positions,
                hidden_states,
                cache,
                attn_metadata,
                residual,
            )

        if get_pp_group().is_last_rank:
            hidden_states, _ = self.norm(hidden_states, residual)
            return hidden_states

        return IntermediateTensors({
            "hidden_states": hidden_states,
            "residual": residual
        })
class NemotronForCausalLM(nn.Module, SupportsLoRA):
    """Nemotron decoder with an LM head, logits processor and sampler.

    The LM head, logits processor and sampler are created only on the last
    pipeline rank; other ranks hold a ``PPMissingLayer`` in place of
    ``lm_head``.
    """

    # Maps the fused parameter name to the checkpoint names packed into it.
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
    }

    # LoRA specific attributes
    supported_lora_modules = [
        "qkv_proj", "o_proj", "up_proj", "down_proj", "embed_tokens", "lm_head"
    ]
    embedding_modules = {
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }
    embedding_padding_modules = ["lm_head"]
    bitsandbytes_stacked_params_mapping = {
        # shard_name, weight_name, index
        "q_proj": ("qkv_proj", 0),
        "k_proj": ("qkv_proj", 1),
        "v_proj": ("qkv_proj", 2),
    }

    def __init__(
        self,
        config: NemotronConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        lora_config: Optional[LoRAConfig] = None,
    ) -> None:
        """Build the decoder and, on the last pipeline rank, the output head.

        Raises:
            AssertionError: if ``config`` is not a ``NemotronConfig``.
        """
        super().__init__()

        assert isinstance(config, NemotronConfig)

        self.config = config
        self.lora_config = lora_config

        self.model = NemotronModel(config,
                                   cache_config,
                                   quant_config,
                                   lora_config=lora_config,
                                   prefix="model")
        if get_pp_group().is_last_rank:
            # Vocabulary size before padding, including LoRA extra tokens.
            self.unpadded_vocab_size = config.vocab_size
            if lora_config:
                self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
            self.lm_head = ParallelLMHead(
                self.unpadded_vocab_size,
                config.hidden_size,
                org_num_embeddings=config.vocab_size,
                padding_size=DEFAULT_VOCAB_PADDING_SIZE
                # We need bigger padding if using lora for kernel
                # compatibility
                if not lora_config else lora_config.lora_vocab_padding_size,
                quant_config=quant_config,
            )
            if config.tie_word_embeddings:
                # Share the embedding matrix with the LM head.
                self.lm_head.weight = self.model.embed_tokens.weight

            logit_scale = getattr(config, "logit_scale", 1.0)
            self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                    config.vocab_size,
                                                    logit_scale)
            self.sampler = Sampler()
        else:
            self.lm_head = PPMissingLayer()

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Delegate to ``self.model`` and return its output unchanged."""
        model_output = self.model(input_ids, positions, kv_caches,
                                  attn_metadata, intermediate_tensors)
        return model_output

    def compute_logits(self, hidden_states: torch.Tensor,
                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
        """Compute logits from ``hidden_states`` using the LM head.

        NOTE(review): ``logits_processor`` is only assigned on the last
        pipeline rank; presumably this is only called there - confirm.
        """
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        """Run the sampler on ``logits`` and return its output."""
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens

    def make_empty_intermediate_tensors(
            self, batch_size: int, dtype: torch.dtype,
            device: torch.device) -> IntermediateTensors:
        """Return zero-filled ``hidden_states`` and ``residual`` tensors.

        Both tensors have shape ``(batch_size, config.hidden_size)``.
        """
        return IntermediateTensors({
            "hidden_states":
            torch.zeros((batch_size, self.config.hidden_size),
                        dtype=dtype,
                        device=device),
            "residual":
            torch.zeros((batch_size, self.config.hidden_size),
                        dtype=dtype,
                        device=device),
        })

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        """Load checkpoint tensors into this model's parameters.

        Separate q/k/v projection weights are routed into the fused
        ``qkv_proj`` parameter with their shard id. Every other tensor is
        loaded under its own name, after an optional kv-scale remap.
        Rotary-embedding buffers, extra ``.bias`` entries missing from the
        model, and parameters belonging to other pipeline ranks are skipped.
        """
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            (".qkv_proj", ".q_proj", "q"),
            (".qkv_proj", ".k_proj", "k"),
            (".qkv_proj", ".v_proj", "v"),
        ]
        params_dict = dict(self.named_parameters())
        for name, loaded_weight in weights:
            if "rotary_emb.inv_freq" in name:
                continue
            if ("rotary_emb.cos_cached" in name
                    or "rotary_emb.sin_cached" in name):
                # Models trained using ColossalAI may include these tensors in
                # the checkpoint. Skip them.
                continue
            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                if weight_name not in name:
                    continue
                # Rewrite the checkpoint name to the fused parameter name.
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue

                if is_pp_missing_parameter(name, self):
                    continue

                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)

                break
            else:
                # Reached when the loop above finished without `break`,
                # i.e. no stacked shard was loaded for this tensor.
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                # Remapping the name of FP8 kv-scale.
                name = maybe_remap_kv_scale_name(name, params_dict)
                if name is None:
                    continue

                if is_pp_missing_parameter(name, self):
                    continue

                param = params_dict[name]
                # Fall back to the default loader when the parameter does
                # not define its own.
                weight_loader = getattr(param, "weight_loader",
                                        default_weight_loader)
                weight_loader(param, loaded_weight)
vllm/model_executor/models/olmo.py
View file @
e661d594
...
@@ -343,6 +343,11 @@ class OlmoForCausalLM(nn.Module):
...
@@ -343,6 +343,11 @@ class OlmoForCausalLM(nn.Module):
# Models trained using ColossalAI may include these tensors in
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
# the checkpoint. Skip them.
continue
continue
# With tie_word_embeddings, we can skip lm_head.weight
# The weight might appear unnecessarily in the files if the model is
# processed with quantization, LoRA, fine-tuning, etc.
if
self
.
config
.
tie_word_embeddings
and
"lm_head.weight"
in
name
:
continue
for
(
param_name
,
weight_name
,
shard_id
)
in
stacked_params_mapping
:
for
(
param_name
,
weight_name
,
shard_id
)
in
stacked_params_mapping
:
if
weight_name
not
in
name
:
if
weight_name
not
in
name
:
continue
continue
...
...
Prev
1
…
12
13
14
15
16
17
18
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment