Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
0640f227
Commit
0640f227
authored
Sep 09, 2024
by
zhuwenwen
Browse files
Merge tag 'v0.6.0' into v0.6.0-dev
parents
82f1ffdf
32e7db25
Changes
335
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
848 additions
and
129 deletions
+848
-129
vllm/model_executor/models/llava.py
vllm/model_executor/models/llava.py
+12
-3
vllm/model_executor/models/llava_next.py
vllm/model_executor/models/llava_next.py
+40
-18
vllm/model_executor/models/medusa.py
vllm/model_executor/models/medusa.py
+1
-1
vllm/model_executor/models/minicpm.py
vllm/model_executor/models/minicpm.py
+2
-2
vllm/model_executor/models/minicpmv.py
vllm/model_executor/models/minicpmv.py
+10
-5
vllm/model_executor/models/mixtral.py
vllm/model_executor/models/mixtral.py
+3
-2
vllm/model_executor/models/mixtral_quant.py
vllm/model_executor/models/mixtral_quant.py
+2
-2
vllm/model_executor/models/mlp_speculator.py
vllm/model_executor/models/mlp_speculator.py
+1
-2
vllm/model_executor/models/mpt.py
vllm/model_executor/models/mpt.py
+2
-2
vllm/model_executor/models/nemotron.py
vllm/model_executor/models/nemotron.py
+2
-2
vllm/model_executor/models/olmo.py
vllm/model_executor/models/olmo.py
+2
-2
vllm/model_executor/models/opt.py
vllm/model_executor/models/opt.py
+2
-2
vllm/model_executor/models/orion.py
vllm/model_executor/models/orion.py
+2
-2
vllm/model_executor/models/paligemma.py
vllm/model_executor/models/paligemma.py
+15
-11
vllm/model_executor/models/persimmon.py
vllm/model_executor/models/persimmon.py
+2
-2
vllm/model_executor/models/phi.py
vllm/model_executor/models/phi.py
+2
-2
vllm/model_executor/models/phi3_small.py
vllm/model_executor/models/phi3_small.py
+2
-2
vllm/model_executor/models/phi3v.py
vllm/model_executor/models/phi3v.py
+124
-65
vllm/model_executor/models/phimoe.py
vllm/model_executor/models/phimoe.py
+620
-0
vllm/model_executor/models/qwen.py
vllm/model_executor/models/qwen.py
+2
-2
No files found.
vllm/model_executor/models/llava.py
View file @
0640f227
...
...
@@ -11,10 +11,11 @@ from vllm.config import CacheConfig, MultiModalConfig
from
vllm.inputs
import
INPUT_REGISTRY
,
InputContext
,
LLMInputs
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
from
.clip
import
(
CLIPVisionModel
,
dummy_image_for_clip
,
dummy_seq_data_for_clip
,
get_max_clip_image_tokens
,
...
...
@@ -30,13 +31,13 @@ from .utils import (filter_weights, init_vllm_registered_model,
class
LlavaImagePixelInputs
(
TypedDict
):
type
:
Literal
[
"pixel_values"
]
data
:
torch
.
Tensor
"""Shape: `(batch_size, num_channels, height, width)`"""
"""Shape: `(batch_size
* num_images
, num_channels, height, width)`"""
class
LlavaImageEmbeddingInputs
(
TypedDict
):
type
:
Literal
[
"image_embeds"
]
data
:
torch
.
Tensor
"""Shape: `(batch_size, image_feature_size, hidden_size)`
"""Shape: `(batch_size
* num_images
, image_feature_size, hidden_size)`
`hidden_size` must match the hidden size of language model backbone.
"""
...
...
@@ -232,6 +233,10 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal):
if
not
isinstance
(
pixel_values
,
torch
.
Tensor
):
raise
ValueError
(
"Incorrect type of pixel values. "
f
"Got type:
{
type
(
pixel_values
)
}
"
)
# Remove the N dimension until multiple images are supported.
pixel_values
=
pixel_values
.
squeeze
(
1
)
return
LlavaImagePixelInputs
(
type
=
"pixel_values"
,
data
=
self
.
_validate_pixel_values
(
pixel_values
),
...
...
@@ -241,6 +246,10 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal):
if
not
isinstance
(
image_embeds
,
torch
.
Tensor
):
raise
ValueError
(
"Incorrect type of image embeddings. "
f
"Got type:
{
type
(
image_embeds
)
}
"
)
# Remove the N dimension until multiple images are supported.
image_embeds
=
image_embeds
.
squeeze
(
1
)
return
LlavaImageEmbeddingInputs
(
type
=
"image_embeds"
,
data
=
image_embeds
,
...
...
vllm/model_executor/models/llava_next.py
View file @
0640f227
...
...
@@ -15,10 +15,12 @@ from vllm.config import CacheConfig, MultiModalConfig
from
vllm.inputs
import
INPUT_REGISTRY
,
InputContext
,
LLMInputs
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
is_list_of
from
.clip
import
(
CLIPVisionModel
,
dummy_image_for_clip
,
dummy_seq_data_for_clip
,
get_clip_image_feature_size
,
...
...
@@ -28,7 +30,7 @@ from .llava import LlavaMultiModalProjector
from
.siglip
import
(
SiglipVisionModel
,
dummy_image_for_siglip
,
dummy_seq_data_for_siglip
,
get_siglip_image_feature_size
,
get_siglip_patch_grid_length
,
input_processor_for_siglip
)
from
.utils
import
(
filter_weights
,
init_vllm_registered_model
,
from
.utils
import
(
filter_weights
,
flatten_bn
,
init_vllm_registered_model
,
merge_multimodal_embeddings
)
logger
=
init_logger
(
__name__
)
...
...
@@ -46,15 +48,16 @@ class LlavaNextImagePixelInputs(TypedDict):
type
:
Literal
[
"pixel_values"
]
data
:
Union
[
torch
.
Tensor
,
List
[
torch
.
Tensor
]]
"""
Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
Shape:
`(batch_size * num_images, 1 + num_patches, num_channels, height, width)`
Note that `num_patches` may be different
for each batch, in which case
the data is passed as a list instead of a batched tensor.
Note that `num_patches` may be different
per batch and image,
in which case
the data is passed as a list instead of a batched tensor.
"""
image_sizes
:
NotRequired
[
torch
.
Tensor
]
"""
Shape: `(batch_size, 2)`
Shape: `(batch_size
* num_images
, 2)`
This should be in `(height, width)` format.
"""
...
...
@@ -63,7 +66,7 @@ class LlavaNextImagePixelInputs(TypedDict):
class
LlavaNextImageEmbeddingInputs
(
TypedDict
):
type
:
Literal
[
"image_embeds"
]
data
:
torch
.
Tensor
"""Shape: `(batch_size, image_feature_size, hidden_size)`
"""Shape: `(batch_size
* num_images
, image_feature_size, hidden_size)`
`hidden_size` must match the hidden size of language model backbone.
"""
...
...
@@ -223,6 +226,13 @@ def input_processor_for_llava_next(ctx: InputContext, llm_inputs: LLMInputs):
input_height
=
height
,
input_width
=
width
,
)
elif
is_list_of
(
image_data
,
Image
.
Image
):
image_feature_size
=
[
get_llava_next_image_feature_size
(
hf_config
,
input_height
=
img
.
height
,
input_width
=
img
.
width
)
for
img
in
image_data
]
elif
isinstance
(
image_data
,
torch
.
Tensor
):
image_feature_size
=
image_data
.
shape
[
0
]
else
:
...
...
@@ -307,10 +317,19 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal):
torch
.
empty
(
config
.
text_config
.
hidden_size
))
def
_validate_image_sizes
(
self
,
data
:
torch
.
Tensor
)
->
torch
.
Tensor
:
if
list
(
data
.
shape
[
1
:])
!=
[
2
]:
raise
ValueError
(
f
"The expected image sizes shape is batch dimension plus "
f
"
{
[
2
]
}
. You supplied
{
data
.
shape
}
."
)
expected_dims
=
(
2
,
)
def
_validate_shape
(
d
:
torch
.
Tensor
):
actual_dims
=
tuple
(
d
.
shape
)
if
actual_dims
!=
expected_dims
:
expected_expr
=
str
(
expected_dims
)
raise
ValueError
(
f
"The expected shape of image sizes per image per batch "
f
"is
{
expected_expr
}
. You supplied
{
tuple
(
d
.
shape
)
}
."
)
for
d
in
data
:
_validate_shape
(
d
)
return
data
...
...
@@ -327,7 +346,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal):
if
actual_dims
!=
expected_dims
:
expected_expr
=
(
"num_patches"
,
*
map
(
str
,
expected_dims
))
raise
ValueError
(
"The expected shape of pixel values
in each batch element
"
"The expected shape of pixel values
per image per batch
"
f
"is
{
expected_expr
}
. You supplied
{
tuple
(
d
.
shape
)
}
."
)
for
d
in
data
:
...
...
@@ -349,14 +368,15 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal):
raise
ValueError
(
"Incorrect type of pixel values. "
f
"Got type:
{
type
(
pixel_values
)
}
"
)
if
not
isinstance
(
image_sizes
,
torch
.
Tensor
):
if
not
isinstance
(
image_sizes
,
(
torch
.
Tensor
,
list
)
):
raise
ValueError
(
"Incorrect type of image sizes. "
f
"Got type:
{
type
(
image_sizes
)
}
"
)
return
LlavaNextImagePixelInputs
(
type
=
"pixel_values"
,
data
=
self
.
_validate_pixel_values
(
pixel_values
),
image_sizes
=
self
.
_validate_image_sizes
(
image_sizes
),
data
=
self
.
_validate_pixel_values
(
flatten_bn
(
pixel_values
)),
image_sizes
=
self
.
_validate_image_sizes
(
flatten_bn
(
image_sizes
,
concat
=
True
)),
)
if
image_embeds
is
not
None
:
...
...
@@ -366,7 +386,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal):
return
LlavaNextImageEmbeddingInputs
(
type
=
"image_embeds"
,
data
=
image_embeds
,
data
=
flatten_bn
(
image_embeds
)
,
)
raise
AssertionError
(
"This line should be unreachable."
)
...
...
@@ -425,7 +445,10 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal):
self
.
config
.
image_grid_pinpoints
,
self
.
config
.
vision_config
.
image_size
,
)
other_patch_embeds
=
other_patch_embeds
\
num_patches
=
num_patch_height
*
num_patch_width
# Image patches might be padded for batch processing
other_patch_embeds
=
other_patch_embeds
[:
num_patches
]
\
.
view
(
num_patch_height
,
num_patch_width
,
height
,
width
,
-
1
)
if
"unpad"
in
strategy
:
...
...
@@ -496,7 +519,6 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal):
self
,
image_input
:
LlavaNextImageInputs
,
)
->
Union
[
torch
.
Tensor
,
List
[
torch
.
Tensor
]]:
if
image_input
[
"type"
]
==
"image_embeds"
:
return
[
image_input
[
"data"
]]
...
...
vllm/model_executor/models/medusa.py
View file @
0640f227
...
...
@@ -4,11 +4,11 @@ import torch
import
torch.nn
as
nn
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.sampler
import
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
SamplerOutput
from
vllm.transformers_utils.configs.medusa
import
MedusaConfig
...
...
vllm/model_executor/models/minicpm.py
View file @
0640f227
...
...
@@ -44,13 +44,13 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsLoRA
...
...
vllm/model_executor/models/minicpmv.py
View file @
0640f227
...
...
@@ -44,7 +44,7 @@ from vllm.logger import init_logger
from
vllm.model_executor.layers.linear
import
ReplicatedLinear
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.model_loader.utils
import
set_default_torch_dtype
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
...
...
@@ -57,7 +57,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY
from
vllm.multimodal.image
import
cached_get_image_processor
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.sequence
import
(
VLLM_TOKEN_ID_ARRAY_TYPE
,
IntermediateTensors
,
SamplerOutput
,
SequenceData
)
SequenceData
)
from
.idefics2_vision_model
import
Idefics2VisionTransformer
...
...
@@ -594,9 +594,14 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal):
pixel_values_flat
:
List
[
torch
.
Tensor
]
=
[]
tgt_sizes_flat
:
List
[
torch
.
Tensor
]
=
[]
for
b
in
range
(
len
(
pixel_values
)):
pixel_values_flat
+=
pixel_values
[
b
]
tgt_sizes_flat
+=
tgt_sizes
[
b
]
for
pixel_b
,
tgt_b
in
zip
(
pixel_values
,
tgt_sizes
):
if
len
(
pixel_b
)
!=
len
(
tgt_b
):
raise
ValueError
(
"Inconsistent N lengths, found: "
f
"
{
len
(
pixel_b
)
}
vs
{
len
(
tgt_b
)
}
"
)
for
pixel_n
,
tgt_n
in
zip
(
pixel_b
,
tgt_b
):
pixel_values_flat
+=
pixel_n
tgt_sizes_flat
+=
tgt_n
# NOTE: Input IDs does not contain image tokens during memory profiling,
# so we allow it to be empty
...
...
vllm/model_executor/models/mixtral.py
View file @
0640f227
...
...
@@ -39,13 +39,13 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsLoRA
from
.utils
import
is_pp_missing_parameter
,
make_layers
...
...
@@ -73,6 +73,7 @@ class MixtralMoE(nn.Module):
self
.
hidden_size
=
hidden_size
# Gate always runs at half / full precision for now.
self
.
gate
=
ReplicatedLinear
(
hidden_size
,
num_experts
,
bias
=
False
,
...
...
vllm/model_executor/models/mixtral_quant.py
View file @
0640f227
...
...
@@ -42,12 +42,12 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
class
MixtralMLP
(
nn
.
Module
):
...
...
vllm/model_executor/models/mlp_speculator.py
View file @
0640f227
...
...
@@ -6,11 +6,10 @@ import torch.nn as nn
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.sequence
import
SamplerOutput
from
vllm.transformers_utils.configs
import
MLPSpeculatorConfig
SQRT2
=
2
**
0.5
...
...
vllm/model_executor/models/mpt.py
View file @
0640f227
...
...
@@ -17,12 +17,12 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs.mpt
import
MPTConfig
...
...
vllm/model_executor/models/nemotron.py
View file @
0640f227
...
...
@@ -37,13 +37,13 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.configs
import
NemotronConfig
from
.interfaces
import
SupportsLoRA
...
...
vllm/model_executor/models/olmo.py
View file @
0640f227
...
...
@@ -38,12 +38,12 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
class
OlmoAttention
(
nn
.
Module
):
...
...
vllm/model_executor/models/opt.py
View file @
0640f227
...
...
@@ -34,12 +34,12 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear,
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
class
OPTLearnedPositionalEmbedding
(
nn
.
Embedding
):
...
...
vllm/model_executor/models/orion.py
View file @
0640f227
...
...
@@ -21,12 +21,12 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
class
OrionMLP
(
nn
.
Module
):
...
...
vllm/model_executor/models/paligemma.py
View file @
0640f227
...
...
@@ -11,13 +11,13 @@ from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.gemma
import
GemmaModel
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsMultiModal
from
.siglip
import
(
SiglipVisionModel
,
dummy_image_for_siglip
,
...
...
@@ -34,13 +34,13 @@ _KEYS_TO_MODIFY_MAPPING = {
class
PaliGemmaImagePixelInputs
(
TypedDict
):
type
:
Literal
[
"pixel_values"
]
data
:
torch
.
Tensor
"""Shape: (batch_size, num_channels, height, width)"""
"""Shape:
`
(batch_size
* num_images
, num_channels, height, width)
`
"""
class
PaliGemmaImageEmbeddingInputs
(
TypedDict
):
type
:
Literal
[
"image_embeds"
]
data
:
torch
.
Tensor
"""Shape: `(batch_size, image_feature_size, hidden_size)`
"""Shape: `(batch_size
* num_images
, image_feature_size, hidden_size)`
`hidden_size` must match the hidden size of language model backbone.
"""
...
...
@@ -145,7 +145,6 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal):
self
.
config
=
config
self
.
multimodal_config
=
multimodal_config
# TODO(ywang96): Port over SiglipVisionModel & TP
self
.
vision_tower
=
SiglipVisionModel
(
config
.
vision_config
)
self
.
multi_modal_projector
=
PaliGemmaMultiModalProjector
(
vision_hidden_size
=
config
.
vision_config
.
hidden_size
,
...
...
@@ -185,6 +184,10 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal):
if
not
isinstance
(
pixel_values
,
torch
.
Tensor
):
raise
ValueError
(
"Incorrect type of pixel values. "
f
"Got type:
{
type
(
pixel_values
)
}
"
)
# Remove the N dimension until multiple images are supported.
pixel_values
=
pixel_values
.
squeeze
(
1
)
return
PaliGemmaImagePixelInputs
(
type
=
"pixel_values"
,
data
=
self
.
_validate_pixel_values
(
pixel_values
),
...
...
@@ -194,6 +197,10 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal):
if
not
isinstance
(
image_embeds
,
torch
.
Tensor
):
raise
ValueError
(
"Incorrect type of image embeddings. "
f
"Got type:
{
type
(
image_embeds
)
}
"
)
# Remove the N dimension until multiple images are supported.
image_embeds
=
image_embeds
.
squeeze
(
1
)
return
PaliGemmaImageEmbeddingInputs
(
type
=
"image_embeds"
,
data
=
image_embeds
,
...
...
@@ -300,12 +307,7 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal):
if
key_to_modify
in
name
:
name
=
name
.
replace
(
key_to_modify
,
new_key
)
use_default_weight_loading
=
False
if
"vision"
in
name
:
if
self
.
vision_tower
is
not
None
:
# We only do sharding for language model and
# not vision model for now.
use_default_weight_loading
=
True
else
:
if
"vision"
not
in
name
or
self
.
vision_tower
.
shard_weight
:
for
(
param_name
,
shard_name
,
shard_id
)
in
stacked_params_mapping
:
if
shard_name
not
in
name
:
...
...
@@ -328,6 +330,8 @@ class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal):
if
name
.
endswith
(
".bias"
)
and
name
not
in
params_dict
:
continue
use_default_weight_loading
=
True
else
:
use_default_weight_loading
=
True
if
use_default_weight_loading
:
param
=
params_dict
[
name
]
...
...
vllm/model_executor/models/persimmon.py
View file @
0640f227
...
...
@@ -37,12 +37,12 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
class
PersimmonMLP
(
nn
.
Module
):
...
...
vllm/model_executor/models/phi.py
View file @
0640f227
...
...
@@ -52,12 +52,12 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsLoRA
...
...
vllm/model_executor/models/phi3_small.py
View file @
0640f227
...
...
@@ -16,12 +16,12 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
def
load_column_parallel_weight
(
param
:
torch
.
nn
.
Parameter
,
...
...
vllm/model_executor/models/phi3v.py
View file @
0640f227
...
...
@@ -13,6 +13,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
itertools
import
re
from
functools
import
lru_cache
from
typing
import
(
Any
,
Dict
,
Iterable
,
List
,
Literal
,
Mapping
,
Optional
,
...
...
@@ -30,20 +31,20 @@ from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
ParallelLMHead
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.clip
import
CLIPVisionModel
from
vllm.model_executor.models.llama
import
LlamaModel
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.utils
import
cached_get_tokenizer
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.multimodal.utils
import
cached_get_tokenizer
,
repeat_and_pad_token
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
is_list_of
from
.clip
import
(
dummy_image_for_clip
,
dummy_seq_data_for_clip
,
input_processor_for_clip
)
from
.clip
import
dummy_image_for_clip
,
dummy_seq_data_for_clip
from
.interfaces
import
SupportsMultiModal
from
.utils
import
merge_multimodal_embeddings
from
.utils
import
flatten_bn
,
merge_multimodal_embeddings
logger
=
init_logger
(
__name__
)
...
...
@@ -70,19 +71,37 @@ CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(dropout=0.0,
projection_dim
=
768
)
def
_init_img_processor
(
hf_config
:
PretrainedConfig
):
clip_config
=
CLIP_VIT_LARGE_PATCH14_336_CONFIG
layer_idx
=
hf_config
.
img_processor
.
get
(
'layer_idx'
,
-
2
)
# Initialize the CLIP only up to the required feature layer
if
layer_idx
<
0
:
num_hidden_layers
=
clip_config
.
num_hidden_layers
+
\
layer_idx
+
1
else
:
num_hidden_layers
=
layer_idx
+
1
img_processor
=
CLIPVisionModel
(
clip_config
,
num_hidden_layers_override
=
num_hidden_layers
)
return
img_processor
class
Phi3VImagePixelInputs
(
TypedDict
):
type
:
Literal
[
"pixel_values"
]
data
:
Union
[
torch
.
Tensor
,
List
[
torch
.
Tensor
]]
"""
Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
Shape:
`(batch_size * num_images, 1 + num_patches, num_channels, height, width)`
Note that `num_patches` may be different
for each batch, in which case
the data is passed as a list instead of a batched tensor.
Note that `num_patches` may be different
per batch and image,
in which case
the data is passed as a list instead of a batched tensor.
"""
image_sizes
:
torch
.
Tensor
"""
Shape: `(batch_size, 2)`
Shape: `(batch_size
* num_images
, 2)`
This should be in `(height, width)` format.
"""
...
...
@@ -91,7 +110,7 @@ class Phi3VImagePixelInputs(TypedDict):
class
Phi3VImageEmbeddingInputs
(
TypedDict
):
type
:
Literal
[
"image_embeds"
]
data
:
Union
[
torch
.
Tensor
,
List
[
torch
.
Tensor
]]
"""Shape: `(batch_size, image_feature_size, hidden_size)`
"""Shape: `(batch_size
* num_images
, image_feature_size, hidden_size)`
`hidden_size` must match the hidden size of language model backbone.
"""
...
...
@@ -137,18 +156,8 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
hidden_size
=
config
.
n_embd
if
hasattr
(
config
,
'n_embd'
)
else
config
.
hidden_size
clip_config
=
CLIP_VIT_LARGE_PATCH14_336_CONFIG
self
.
layer_idx
=
config
.
img_processor
.
get
(
'layer_idx'
,
-
2
)
# Initialize the CLIP only up to the required feature layer
if
self
.
layer_idx
<
0
:
num_hidden_layers
=
clip_config
.
num_hidden_layers
+
\
self
.
layer_idx
+
1
else
:
num_hidden_layers
=
self
.
layer_idx
+
1
self
.
img_processor
=
_init_img_processor
(
config
)
self
.
img_processor
=
CLIPVisionModel
(
clip_config
,
num_hidden_layers_override
=
num_hidden_layers
)
image_dim_out
=
config
.
img_processor
[
'image_dim_out'
]
self
.
num_img_tokens
=
config
.
img_processor
[
'num_img_tokens'
]
...
...
@@ -400,11 +409,20 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
image_data
=
multi_modal_data
[
"image"
]
if
isinstance
(
image_data
,
Image
.
Image
):
w
,
h
=
image_data
.
size
w
,
h
=
_calc_hd_transform_size
(
width
=
w
,
height
=
h
)
image_feature_size
=
get_phi3v_image_feature_size
(
hf_config
,
input_width
=
w
,
input_height
=
h
)
image_feature_size
=
[
get_phi3v_image_feature_size
(
hf_config
,
input_width
=
w
,
input_height
=
h
)
]
image_data
=
[
image_data
]
elif
is_list_of
(
image_data
,
Image
.
Image
):
image_feature_size
=
[]
for
image
in
image_data
:
w
,
h
=
image
.
size
image_feature_size
.
append
(
get_phi3v_image_feature_size
(
hf_config
,
input_width
=
w
,
input_height
=
h
))
elif
isinstance
(
image_data
,
torch
.
Tensor
):
image_feature_size
=
image_data
.
shape
[
0
]
else
:
...
...
@@ -412,45 +430,63 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
prompt
=
llm_inputs
.
get
(
"prompt"
)
if
prompt
is
None
:
# for async server request, we assume prompt and its token_ids is always
# in correct format. And num_image_tags == len(image_data) always True.
image_idx
=
range
(
1
,
len
(
image_data
)
+
1
)
new_prompt
=
None
else
:
image_idx
=
sorted
(
map
(
int
,
re
.
findall
(
r
"<\|image_(\d+)\|>+"
,
prompt
)))
if
prompt
.
count
(
"<|image|>"
)
>
0
:
logger
.
warning
(
"Please follow the prompt format that is "
"documented on HuggingFace which does not involve "
"repeating <|image|> tokens."
)
elif
len
(
re
.
findall
(
r
"(<\|image_\d+\|>)+"
,
prompt
))
>
1
:
logger
.
warning
(
"Multiple image input is not supported yet, "
"so any extra image tokens will be treated "
"as plain text."
)
elif
(
num_image_tags
:
=
len
(
image_idx
))
>
1
:
assert
num_image_tags
==
len
(
image_data
),
"The count of image_placeholder not match image's"
new_prompt
=
prompt
prompt_token_ids
=
llm_inputs
[
"prompt_token_ids"
]
image_1_token_ids
=
_get_image_placeholder_token_ids
(
model_config
,
idx
=
1
)
prompt_token_ids
=
llm_inputs
[
"prompt_token_ids"
].
copy
()
# masked place_holder with image token id
for
idx
in
image_idx
:
image_token_ids
=
_get_image_placeholder_token_ids
(
model_config
,
idx
=
idx
)
for
i
in
range
(
len
(
prompt_token_ids
)
-
len
(
image_token_ids
)
+
1
):
if
prompt_token_ids
[
i
:
i
+
len
(
image_token_ids
)]
==
image_token_ids
:
prompt_token_ids
[
i
:
i
+
len
(
image_token_ids
)]
=
[
_IMAGE_TOKEN_ID
]
*
len
(
image_token_ids
)
break
new_token_ids
:
List
[
int
]
=
[]
for
i
in
range
(
len
(
prompt_token_ids
)
-
len
(
image_1_token_ids
)
+
1
):
if
prompt_token_ids
[
i
:
i
+
len
(
image_1_token_ids
)]
==
image_1_token_ids
:
new_token_ids
.
append
(
_IMAGE_TOKEN_ID
)
# merge consecutive tag ids
merged_token_ids
:
List
[
int
]
=
[]
for
is_placeholder
,
token_ids
in
itertools
.
groupby
(
prompt_token_ids
,
lambda
x
:
x
==
_IMAGE_TOKEN_ID
):
if
is_placeholder
:
merged_token_ids
.
append
(
_IMAGE_TOKEN_ID
)
else
:
merged_token_ids
.
extend
(
list
(
token_ids
))
# No need to further scan the list since we only replace once
new_token_ids
.
extend
(
prompt_token_ids
[
i
+
len
(
image_1_token_ids
):])
break
# TODO: Move this to utils or integrate with clip.
new_token_ids
:
List
[
int
]
=
[]
placeholder_idx
=
0
while
merged_token_ids
:
token_id
=
merged_token_ids
.
pop
(
0
)
if
token_id
==
_IMAGE_TOKEN_ID
:
new_token_ids
.
extend
(
repeat_and_pad_token
(
_IMAGE_TOKEN_ID
,
repeat_count
=
image_feature_size
[
placeholder_idx
],
))
placeholder_idx
+=
1
else
:
new_token_ids
.
append
(
prompt_
token_id
s
[
i
]
)
new_token_ids
.
append
(
token_id
)
# NOTE: Create a defensive copy of the original inputs
llm_inputs
=
LLMInputs
(
prompt_token_ids
=
new_token_ids
,
prompt
=
new_prompt
,
multi_modal_data
=
multi_modal_data
)
return
input_processor_for_clip
(
model_config
,
CLIP_VIT_LARGE_PATCH14_336_CONFIG
,
llm_inputs
,
image_token_id
=
_IMAGE_TOKEN_ID
,
image_feature_size_override
=
image_feature_size
,
)
return
llm_inputs
@
MULTIMODAL_REGISTRY
.
register_image_input_mapper
()
...
...
@@ -483,10 +519,19 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal):
self
.
sampler
=
Sampler
()
def
_validate_image_sizes
(
self
,
data
:
torch
.
Tensor
)
->
torch
.
Tensor
:
if
list
(
data
.
shape
[
1
:])
!=
[
2
]:
raise
ValueError
(
f
"The expected shape of image sizes is batch dimension plus "
f
"
{
[
2
]
}
. You supplied
{
tuple
(
data
.
shape
)
}
."
)
expected_dims
=
(
2
,
)
def
_validate_shape
(
d
:
torch
.
Tensor
):
actual_dims
=
tuple
(
d
.
shape
)
if
actual_dims
!=
expected_dims
:
expected_expr
=
str
(
expected_dims
)
raise
ValueError
(
f
"The expected shape of image sizes per image per batch "
f
"is
{
expected_expr
}
. You supplied
{
tuple
(
d
.
shape
)
}
."
)
for
d
in
data
:
_validate_shape
(
d
)
return
data
...
...
@@ -503,7 +548,7 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal):
if
actual_dims
!=
expected_dims
:
expected_expr
=
(
"num_patches"
,
*
map
(
str
,
expected_dims
))
raise
ValueError
(
"The expected shape of pixel values
in each batch element
"
"The expected shape of pixel values
per image per batch
"
f
"is
{
expected_expr
}
. You supplied
{
tuple
(
d
.
shape
)
}
."
)
for
d
in
data
:
...
...
@@ -528,22 +573,24 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal):
raise
ValueError
(
"Incorrect type of pixel values. "
f
"Got type:
{
type
(
pixel_values
)
}
"
)
if
not
isinstance
(
image_sizes
,
torch
.
Tensor
):
if
not
isinstance
(
image_sizes
,
(
torch
.
Tensor
,
list
)
):
raise
ValueError
(
"Incorrect type of image sizes. "
f
"Got type:
{
type
(
image_sizes
)
}
"
)
return
Phi3VImagePixelInputs
(
type
=
"pixel_values"
,
data
=
self
.
_validate_pixel_values
(
pixel_values
),
image_sizes
=
self
.
_validate_image_sizes
(
image_sizes
))
data
=
self
.
_validate_pixel_values
(
flatten_bn
(
pixel_values
)),
image_sizes
=
self
.
_validate_image_sizes
(
flatten_bn
(
image_sizes
,
concat
=
True
)))
if
image_embeds
is
not
None
:
if
not
isinstance
(
image_embeds
,
torch
.
Tensor
):
raise
ValueError
(
"Incorrect type of image embeddings. "
f
"Got type:
{
type
(
image_embeds
)
}
"
)
return
Phi3VImageEmbeddingInputs
(
type
=
"image_embeds"
,
data
=
image_embeds
,
data
=
flatten_bn
(
image_embeds
)
,
)
raise
AssertionError
(
"This line should be unreachable."
)
...
...
@@ -616,23 +663,27 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal):
(
".gate_up_proj"
,
".gate_proj"
,
0
),
(
".gate_up_proj"
,
".up_proj"
,
1
),
]
# TODO(ChristopherCho): This is a temporary fix to load
# the vision weights with CLIPVisionModel.load_weights()
vision_weights
=
[]
params_dict
=
dict
(
self
.
named_parameters
())
for
name
,
loaded_weight
in
weights
:
if
"rotary_emb.inv_freq"
in
name
:
continue
# post_layernorm is not needed in CLIPVisionModel
if
"vision_model.post_layernorm"
in
name
:
# Skip loading the img_processor weights since they are
# loaded separately.
if
"vision_embed_tokens.img_processor"
in
name
:
vision_weights
.
append
((
name
,
loaded_weight
))
continue
for
key_to_modify
,
new_key
in
_KEYS_TO_MODIFY_MAPPING
.
items
():
if
key_to_modify
in
name
:
name
=
name
.
replace
(
key_to_modify
,
new_key
)
for
(
param_name
,
weight_name
,
shard_id
)
in
stacked_params_mapping
:
# We only do sharding for language model
# and not vision model for now.
if
"vision_embed_tokens"
in
name
and
self
.
vision_embed_tokens
:
continue
if
weight_name
not
in
name
:
continue
param
=
params_dict
[
name
.
replace
(
weight_name
,
param_name
)]
weight_loader
=
param
.
weight_loader
weight_loader
(
param
,
loaded_weight
,
shard_id
)
...
...
@@ -646,3 +697,11 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal):
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
weight_loader
(
param
,
loaded_weight
)
# We use regex to extract the sub-module name
# from "model.vision_embed_tokens.img_processor.*"
vision_weights
=
[
(
re
.
search
(
r
"vision_embed_tokens\.img_processor\.(.*)"
,
n
).
group
(
1
),
w
)
for
n
,
w
in
vision_weights
]
self
.
vision_embed_tokens
.
img_processor
.
load_weights
(
vision_weights
)
vllm/model_executor/models/phimoe.py
0 → 100644
View file @
0640f227
# coding=utf-8
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only PhiMoE model."""
from
typing
import
Iterable
,
List
,
Optional
,
Tuple
import
torch
from
torch
import
nn
from
transformers.configuration_utils
import
PretrainedConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.config
import
CacheConfig
,
LoRAConfig
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.linear
import
(
QKVParallelLinear
,
ReplicatedLinear
,
RowParallelLinear
)
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
DEFAULT_VOCAB_PADDING_SIZE
,
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsLoRA
class PhiMoEConfig(PretrainedConfig):
    """Configuration container for the PhiMoE model.

    Every constructor argument is stored as an attribute of the same name.
    The token-id and embedding-tying arguments, together with any extra
    keyword arguments, are forwarded to ``PretrainedConfig.__init__``.
    """

    model_type = "phimoe"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=14336,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=4096 * 32,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=1e6,
        sliding_window=None,
        attention_dropout=0.0,
        num_experts_per_tok=2,
        num_local_experts=16,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        router_jitter_noise=0.0,
        attention_bias=False,
        lm_head_bias=False,
        **kwargs,
    ):
        # Core transformer dimensions.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window
        self.attention_bias = attention_bias
        self.lm_head_bias = lm_head_bias

        # Backward compatibility: when no KV-head count is given, use one
        # KV head per attention head.
        self.num_key_value_heads = (num_attention_heads
                                    if num_key_value_heads is None else
                                    num_key_value_heads)

        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout

        # Mixture-of-experts routing settings.
        self.num_experts_per_tok = num_experts_per_tok
        self.num_local_experts = num_local_experts
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        self.router_jitter_noise = router_jitter_noise

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
class mp(torch.autograd.Function):
    """Autograd function with a hand-written gradient for the gate scores.

    The forward pass returns ``multiplier * mask_for_one``. The backward
    pass produces a gradient only for ``scores``; the other four inputs
    receive ``None``.
    """

    @staticmethod
    def forward(
        ctx,
        scores: torch.Tensor,
        multiplier: torch.Tensor,
        selected_experts: torch.Tensor,
        masked_gates: torch.Tensor,
        mask_for_one: torch.Tensor,
    ):
        # Keep what backward needs; ``scores`` itself is not stored.
        ctx.save_for_backward(multiplier, selected_experts, masked_gates)
        return multiplier * mask_for_one

    @staticmethod
    def backward(
        ctx,
        grad_at_output: torch.Tensor,
    ):
        multiplier, selected_experts, masked_gates = ctx.saved_tensors

        scaled_grad = grad_at_output * multiplier

        # Every entry gets -gate * grad; the selected expert's entry
        # additionally receives +grad through the scatter-add.
        grad_at_scores = masked_gates * scaled_grad.mul(-1)
        grad_at_scores.scatter_add_(
            dim=-1,
            index=selected_experts,
            src=scaled_grad,
        )

        return grad_at_scores, None, None, None, None
def sparsemixer(scores, jitter_eps=0.01):
    """Select two experts per row of ``scores`` and compute their weights.

    The first expert is the arg-max of ``scores`` along the last dimension.
    The second is the arg-max after the first expert's entry has been set
    to ``-inf``. Each weight is the softmax probability of the chosen
    expert, where entries whose relative gap to the current maximum
    exceeds ``2 * jitter_eps`` are masked out before the softmax.

    Returns:
        A ``(multiplier, selected_experts)`` tuple; both tensors have the
        two experts concatenated along the last dimension.
    """

    def _pick_expert(candidates):
        # The arg-max and the sparsity mask are computed without autograd.
        with torch.no_grad():
            best_logit, best_index = candidates.max(dim=-1, keepdim=True)
            # The gap is always measured on the unmasked ``scores``, even
            # when ``candidates`` already has the first expert removed.
            denom = scores.abs().clamp(min=best_logit)
            drop = ((best_logit - scores) / denom) > (2 * jitter_eps)

        gates = torch.softmax(candidates.masked_fill(drop, float("-inf")),
                              dim=-1)
        return gates.gather(dim=-1, index=best_index), best_index

    # First expert: chosen over all scores.
    first_weight, first_expert = _pick_expert(scores)

    # Second expert: chosen after removing the first one.
    remaining = torch.scatter(scores, -1, first_expert, float("-inf"))
    second_weight, second_expert = _pick_expert(remaining)

    return (
        torch.cat((first_weight, second_weight), dim=-1),
        torch.cat((first_expert, second_expert), dim=-1),
    )
def phimoe_routing_function(
    hidden_states: torch.Tensor,
    gating_output: torch.Tensor,
    topk: int,
    renormalize: bool,
):
    """Routing function for PhiMoE: delegate to ``sparsemixer``.

    ``hidden_states`` is only used to check that its first dimension
    matches that of ``gating_output``. Only ``topk == 2`` without
    renormalization is accepted.

    Returns:
        The ``(topk_weights, topk_ids)`` pair produced by ``sparsemixer``.
    """
    num_tokens = hidden_states.shape[0]
    assert num_tokens == gating_output.shape[0], ("Number of tokens mismatch")
    assert topk == 2, "Only top-2 routing is supported"
    assert renormalize is False, "Renormalization is not supported"

    return sparsemixer(gating_output)
class PhiMoE(nn.Module):
    """A tensor-parallel MoE implementation for PhiMoE that shards each expert
    across all ranks.

    Each expert's weights are sharded across all ranks and a fused MoE
    kernel is used for the forward pass, and finally we reduce the outputs
    across ranks.
    """

    def __init__(
        self,
        num_experts: int,
        top_k: int,
        hidden_size: int,
        intermediate_size: int,
        params_dtype: Optional[torch.dtype] = None,
        quant_config: Optional[QuantizationConfig] = None,
        tp_size: Optional[int] = None,
    ):
        super().__init__()
        self.hidden_size = hidden_size

        # The router is built without a quantization config: the gate
        # always runs at half / full precision for now.
        self.gate = ReplicatedLinear(
            hidden_size,
            num_experts,
            bias=False,
            params_dtype=params_dtype,
            quant_config=None,
        )

        # Expert selection is delegated to the PhiMoE-specific routing
        # function; renormalization is disabled.
        self.experts = FusedMoE(
            num_experts=num_experts,
            top_k=top_k,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            params_dtype=params_dtype,
            reduce_results=True,
            renormalize=False,
            quant_config=quant_config,
            tp_size=tp_size,
            custom_routing_function=phimoe_routing_function)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Route the tokens through the experts, keeping the input shape."""
        # NOTE: hidden_states can have either 1D or 2D shape.
        input_shape = hidden_states.shape
        flat_states = hidden_states.view(-1, self.hidden_size)

        # router_logits: (num_tokens, n_experts)
        router_logits, _ = self.gate(flat_states)
        expert_output = self.experts(flat_states, router_logits)
        return expert_output.view(input_shape)
class PhiMoEAttention(nn.Module):
    """Self-attention block with a fused QKV projection and rotary embedding."""

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
        max_position: int = 4096 * 32,
        rope_theta: float = 10000,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        rope_scaling: Optional[dict] = None,
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        tp_size = get_tensor_model_parallel_world_size()

        # Query heads are split evenly across tensor-parallel ranks.
        self.total_num_heads = num_heads
        assert self.total_num_heads % tp_size == 0
        self.num_heads = self.total_num_heads // tp_size

        self.total_num_kv_heads = num_kv_heads
        if self.total_num_kv_heads >= tp_size:
            # At least as many KV heads as ranks: partition the KV heads
            # across multiple tensor parallel GPUs.
            assert self.total_num_kv_heads % tp_size == 0
        else:
            # Fewer KV heads than ranks: replicate the KV heads across
            # multiple tensor parallel GPUs.
            assert tp_size % self.total_num_kv_heads == 0
        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)

        # Per-rank sizes used to split the fused QKV output.
        self.head_dim = hidden_size // self.total_num_heads
        self.q_size = self.num_heads * self.head_dim
        self.kv_size = self.num_kv_heads * self.head_dim
        self.scaling = self.head_dim**-0.5
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling

        # Both projections carry a bias and are built without a
        # quantization config.
        self.qkv_proj = QKVParallelLinear(
            hidden_size,
            self.head_dim,
            self.total_num_heads,
            self.total_num_kv_heads,
            bias=True,
            quant_config=None,
        )
        self.o_proj = RowParallelLinear(
            self.total_num_heads * self.head_dim,
            hidden_size,
            bias=True,
            quant_config=None,
        )
        self.rotary_emb = get_rope(
            self.head_dim,
            rotary_dim=self.head_dim,
            max_position=max_position,
            base=int(self.rope_theta),
            is_neox_style=True,
            rope_scaling=self.rope_scaling,
        )
        self.attn = Attention(
            self.num_heads,
            self.head_dim,
            self.scaling,
            num_kv_heads=self.num_kv_heads,
            cache_config=cache_config,
            quant_config=quant_config,
        )

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor:
        """Project to Q/K/V, apply rotary embedding, attend, project out."""
        fused_qkv, _ = self.qkv_proj(hidden_states)
        split_sizes = [self.q_size, self.kv_size, self.kv_size]
        query, key, value = fused_qkv.split(split_sizes, dim=-1)
        query, key = self.rotary_emb(positions, query, key)
        context = self.attn(query, key, value, kv_cache, attn_metadata)
        projected, _ = self.o_proj(context)
        return projected
class PhiMoEDecoderLayer(nn.Module):
    """One PhiMoE transformer layer: self-attention followed by an MoE block.

    Each sub-block is preceded by an ``nn.LayerNorm`` and wrapped in a
    residual connection.
    """

    def __init__(
        self,
        config: PhiMoEConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        super().__init__()
        self.hidden_size = config.hidden_size
        # Requires transformers > 4.32.0
        rope_theta = getattr(config, "rope_theta", 10000)
        # PhiMoEConfig.__init__ does not define ``rope_scaling``; the
        # attribute only exists when it was supplied through the config's
        # extra keyword arguments. Fall back to None so a config built with
        # defaults does not raise AttributeError here.
        rope_scaling = getattr(config, "rope_scaling", None)
        self.self_attn = PhiMoEAttention(
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            max_position=config.max_position_embeddings,
            num_kv_heads=config.num_key_value_heads,
            rope_theta=rope_theta,
            cache_config=cache_config,
            quant_config=quant_config,
            rope_scaling=rope_scaling,
        )
        self.block_sparse_moe = PhiMoE(
            num_experts=config.num_local_experts,
            top_k=config.num_experts_per_tok,
            hidden_size=config.hidden_size,
            intermediate_size=config.intermediate_size,
            quant_config=quant_config,
        )
        # Both norms are affine LayerNorms that take their epsilon from the
        # config's ``rms_norm_eps`` field.
        self.input_layernorm = nn.LayerNorm(config.hidden_size,
                                            eps=config.rms_norm_eps,
                                            elementwise_affine=True)
        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
                                                     eps=config.rms_norm_eps,
                                                     elementwise_affine=True)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: AttentionMetadata,
        residual: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run the layer.

        Args:
            positions: Forwarded to the attention block.
            hidden_states: Layer input.
            kv_cache: Forwarded to the attention block.
            attn_metadata: Forwarded to the attention block.
            residual: Accepted for interface compatibility; its incoming
                value is not used because the residual is re-derived from
                ``hidden_states``.

        Returns:
            ``(hidden_states, residual)`` where ``residual`` is the input to
            the MoE sub-block (the output of the attention sub-block).
        """
        residual = hidden_states

        # Self Attention
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,
            kv_cache=kv_cache,
            attn_metadata=attn_metadata,
        )
        hidden_states = hidden_states + residual

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.block_sparse_moe(hidden_states)
        hidden_states = hidden_states + residual
        return hidden_states, residual
class PhiMoEModel(nn.Module):
    """Token embedding, a stack of PhiMoE decoder layers, and a final norm."""

    def __init__(
        self,
        config: PhiMoEConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        lora_config: Optional[LoRAConfig] = None,
    ) -> None:
        super().__init__()
        self.padding_idx = config.pad_token_id

        # Extra vocabulary rows for LoRA adapters (none without LoRA).
        lora_vocab = 0
        if lora_config:
            lora_vocab = (lora_config.lora_extra_vocab_size *
                          (lora_config.max_loras or 1))
        self.vocab_size = config.vocab_size + lora_vocab
        self.org_vocab_size = config.vocab_size

        self.embed_tokens = VocabParallelEmbedding(
            self.vocab_size,
            config.hidden_size,
            org_num_embeddings=config.vocab_size,
        )

        decoder_layers = [
            PhiMoEDecoderLayer(config, cache_config, quant_config=quant_config)
            for _ in range(config.num_hidden_layers)
        ]
        self.layers = nn.ModuleList(decoder_layers)

        self.norm = nn.LayerNorm(config.hidden_size,
                                 eps=config.rms_norm_eps,
                                 elementwise_affine=True)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor:
        """Embed ``input_ids``, run every layer in order, then normalize."""
        hidden_states = self.embed_tokens(input_ids)
        residual = None
        # Layer ``idx`` uses the KV cache stored at the same index.
        for idx, layer in enumerate(self.layers):
            hidden_states, residual = layer(positions, hidden_states,
                                            kv_caches[idx], attn_metadata,
                                            residual)
        return self.norm(hidden_states)
class PhiMoEForCausalLM(nn.Module, SupportsLoRA):
    """PhiMoE decoder with an LM head, logits processor and sampler.

    The class-level attributes below describe how checkpoint projections are
    packed and which modules participate in LoRA.
    """

    fall_back_to_pt_during_load = False

    # The q/k/v projections are packed into the single "qkv_proj" module.
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
    }

    # LoRA specific attributes
    supported_lora_modules = [
        "qkv_proj",
        "o_proj",
        "embed_tokens",
        "lm_head",
    ]
    embedding_modules = {
        "embed_tokens": "input_embeddings",
        "lm_head": "output_embeddings",
    }
    embedding_padding_modules = ["lm_head"]

    def __init__(
        self,
        config: PhiMoEConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        lora_config: Optional[LoRAConfig] = None,
    ) -> None:
        """Build the backbone model, LM head, logits processor and sampler."""
        super().__init__()
        self.config = config
        self.lora_config = lora_config

        self.model = PhiMoEModel(config,
                                 cache_config,
                                 quant_config,
                                 lora_config=lora_config)
        # The LM head covers the base vocabulary plus any extra LoRA tokens.
        self.unpadded_vocab_size = config.vocab_size
        if lora_config:
            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
        # NOTE(review): bias is hard-coded to True here although the config
        # carries an `lm_head_bias` field — confirm this is intentional.
        self.lm_head = ParallelLMHead(
            self.unpadded_vocab_size,
            config.hidden_size,
            org_num_embeddings=config.vocab_size,
            padding_size=(
                DEFAULT_VOCAB_PADDING_SIZE
                # We need bigger padding if using lora for kernel
                # compatibility
                if not lora_config else lora_config.lora_vocab_padding_size),
            quant_config=None,
            bias=True,
        )
        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                config.vocab_size)
        self.sampler = Sampler()

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
    ) -> torch.Tensor:
        """Return the backbone's hidden states.

        ``intermediate_tensors`` is accepted but not used by this method.
        """
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   attn_metadata)
        return hidden_states

    def compute_logits(self, hidden_states: torch.Tensor,
                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
        """Run the logits processor with the LM head on ``hidden_states``."""
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

    def sample(
        self,
        logits: Optional[torch.Tensor],
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        """Pass ``logits`` to the sampler and return its output."""
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        """Load checkpoint tensors into this module's parameters.

        Each ``(name, tensor)`` pair is tried against three cases in order:
        a q/k/v projection loaded as a shard of ``qkv_proj``, an expert
        weight matched through the FusedMoE expert mapping, and finally a
        plain parameter loaded with its own or the default weight loader.
        """
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("qkv_proj", "q_proj", "q"),
            ("qkv_proj", "k_proj", "k"),
            ("qkv_proj", "v_proj", "v"),
        ]

        # Checkpoint expert projections are named w1 (gate), w2 (down) and
        # w3 (up).
        expert_params_mapping = FusedMoE.make_expert_params_mapping(
            ckpt_gate_proj_name="w1",
            ckpt_down_proj_name="w2",
            ckpt_up_proj_name="w3",
            num_experts=self.config.num_local_experts)

        params_dict = dict(self.named_parameters())
        for name, loaded_weight in weights:
            # Rotary inverse-frequency entries are not loaded.
            if "rotary_emb.inv_freq" in name:
                continue

            # Case 1: q/k/v projections.
            for param_name, weight_name, shard_id in stacked_params_mapping:
                if weight_name not in name:
                    continue
                # NOTE: `name` is rewritten in place, so the rewritten name
                # is what later iterations and the `else` branch see.
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue

                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                # Case 2: expert weights (reached only when no stacked
                # mapping ended with `break`).
                for mapping in expert_params_mapping:
                    param_name, weight_name, expert_id, shard_id = mapping
                    if weight_name not in name:
                        continue
                    name = name.replace(weight_name, param_name)
                    param = params_dict[name]
                    weight_loader = param.weight_loader
                    weight_loader(
                        param,
                        loaded_weight,
                        weight_name,
                        shard_id=shard_id,
                        expert_id=expert_id,
                    )
                    break
                else:
                    # Case 3: every remaining parameter.
                    # Skip loading extra bias for GPTQ models.
                    if name.endswith(".bias") and name not in params_dict:
                        continue
                    # Remapping the name of FP8 kv-scale.
                    name = maybe_remap_kv_scale_name(name, params_dict)
                    if name is None:
                        continue

                    param = params_dict[name]
                    # Parameters without a custom loader use the default one.
                    weight_loader = getattr(param, "weight_loader",
                                            default_weight_loader)
                    weight_loader(param, loaded_weight)
vllm/model_executor/models/qwen.py
View file @
0640f227
...
...
@@ -25,12 +25,12 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from
vllm.model_executor.layers.quantization.base_config
import
(
QuantizationConfig
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
Sampler
from
vllm.model_executor.layers.sampler
import
Sampler
,
SamplerOutput
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
,
SamplerOutput
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
print_warning_once
from
vllm
import
_custom_ops
as
ops
...
...
Prev
1
…
10
11
12
13
14
15
16
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment