Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
e4bb2684
Unverified
Commit
e4bb2684
authored
Nov 19, 2025
by
Isotr0py
Committed by
GitHub
Nov 18, 2025
Browse files
[Models] Replace all `nn.Conv2d` with vLLM's Conv2dLayer (#28842)
Signed-off-by:
Isotr0py
<
mozf@mail2.sysu.edu.cn
>
parent
c64c0b78
Changes
20
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
83 additions
and
45 deletions
+83
-45
vllm/model_executor/layers/conv.py
vllm/model_executor/layers/conv.py
+22
-2
vllm/model_executor/models/aimv2.py
vllm/model_executor/models/aimv2.py
+2
-1
vllm/model_executor/models/blip.py
vllm/model_executor/models/blip.py
+2
-1
vllm/model_executor/models/chameleon.py
vllm/model_executor/models/chameleon.py
+14
-15
vllm/model_executor/models/deepencoder.py
vllm/model_executor/models/deepencoder.py
+8
-5
vllm/model_executor/models/dots_ocr.py
vllm/model_executor/models/dots_ocr.py
+2
-1
vllm/model_executor/models/glm4_1v.py
vllm/model_executor/models/glm4_1v.py
+2
-2
vllm/model_executor/models/glm4v.py
vllm/model_executor/models/glm4v.py
+3
-2
vllm/model_executor/models/idefics2_vision_model.py
vllm/model_executor/models/idefics2_vision_model.py
+2
-1
vllm/model_executor/models/intern_vit.py
vllm/model_executor/models/intern_vit.py
+2
-1
vllm/model_executor/models/interns1_vit.py
vllm/model_executor/models/interns1_vit.py
+2
-1
vllm/model_executor/models/keye.py
vllm/model_executor/models/keye.py
+2
-1
vllm/model_executor/models/midashenglm.py
vllm/model_executor/models/midashenglm.py
+2
-1
vllm/model_executor/models/moonvit.py
vllm/model_executor/models/moonvit.py
+2
-1
vllm/model_executor/models/paddleocr_vl.py
vllm/model_executor/models/paddleocr_vl.py
+2
-1
vllm/model_executor/models/pixtral.py
vllm/model_executor/models/pixtral.py
+3
-2
vllm/model_executor/models/qwen_vl.py
vllm/model_executor/models/qwen_vl.py
+2
-1
vllm/model_executor/models/siglip.py
vllm/model_executor/models/siglip.py
+2
-1
vllm/model_executor/models/siglip2navit.py
vllm/model_executor/models/siglip2navit.py
+3
-2
vllm/model_executor/models/step3_vl.py
vllm/model_executor/models/step3_vl.py
+4
-3
No files found.
vllm/model_executor/layers/conv.py
View file @
e4bb2684
...
@@ -3,6 +3,7 @@
...
@@ -3,6 +3,7 @@
"""Conv Layer Class."""
"""Conv Layer Class."""
import
math
import
math
from
typing
import
Literal
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
...
@@ -23,11 +24,11 @@ class ConvLayerBase(CustomOp):
...
@@ -23,11 +24,11 @@ class ConvLayerBase(CustomOp):
out_channels
:
int
,
out_channels
:
int
,
kernel_size
:
int
|
tuple
[
int
,
...],
kernel_size
:
int
|
tuple
[
int
,
...],
stride
:
int
|
tuple
[
int
,
...]
=
1
,
stride
:
int
|
tuple
[
int
,
...]
=
1
,
padding
:
int
|
tuple
[
int
,
...]
=
0
,
padding
:
int
|
tuple
[
int
,
...]
|
Literal
[
"same"
,
"valid"
]
=
0
,
dilation
:
int
|
tuple
[
int
,
...]
=
1
,
dilation
:
int
|
tuple
[
int
,
...]
=
1
,
groups
:
int
=
1
,
groups
:
int
=
1
,
bias
:
bool
=
True
,
bias
:
bool
=
True
,
padding_mode
:
str
=
"zeros"
,
padding_mode
:
Literal
[
"zeros"
,
"reflect"
,
"replicate"
,
"circular"
]
=
"zeros"
,
*
,
*
,
params_dtype
:
torch
.
dtype
|
None
=
None
,
params_dtype
:
torch
.
dtype
|
None
=
None
,
)
->
None
:
)
->
None
:
...
@@ -36,6 +37,22 @@ class ConvLayerBase(CustomOp):
...
@@ -36,6 +37,22 @@ class ConvLayerBase(CustomOp):
if
params_dtype
is
None
:
if
params_dtype
is
None
:
params_dtype
=
torch
.
get_default_dtype
()
params_dtype
=
torch
.
get_default_dtype
()
valid_padding_strings
=
{
"same"
,
"valid"
}
if
isinstance
(
padding
,
str
)
and
padding
not
in
valid_padding_strings
:
raise
ValueError
(
f
"Invalid padding string '
{
padding
}
'. "
f
"Expected one of
{
valid_padding_strings
}
."
)
if
padding
==
"same"
:
padding
=
(
kernel_size
//
2
if
isinstance
(
kernel_size
,
int
)
else
tuple
(
k
//
2
for
k
in
kernel_size
)
)
elif
padding
==
"valid"
:
padding
=
0
kernel_size
=
(
kernel_size
=
(
(
kernel_size
,)
*
self
.
num_dim
(
kernel_size
,)
*
self
.
num_dim
if
isinstance
(
kernel_size
,
int
)
if
isinstance
(
kernel_size
,
int
)
...
@@ -45,6 +62,9 @@ class ConvLayerBase(CustomOp):
...
@@ -45,6 +62,9 @@ class ConvLayerBase(CustomOp):
padding
=
(
padding
,)
*
self
.
num_dim
if
isinstance
(
padding
,
int
)
else
padding
padding
=
(
padding
,)
*
self
.
num_dim
if
isinstance
(
padding
,
int
)
else
padding
dilation
=
(
dilation
,)
*
self
.
num_dim
if
isinstance
(
dilation
,
int
)
else
dilation
dilation
=
(
dilation
,)
*
self
.
num_dim
if
isinstance
(
dilation
,
int
)
else
dilation
if
padding
==
"same"
and
any
(
s
!=
1
for
s
in
stride
):
raise
ValueError
(
"padding='same' is not supported for strided convolutions"
)
self
.
in_channels
=
in_channels
self
.
in_channels
=
in_channels
self
.
out_channels
=
out_channels
self
.
out_channels
=
out_channels
self
.
kernel_size
=
kernel_size
self
.
kernel_size
=
kernel_size
...
...
vllm/model_executor/models/aimv2.py
View file @
e4bb2684
...
@@ -12,6 +12,7 @@ from vllm.attention.layer import MultiHeadAttention
...
@@ -12,6 +12,7 @@ from vllm.attention.layer import MultiHeadAttention
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.distributed.utils
import
divide
from
vllm.distributed.utils
import
divide
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
MergedColumnParallelLinear
,
...
@@ -58,7 +59,7 @@ class AIMv2SwiGLUFFN(nn.Module):
...
@@ -58,7 +59,7 @@ class AIMv2SwiGLUFFN(nn.Module):
class
AIMv2PatchEmbed
(
nn
.
Module
):
class
AIMv2PatchEmbed
(
nn
.
Module
):
def
__init__
(
self
,
config
:
AIMv2Config
):
def
__init__
(
self
,
config
:
AIMv2Config
):
super
().
__init__
()
super
().
__init__
()
self
.
proj
=
nn
.
Conv2d
(
self
.
proj
=
Conv2d
Layer
(
config
.
num_channels
,
config
.
num_channels
,
config
.
hidden_size
,
config
.
hidden_size
,
kernel_size
=
(
config
.
patch_size
,
config
.
patch_size
),
kernel_size
=
(
config
.
patch_size
,
config
.
patch_size
),
...
...
vllm/model_executor/models/blip.py
View file @
e4bb2684
...
@@ -12,6 +12,7 @@ from transformers import Blip2VisionConfig, BlipVisionConfig
...
@@ -12,6 +12,7 @@ from transformers import Blip2VisionConfig, BlipVisionConfig
from
vllm.attention.layer
import
MultiHeadAttention
from
vllm.attention.layer
import
MultiHeadAttention
from
vllm.distributed
import
divide
,
get_tensor_model_parallel_world_size
from
vllm.distributed
import
divide
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
QKVParallelLinear
,
QKVParallelLinear
,
...
@@ -47,7 +48,7 @@ class BlipVisionEmbeddings(nn.Module):
...
@@ -47,7 +48,7 @@ class BlipVisionEmbeddings(nn.Module):
self
.
class_embedding
=
nn
.
Parameter
(
torch
.
randn
(
1
,
1
,
self
.
embed_dim
))
self
.
class_embedding
=
nn
.
Parameter
(
torch
.
randn
(
1
,
1
,
self
.
embed_dim
))
self
.
patch_embedding
=
nn
.
Conv2d
(
self
.
patch_embedding
=
Conv2d
Layer
(
in_channels
=
3
,
in_channels
=
3
,
out_channels
=
self
.
embed_dim
,
out_channels
=
self
.
embed_dim
,
kernel_size
=
self
.
patch_size
,
kernel_size
=
self
.
patch_size
,
...
...
vllm/model_executor/models/chameleon.py
View file @
e4bb2684
...
@@ -22,6 +22,7 @@ from vllm.config.multimodal import BaseDummyOptions
...
@@ -22,6 +22,7 @@ from vllm.config.multimodal import BaseDummyOptions
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.distributed
import
get_pp_group
,
get_tensor_model_parallel_world_size
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
MergedColumnParallelLinear
,
...
@@ -549,7 +550,7 @@ class ChameleonVQVAEVectorQuantizer(nn.Module):
...
@@ -549,7 +550,7 @@ class ChameleonVQVAEVectorQuantizer(nn.Module):
class
ChameleonVQVAEEncoderConvDownsample
(
nn
.
Module
):
class
ChameleonVQVAEEncoderConvDownsample
(
nn
.
Module
):
def
__init__
(
self
,
in_channels
:
int
):
def
__init__
(
self
,
in_channels
:
int
):
super
().
__init__
()
super
().
__init__
()
self
.
conv
=
nn
.
Conv2d
(
self
.
conv
=
Conv2d
Layer
(
in_channels
,
in_channels
,
kernel_size
=
3
,
stride
=
2
,
padding
=
0
in_channels
,
in_channels
,
kernel_size
=
3
,
stride
=
2
,
padding
=
0
)
)
...
@@ -577,23 +578,23 @@ class ChameleonVQVAEEncoderResnetBlock(nn.Module):
...
@@ -577,23 +578,23 @@ class ChameleonVQVAEEncoderResnetBlock(nn.Module):
self
.
norm1
=
torch
.
nn
.
GroupNorm
(
self
.
norm1
=
torch
.
nn
.
GroupNorm
(
num_groups
=
32
,
num_channels
=
in_channels
,
eps
=
1e-6
,
affine
=
True
num_groups
=
32
,
num_channels
=
in_channels
,
eps
=
1e-6
,
affine
=
True
)
)
self
.
conv1
=
torch
.
nn
.
Conv2d
(
self
.
conv1
=
Conv2d
Layer
(
in_channels
,
out_channels
,
kernel_size
=
3
,
stride
=
1
,
padding
=
1
in_channels
,
out_channels
,
kernel_size
=
3
,
stride
=
1
,
padding
=
1
)
)
self
.
norm2
=
torch
.
nn
.
GroupNorm
(
self
.
norm2
=
torch
.
nn
.
GroupNorm
(
num_groups
=
32
,
num_channels
=
out_channels
,
eps
=
1e-6
,
affine
=
True
num_groups
=
32
,
num_channels
=
out_channels
,
eps
=
1e-6
,
affine
=
True
)
)
self
.
dropout
=
torch
.
nn
.
Dropout
(
config
.
dropout
)
self
.
dropout
=
torch
.
nn
.
Dropout
(
config
.
dropout
)
self
.
conv2
=
torch
.
nn
.
Conv2d
(
self
.
conv2
=
Conv2d
Layer
(
out_channels
,
out_channels
,
kernel_size
=
3
,
stride
=
1
,
padding
=
1
out_channels
,
out_channels
,
kernel_size
=
3
,
stride
=
1
,
padding
=
1
)
)
if
self
.
in_channels
!=
self
.
out_channels
:
if
self
.
in_channels
!=
self
.
out_channels
:
if
self
.
use_conv_shortcut
:
if
self
.
use_conv_shortcut
:
self
.
conv_shortcut
=
torch
.
nn
.
Conv2d
(
self
.
conv_shortcut
=
Conv2d
Layer
(
in_channels
,
out_channels
,
kernel_size
=
3
,
stride
=
1
,
padding
=
1
in_channels
,
out_channels
,
kernel_size
=
3
,
stride
=
1
,
padding
=
1
)
)
else
:
else
:
self
.
nin_shortcut
=
torch
.
nn
.
Conv2d
(
self
.
nin_shortcut
=
Conv2d
Layer
(
in_channels
,
out_channels
,
kernel_size
=
1
,
stride
=
1
,
padding
=
0
in_channels
,
out_channels
,
kernel_size
=
1
,
stride
=
1
,
padding
=
0
)
)
...
@@ -626,16 +627,16 @@ class ChameleonVQVAEEncoderAttnBlock(nn.Module):
...
@@ -626,16 +627,16 @@ class ChameleonVQVAEEncoderAttnBlock(nn.Module):
self
.
norm
=
torch
.
nn
.
GroupNorm
(
self
.
norm
=
torch
.
nn
.
GroupNorm
(
num_groups
=
32
,
num_channels
=
in_channels
,
eps
=
1e-6
,
affine
=
True
num_groups
=
32
,
num_channels
=
in_channels
,
eps
=
1e-6
,
affine
=
True
)
)
self
.
q
=
torch
.
nn
.
Conv2d
(
self
.
q
=
Conv2d
Layer
(
in_channels
,
in_channels
,
kernel_size
=
1
,
stride
=
1
,
padding
=
0
in_channels
,
in_channels
,
kernel_size
=
1
,
stride
=
1
,
padding
=
0
)
)
self
.
k
=
torch
.
nn
.
Conv2d
(
self
.
k
=
Conv2d
Layer
(
in_channels
,
in_channels
,
kernel_size
=
1
,
stride
=
1
,
padding
=
0
in_channels
,
in_channels
,
kernel_size
=
1
,
stride
=
1
,
padding
=
0
)
)
self
.
v
=
torch
.
nn
.
Conv2d
(
self
.
v
=
Conv2d
Layer
(
in_channels
,
in_channels
,
kernel_size
=
1
,
stride
=
1
,
padding
=
0
in_channels
,
in_channels
,
kernel_size
=
1
,
stride
=
1
,
padding
=
0
)
)
self
.
proj_out
=
torch
.
nn
.
Conv2d
(
self
.
proj_out
=
Conv2d
Layer
(
in_channels
,
in_channels
,
kernel_size
=
1
,
stride
=
1
,
padding
=
0
in_channels
,
in_channels
,
kernel_size
=
1
,
stride
=
1
,
padding
=
0
)
)
...
@@ -681,7 +682,7 @@ class ChameleonVQVAEEncoder(nn.Module):
...
@@ -681,7 +682,7 @@ class ChameleonVQVAEEncoder(nn.Module):
latent_channels
=
config
.
latent_channels
latent_channels
=
config
.
latent_channels
channel_multiplier
=
config
.
channel_multiplier
channel_multiplier
=
config
.
channel_multiplier
self
.
conv_in
=
torch
.
nn
.
Conv2d
(
self
.
conv_in
=
Conv2d
Layer
(
in_channels
,
base_channels
,
kernel_size
=
3
,
stride
=
1
,
padding
=
1
in_channels
,
base_channels
,
kernel_size
=
3
,
stride
=
1
,
padding
=
1
)
)
...
@@ -738,7 +739,7 @@ class ChameleonVQVAEEncoder(nn.Module):
...
@@ -738,7 +739,7 @@ class ChameleonVQVAEEncoder(nn.Module):
self
.
norm_out
=
torch
.
nn
.
GroupNorm
(
self
.
norm_out
=
torch
.
nn
.
GroupNorm
(
num_groups
=
32
,
num_channels
=
block_in
,
eps
=
1e-6
,
affine
=
True
num_groups
=
32
,
num_channels
=
block_in
,
eps
=
1e-6
,
affine
=
True
)
)
self
.
conv_out
=
torch
.
nn
.
Conv2d
(
self
.
conv_out
=
Conv2d
Layer
(
block_in
,
block_in
,
2
*
latent_channels
if
double_latent
else
latent_channels
,
2
*
latent_channels
if
double_latent
else
latent_channels
,
kernel_size
=
3
,
kernel_size
=
3
,
...
@@ -779,10 +780,8 @@ class ChameleonVQVAE(nn.Module):
...
@@ -779,10 +780,8 @@ class ChameleonVQVAE(nn.Module):
super
().
__init__
()
super
().
__init__
()
self
.
encoder
=
ChameleonVQVAEEncoder
(
config
)
self
.
encoder
=
ChameleonVQVAEEncoder
(
config
)
self
.
quantize
=
ChameleonVQVAEVectorQuantizer
(
config
)
self
.
quantize
=
ChameleonVQVAEVectorQuantizer
(
config
)
self
.
quant_conv
=
torch
.
nn
.
Conv2d
(
config
.
latent_channels
,
config
.
embed_dim
,
1
)
self
.
quant_conv
=
Conv2dLayer
(
config
.
latent_channels
,
config
.
embed_dim
,
1
)
self
.
post_quant_conv
=
torch
.
nn
.
Conv2d
(
self
.
post_quant_conv
=
Conv2dLayer
(
config
.
embed_dim
,
config
.
latent_channels
,
1
)
config
.
embed_dim
,
config
.
latent_channels
,
1
)
self
.
eval
()
# Chameleon's VQ model is frozen
self
.
eval
()
# Chameleon's VQ model is frozen
def
encode
(
def
encode
(
...
...
vllm/model_executor/models/deepencoder.py
View file @
e4bb2684
...
@@ -19,6 +19,7 @@ import torch.nn.functional as F
...
@@ -19,6 +19,7 @@ import torch.nn.functional as F
from
transformers
import
CLIPVisionConfig
from
transformers
import
CLIPVisionConfig
from
vllm.attention.layer
import
MultiHeadAttention
from
vllm.attention.layer
import
MultiHeadAttention
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
...
@@ -133,14 +134,14 @@ class ImageEncoderViT(nn.Module):
...
@@ -133,14 +134,14 @@ class ImageEncoderViT(nn.Module):
self
.
blocks
.
append
(
block
)
self
.
blocks
.
append
(
block
)
self
.
neck
=
nn
.
Sequential
(
self
.
neck
=
nn
.
Sequential
(
nn
.
Conv2d
(
Conv2d
Layer
(
embed_dim
,
embed_dim
,
out_chans
,
out_chans
,
kernel_size
=
1
,
kernel_size
=
1
,
bias
=
False
,
bias
=
False
,
),
),
LayerNorm2d
(
out_chans
),
LayerNorm2d
(
out_chans
),
nn
.
Conv2d
(
Conv2d
Layer
(
out_chans
,
out_chans
,
out_chans
,
out_chans
,
kernel_size
=
3
,
kernel_size
=
3
,
...
@@ -150,8 +151,10 @@ class ImageEncoderViT(nn.Module):
...
@@ -150,8 +151,10 @@ class ImageEncoderViT(nn.Module):
LayerNorm2d
(
out_chans
),
LayerNorm2d
(
out_chans
),
)
)
self
.
net_2
=
nn
.
Conv2d
(
256
,
512
,
kernel_size
=
3
,
stride
=
2
,
padding
=
1
,
bias
=
False
)
self
.
net_2
=
Conv2dLayer
(
self
.
net_3
=
nn
.
Conv2d
(
256
,
512
,
kernel_size
=
3
,
stride
=
2
,
padding
=
1
,
bias
=
False
)
self
.
net_3
=
Conv2dLayer
(
512
,
1024
,
kernel_size
=
3
,
stride
=
2
,
padding
=
1
,
bias
=
False
512
,
1024
,
kernel_size
=
3
,
stride
=
2
,
padding
=
1
,
bias
=
False
)
)
...
@@ -500,7 +503,7 @@ class PatchEmbed(nn.Module):
...
@@ -500,7 +503,7 @@ class PatchEmbed(nn.Module):
"""
"""
super
().
__init__
()
super
().
__init__
()
self
.
proj
=
nn
.
Conv2d
(
self
.
proj
=
Conv2d
Layer
(
in_chans
,
embed_dim
,
kernel_size
=
kernel_size
,
stride
=
stride
,
padding
=
padding
in_chans
,
embed_dim
,
kernel_size
=
kernel_size
,
stride
=
stride
,
padding
=
padding
)
)
...
...
vllm/model_executor/models/dots_ocr.py
View file @
e4bb2684
...
@@ -22,6 +22,7 @@ from vllm.distributed.parallel_state import (
...
@@ -22,6 +22,7 @@ from vllm.distributed.parallel_state import (
get_tensor_model_parallel_world_size
,
get_tensor_model_parallel_world_size
,
)
)
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
...
@@ -471,7 +472,7 @@ class DotsPatchEmbed(nn.Module):
...
@@ -471,7 +472,7 @@ class DotsPatchEmbed(nn.Module):
self
.
temporal_patch_size
=
config
.
temporal_patch_size
self
.
temporal_patch_size
=
config
.
temporal_patch_size
self
.
embed_dim
=
config
.
embed_dim
self
.
embed_dim
=
config
.
embed_dim
self
.
config
=
config
self
.
config
=
config
self
.
proj
=
nn
.
Conv2d
(
self
.
proj
=
Conv2d
Layer
(
config
.
num_channels
,
config
.
num_channels
,
config
.
embed_dim
,
config
.
embed_dim
,
kernel_size
=
(
config
.
patch_size
,
config
.
patch_size
),
kernel_size
=
(
config
.
patch_size
,
config
.
patch_size
),
...
...
vllm/model_executor/models/glm4_1v.py
View file @
e4bb2684
...
@@ -56,7 +56,7 @@ from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
...
@@ -56,7 +56,7 @@ from vllm.config.multimodal import BaseDummyOptions, VideoDummyOptions
from
vllm.distributed
import
get_tensor_model_parallel_world_size
,
parallel_state
from
vllm.distributed
import
get_tensor_model_parallel_world_size
,
parallel_state
from
vllm.distributed
import
utils
as
dist_utils
from
vllm.distributed
import
utils
as
dist_utils
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.conv
import
Conv3dLayer
from
vllm.model_executor.layers.conv
import
Conv2dLayer
,
Conv3dLayer
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
...
@@ -734,7 +734,7 @@ class Glm4vVisionTransformer(nn.Module):
...
@@ -734,7 +734,7 @@ class Glm4vVisionTransformer(nn.Module):
self
.
post_conv_layernorm
=
RMSNorm
(
self
.
post_conv_layernorm
=
RMSNorm
(
vision_config
.
hidden_size
,
eps
=
vision_config
.
rms_norm_eps
vision_config
.
hidden_size
,
eps
=
vision_config
.
rms_norm_eps
)
)
self
.
downsample
=
nn
.
Conv2d
(
self
.
downsample
=
Conv2d
Layer
(
in_channels
=
vision_config
.
hidden_size
,
in_channels
=
vision_config
.
hidden_size
,
out_channels
=
vision_config
.
out_hidden_size
,
out_channels
=
vision_config
.
out_hidden_size
,
kernel_size
=
vision_config
.
spatial_merge_size
,
kernel_size
=
vision_config
.
spatial_merge_size
,
...
...
vllm/model_executor/models/glm4v.py
View file @
e4bb2684
...
@@ -24,6 +24,7 @@ from vllm.config import VllmConfig
...
@@ -24,6 +24,7 @@ from vllm.config import VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
SiluAndMul
,
get_act_fn
from
vllm.model_executor.layers.activation
import
SiluAndMul
,
get_act_fn
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
MergedColumnParallelLinear
,
MergedColumnParallelLinear
,
...
@@ -78,7 +79,7 @@ class GLMVImagePixelInputs(TensorSchema):
...
@@ -78,7 +79,7 @@ class GLMVImagePixelInputs(TensorSchema):
class
EVA2CLIPPatchEmbedding
(
nn
.
Module
):
class
EVA2CLIPPatchEmbedding
(
nn
.
Module
):
def
__init__
(
self
,
config
):
def
__init__
(
self
,
config
):
super
().
__init__
()
super
().
__init__
()
self
.
proj
=
nn
.
Conv2d
(
self
.
proj
=
Conv2d
Layer
(
config
.
in_channels
,
config
.
in_channels
,
config
.
hidden_size
,
config
.
hidden_size
,
kernel_size
=
config
.
patch_size
,
kernel_size
=
config
.
patch_size
,
...
@@ -333,7 +334,7 @@ class EVA2CLIPModel(nn.Module):
...
@@ -333,7 +334,7 @@ class EVA2CLIPModel(nn.Module):
quant_config
=
quant_config
,
quant_config
=
quant_config
,
prefix
=
f
"
{
prefix
}
.linear_proj"
,
prefix
=
f
"
{
prefix
}
.linear_proj"
,
)
)
self
.
conv
=
nn
.
Conv2d
(
self
.
conv
=
Conv2d
Layer
(
in_channels
=
vision_config
.
hidden_size
,
in_channels
=
vision_config
.
hidden_size
,
out_channels
=
config
.
hidden_size
,
out_channels
=
config
.
hidden_size
,
kernel_size
=
2
,
kernel_size
=
2
,
...
...
vllm/model_executor/models/idefics2_vision_model.py
View file @
e4bb2684
...
@@ -30,6 +30,7 @@ from transformers.models.idefics2.configuration_idefics2 import (
...
@@ -30,6 +30,7 @@ from transformers.models.idefics2.configuration_idefics2 import (
from
vllm.attention.layer
import
MultiHeadAttention
from
vllm.attention.layer
import
MultiHeadAttention
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
QKVParallelLinear
,
QKVParallelLinear
,
...
@@ -60,7 +61,7 @@ class Idefics2VisionEmbeddings(nn.Module):
...
@@ -60,7 +61,7 @@ class Idefics2VisionEmbeddings(nn.Module):
self
.
embed_dim
=
config
.
hidden_size
self
.
embed_dim
=
config
.
hidden_size
self
.
image_size
=
config
.
image_size
self
.
image_size
=
config
.
image_size
self
.
patch_size
=
config
.
patch_size
self
.
patch_size
=
config
.
patch_size
self
.
patch_embedding
=
nn
.
Conv2d
(
self
.
patch_embedding
=
Conv2d
Layer
(
in_channels
=
config
.
num_channels
,
in_channels
=
config
.
num_channels
,
out_channels
=
self
.
embed_dim
,
out_channels
=
self
.
embed_dim
,
kernel_size
=
self
.
patch_size
,
kernel_size
=
self
.
patch_size
,
...
...
vllm/model_executor/models/intern_vit.py
View file @
e4bb2684
...
@@ -24,6 +24,7 @@ from vllm.distributed import (
...
@@ -24,6 +24,7 @@ from vllm.distributed import (
tensor_model_parallel_all_gather
,
tensor_model_parallel_all_gather
,
)
)
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
...
@@ -51,7 +52,7 @@ class InternVisionEmbeddings(nn.Module):
...
@@ -51,7 +52,7 @@ class InternVisionEmbeddings(nn.Module):
self
.
class_embedding
=
nn
.
Parameter
(
torch
.
randn
(
1
,
1
,
self
.
embed_dim
))
self
.
class_embedding
=
nn
.
Parameter
(
torch
.
randn
(
1
,
1
,
self
.
embed_dim
))
self
.
patch_embedding
=
nn
.
Conv2d
(
self
.
patch_embedding
=
Conv2d
Layer
(
in_channels
=
3
,
in_channels
=
3
,
out_channels
=
self
.
embed_dim
,
out_channels
=
self
.
embed_dim
,
kernel_size
=
self
.
patch_size
,
kernel_size
=
self
.
patch_size
,
...
...
vllm/model_executor/models/interns1_vit.py
View file @
e4bb2684
...
@@ -16,6 +16,7 @@ from transformers.utils import torch_int
...
@@ -16,6 +16,7 @@ from transformers.utils import torch_int
from
vllm.attention.layer
import
MultiHeadAttention
from
vllm.attention.layer
import
MultiHeadAttention
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
ColumnParallelLinear
,
RowParallelLinear
from
vllm.model_executor.layers.linear
import
ColumnParallelLinear
,
RowParallelLinear
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
...
@@ -43,7 +44,7 @@ class InternS1VisionPatchEmbeddings(nn.Module):
...
@@ -43,7 +44,7 @@ class InternS1VisionPatchEmbeddings(nn.Module):
self
.
num_patches
=
num_patches
self
.
num_patches
=
num_patches
self
.
patch_shape
=
patch_shape
self
.
patch_shape
=
patch_shape
self
.
projection
=
nn
.
Conv2d
(
self
.
projection
=
Conv2d
Layer
(
num_channels
,
hidden_size
,
kernel_size
=
patch_size
,
stride
=
patch_size
num_channels
,
hidden_size
,
kernel_size
=
patch_size
,
stride
=
patch_size
)
)
...
...
vllm/model_executor/models/keye.py
View file @
e4bb2684
...
@@ -24,6 +24,7 @@ from vllm.config import VllmConfig
...
@@ -24,6 +24,7 @@ from vllm.config import VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
QKVParallelLinear
,
QKVParallelLinear
,
...
@@ -204,7 +205,7 @@ class KeyeVisionEmbeddings(nn.Module):
...
@@ -204,7 +205,7 @@ class KeyeVisionEmbeddings(nn.Module):
self
.
image_size
=
config
.
image_size
self
.
image_size
=
config
.
image_size
self
.
patch_size
=
config
.
patch_size
self
.
patch_size
=
config
.
patch_size
self
.
patch_embedding
=
nn
.
Conv2d
(
self
.
patch_embedding
=
Conv2d
Layer
(
in_channels
=
config
.
num_channels
,
in_channels
=
config
.
num_channels
,
out_channels
=
self
.
embed_dim
,
out_channels
=
self
.
embed_dim
,
kernel_size
=
self
.
patch_size
,
kernel_size
=
self
.
patch_size
,
...
...
vllm/model_executor/models/midashenglm.py
View file @
e4bb2684
...
@@ -39,6 +39,7 @@ from vllm.config import VllmConfig
...
@@ -39,6 +39,7 @@ from vllm.config import VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
QKVParallelLinear
,
QKVParallelLinear
,
...
@@ -120,7 +121,7 @@ class AudioPatchEmbed(nn.Module):
...
@@ -120,7 +121,7 @@ class AudioPatchEmbed(nn.Module):
self
.
num_patches
=
self
.
grid_size
[
0
]
*
self
.
grid_size
[
1
]
self
.
num_patches
=
self
.
grid_size
[
0
]
*
self
.
grid_size
[
1
]
self
.
flatten
=
flatten
self
.
flatten
=
flatten
self
.
proj
=
nn
.
Conv2d
(
self
.
proj
=
Conv2d
Layer
(
in_chans
,
in_chans
,
embed_dim
,
embed_dim
,
kernel_size
=
self
.
patch_size
,
kernel_size
=
self
.
patch_size
,
...
...
vllm/model_executor/models/moonvit.py
View file @
e4bb2684
...
@@ -53,6 +53,7 @@ from transformers.activations import ACT2FN
...
@@ -53,6 +53,7 @@ from transformers.activations import ACT2FN
from
transformers.modeling_utils
import
PreTrainedModel
from
transformers.modeling_utils
import
PreTrainedModel
from
transformers.utils
import
is_flash_attn_2_available
from
transformers.utils
import
is_flash_attn_2_available
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
ReplicatedLinear
from
vllm.model_executor.layers.linear
import
ReplicatedLinear
from
vllm.model_executor.models.utils
import
maybe_prefix
from
vllm.model_executor.models.utils
import
maybe_prefix
from
vllm.transformers_utils.configs.moonvit
import
MoonViTConfig
from
vllm.transformers_utils.configs.moonvit
import
MoonViTConfig
...
@@ -244,7 +245,7 @@ class MoonVisionPatchEmbed(nn.Module):
...
@@ -244,7 +245,7 @@ class MoonVisionPatchEmbed(nn.Module):
)
)
self
.
patch_size
=
patch_size
self
.
patch_size
=
patch_size
self
.
proj
=
nn
.
Conv2d
(
self
.
proj
=
Conv2d
Layer
(
in_dim
,
out_dim
,
kernel_size
=
patch_size
,
stride
=
patch_size
in_dim
,
out_dim
,
kernel_size
=
patch_size
,
stride
=
patch_size
)
)
...
...
vllm/model_executor/models/paddleocr_vl.py
View file @
e4bb2684
...
@@ -45,6 +45,7 @@ from vllm.config.multimodal import BaseDummyOptions
...
@@ -45,6 +45,7 @@ from vllm.config.multimodal import BaseDummyOptions
from
vllm.distributed
import
parallel_state
from
vllm.distributed
import
parallel_state
from
vllm.distributed
import
utils
as
dist_utils
from
vllm.distributed
import
utils
as
dist_utils
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
QKVParallelLinear
,
QKVParallelLinear
,
...
@@ -419,7 +420,7 @@ class SiglipVisionEmbeddings(nn.Module):
...
@@ -419,7 +420,7 @@ class SiglipVisionEmbeddings(nn.Module):
self
.
image_size
=
config
.
image_size
self
.
image_size
=
config
.
image_size
self
.
patch_size
=
config
.
patch_size
self
.
patch_size
=
config
.
patch_size
self
.
patch_embedding
=
nn
.
Conv2d
(
self
.
patch_embedding
=
Conv2d
Layer
(
in_channels
=
config
.
num_channels
,
in_channels
=
config
.
num_channels
,
out_channels
=
self
.
embed_dim
,
out_channels
=
self
.
embed_dim
,
kernel_size
=
self
.
patch_size
,
kernel_size
=
self
.
patch_size
,
...
...
vllm/model_executor/models/pixtral.py
View file @
e4bb2684
...
@@ -31,6 +31,7 @@ from vllm.config import VllmConfig
...
@@ -31,6 +31,7 @@ from vllm.config import VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.distributed
import
divide
,
get_tensor_model_parallel_world_size
from
vllm.distributed
import
divide
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_and_mul_fn
from
vllm.model_executor.layers.activation
import
get_act_and_mul_fn
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
MergedColumnParallelLinear
,
MergedColumnParallelLinear
,
...
@@ -747,7 +748,7 @@ class VisionTransformer(nn.Module):
...
@@ -747,7 +748,7 @@ class VisionTransformer(nn.Module):
def
__init__
(
self
,
args
:
VisionEncoderArgs
):
def
__init__
(
self
,
args
:
VisionEncoderArgs
):
super
().
__init__
()
super
().
__init__
()
self
.
args
=
args
self
.
args
=
args
self
.
patch_conv
=
nn
.
Conv2d
(
self
.
patch_conv
=
Conv2d
Layer
(
in_channels
=
args
.
num_channels
,
in_channels
=
args
.
num_channels
,
out_channels
=
args
.
hidden_size
,
out_channels
=
args
.
hidden_size
,
kernel_size
=
args
.
patch_size
,
kernel_size
=
args
.
patch_size
,
...
@@ -1212,7 +1213,7 @@ class PixtralHFVisionModel(nn.Module):
...
@@ -1212,7 +1213,7 @@ class PixtralHFVisionModel(nn.Module):
self
.
config
=
config
self
.
config
=
config
self
.
patch_conv
=
nn
.
Conv2d
(
self
.
patch_conv
=
Conv2d
Layer
(
in_channels
=
config
.
num_channels
,
in_channels
=
config
.
num_channels
,
out_channels
=
config
.
hidden_size
,
out_channels
=
config
.
hidden_size
,
kernel_size
=
config
.
patch_size
,
kernel_size
=
config
.
patch_size
,
...
...
vllm/model_executor/models/qwen_vl.py
View file @
e4bb2684
...
@@ -25,6 +25,7 @@ from transformers.tokenization_utils_base import TextInput
...
@@ -25,6 +25,7 @@ from transformers.tokenization_utils_base import TextInput
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
ReplicatedLinear
,
ReplicatedLinear
,
...
@@ -333,7 +334,7 @@ class VisionTransformer(nn.Module):
...
@@ -333,7 +334,7 @@ class VisionTransformer(nn.Module):
patch_height
,
patch_width
=
self
.
patch_size
=
(
patch_size
,
patch_size
)
patch_height
,
patch_width
=
self
.
patch_size
=
(
patch_size
,
patch_size
)
self
.
grid_size
=
(
image_height
//
patch_height
,
image_width
//
patch_width
)
self
.
grid_size
=
(
image_height
//
patch_height
,
image_width
//
patch_width
)
self
.
output_dim
=
output_dim
self
.
output_dim
=
output_dim
self
.
conv1
=
nn
.
Conv2d
(
self
.
conv1
=
Conv2d
Layer
(
in_channels
=
3
,
in_channels
=
3
,
out_channels
=
width
,
out_channels
=
width
,
kernel_size
=
patch_size
,
kernel_size
=
patch_size
,
...
...
vllm/model_executor/models/siglip.py
View file @
e4bb2684
...
@@ -24,6 +24,7 @@ from vllm.config import VllmConfig
...
@@ -24,6 +24,7 @@ from vllm.config import VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.distributed
import
divide
,
get_tensor_model_parallel_world_size
from
vllm.distributed
import
divide
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
QKVParallelLinear
,
QKVParallelLinear
,
...
@@ -286,7 +287,7 @@ class SiglipVisionEmbeddings(nn.Module):
...
@@ -286,7 +287,7 @@ class SiglipVisionEmbeddings(nn.Module):
self
.
image_size
=
config
.
image_size
self
.
image_size
=
config
.
image_size
self
.
patch_size
=
config
.
patch_size
self
.
patch_size
=
config
.
patch_size
self
.
patch_embedding
=
nn
.
Conv2d
(
self
.
patch_embedding
=
Conv2d
Layer
(
in_channels
=
config
.
num_channels
,
in_channels
=
config
.
num_channels
,
out_channels
=
self
.
embed_dim
,
out_channels
=
self
.
embed_dim
,
kernel_size
=
self
.
patch_size
,
kernel_size
=
self
.
patch_size
,
...
...
vllm/model_executor/models/siglip2navit.py
View file @
e4bb2684
...
@@ -16,6 +16,7 @@ from vllm.attention.backends.registry import AttentionBackendEnum
...
@@ -16,6 +16,7 @@ from vllm.attention.backends.registry import AttentionBackendEnum
from
vllm.attention.layer
import
maybe_get_vit_flash_attn_backend
from
vllm.attention.layer
import
maybe_get_vit_flash_attn_backend
from
vllm.distributed
import
divide
,
get_tensor_model_parallel_world_size
from
vllm.distributed
import
divide
,
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
LinearBase
,
LinearBase
,
...
@@ -67,7 +68,7 @@ class Siglip2VisionEmbeddings(nn.Module):
...
@@ -67,7 +68,7 @@ class Siglip2VisionEmbeddings(nn.Module):
self
.
position_embedding
=
nn
.
Embedding
(
self
.
num_patches
,
self
.
embed_dim
)
self
.
position_embedding
=
nn
.
Embedding
(
self
.
num_patches
,
self
.
embed_dim
)
else
:
else
:
self
.
patch_embedding
=
nn
.
Conv2d
(
self
.
patch_embedding
=
Conv2d
Layer
(
in_channels
=
config
.
num_channels
,
in_channels
=
config
.
num_channels
,
out_channels
=
self
.
embed_dim
,
out_channels
=
self
.
embed_dim
,
kernel_size
=
self
.
patch_size
,
kernel_size
=
self
.
patch_size
,
...
@@ -99,7 +100,7 @@ class Siglip2VisionEmbeddings(nn.Module):
...
@@ -99,7 +100,7 @@ class Siglip2VisionEmbeddings(nn.Module):
target_dtype
=
self
.
patch_embedding
.
weight
.
dtype
target_dtype
=
self
.
patch_embedding
.
weight
.
dtype
if
isinstance
(
self
.
patch_embedding
,
LinearBase
):
if
isinstance
(
self
.
patch_embedding
,
LinearBase
):
patch_embeds
=
self
.
patch_embedding
(
pixel_values
.
to
(
dtype
=
target_dtype
))
patch_embeds
=
self
.
patch_embedding
(
pixel_values
.
to
(
dtype
=
target_dtype
))
elif
isinstance
(
self
.
patch_embedding
,
nn
.
Conv2d
):
elif
isinstance
(
self
.
patch_embedding
,
Conv2d
Layer
):
pixel_values
=
pixel_values
.
view
(
pixel_values
=
pixel_values
.
view
(
-
1
,
-
1
,
self
.
config
.
num_channels
*
self
.
config
.
temporal_patch_size
,
self
.
config
.
num_channels
*
self
.
config
.
temporal_patch_size
,
...
...
vllm/model_executor/models/step3_vl.py
View file @
e4bb2684
...
@@ -20,6 +20,7 @@ from vllm.config import VllmConfig
...
@@ -20,6 +20,7 @@ from vllm.config import VllmConfig
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.config.multimodal
import
BaseDummyOptions
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.distributed
import
get_tensor_model_parallel_world_size
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.activation
import
get_act_fn
from
vllm.model_executor.layers.conv
import
Conv2dLayer
from
vllm.model_executor.layers.linear
import
(
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
ColumnParallelLinear
,
QKVParallelLinear
,
QKVParallelLinear
,
...
@@ -667,7 +668,7 @@ class Step3VisionEmbeddings(nn.Module):
...
@@ -667,7 +668,7 @@ class Step3VisionEmbeddings(nn.Module):
self
.
class_embedding
=
nn
.
Parameter
(
torch
.
randn
(
1
,
self
.
embed_dim
))
self
.
class_embedding
=
nn
.
Parameter
(
torch
.
randn
(
1
,
self
.
embed_dim
))
self
.
patch_embedding
=
nn
.
Conv2d
(
self
.
patch_embedding
=
Conv2d
Layer
(
in_channels
=
config
.
num_channels
,
in_channels
=
config
.
num_channels
,
out_channels
=
self
.
embed_dim
,
out_channels
=
self
.
embed_dim
,
kernel_size
=
self
.
patch_size
,
kernel_size
=
self
.
patch_size
,
...
@@ -950,13 +951,13 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
...
@@ -950,13 +951,13 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
prefix
=
maybe_prefix
(
prefix
,
"vision_model"
),
prefix
=
maybe_prefix
(
prefix
,
"vision_model"
),
use_data_parallel
=
self
.
use_data_parallel
,
use_data_parallel
=
self
.
use_data_parallel
,
)
)
self
.
vit_downsampler
=
nn
.
Conv2d
(
self
.
vit_downsampler
=
Conv2d
Layer
(
config
.
vision_config
.
hidden_size
,
config
.
vision_config
.
hidden_size
,
config
.
vision_config
.
output_hidden_size
,
config
.
vision_config
.
output_hidden_size
,
kernel_size
=
2
,
kernel_size
=
2
,
stride
=
config
.
understand_projector_stride
,
stride
=
config
.
understand_projector_stride
,
)
)
self
.
vit_downsampler2
=
nn
.
Conv2d
(
self
.
vit_downsampler2
=
Conv2d
Layer
(
config
.
vision_config
.
output_hidden_size
,
config
.
vision_config
.
output_hidden_size
,
config
.
vision_config
.
output_hidden_size
*
2
,
config
.
vision_config
.
output_hidden_size
*
2
,
kernel_size
=
3
,
kernel_size
=
3
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment