Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
96ae75ad
Commit
96ae75ad
authored
Jan 04, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.6.6.post1' into v0.6.6.post1-dev
parents
f9f4a735
2339d59f
Changes
374
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1595 additions
and
322 deletions
+1595
-322
vllm/model_executor/layers/quantization/utils/fp8_utils.py
vllm/model_executor/layers/quantization/utils/fp8_utils.py
+353
-0
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
+11
-2
vllm/model_executor/layers/rejection_sampler.py
vllm/model_executor/layers/rejection_sampler.py
+3
-7
vllm/model_executor/layers/sampler.py
vllm/model_executor/layers/sampler.py
+6
-45
vllm/model_executor/layers/utils.py
vllm/model_executor/layers/utils.py
+57
-0
vllm/model_executor/model_loader/loader.py
vllm/model_executor/model_loader/loader.py
+108
-1
vllm/model_executor/model_loader/tensorizer.py
vllm/model_executor/model_loader/tensorizer.py
+14
-11
vllm/model_executor/model_loader/utils.py
vllm/model_executor/model_loader/utils.py
+8
-3
vllm/model_executor/model_loader/weight_utils.py
vllm/model_executor/model_loader/weight_utils.py
+26
-1
vllm/model_executor/models/adapters.py
vllm/model_executor/models/adapters.py
+170
-20
vllm/model_executor/models/aria.py
vllm/model_executor/models/aria.py
+10
-10
vllm/model_executor/models/bert.py
vllm/model_executor/models/bert.py
+2
-2
vllm/model_executor/models/deepseek_v3.py
vllm/model_executor/models/deepseek_v3.py
+650
-0
vllm/model_executor/models/gemma2.py
vllm/model_executor/models/gemma2.py
+17
-1
vllm/model_executor/models/jamba.py
vllm/model_executor/models/jamba.py
+35
-1
vllm/model_executor/models/llava.py
vllm/model_executor/models/llava.py
+2
-2
vllm/model_executor/models/molmo.py
vllm/model_executor/models/molmo.py
+59
-46
vllm/model_executor/models/phi3v.py
vllm/model_executor/models/phi3v.py
+30
-17
vllm/model_executor/models/pixtral.py
vllm/model_executor/models/pixtral.py
+29
-149
vllm/model_executor/models/qwen2.py
vllm/model_executor/models/qwen2.py
+5
-4
No files found.
vllm/model_executor/layers/quantization/utils/fp8_utils.py
0 → 100644
View file @
96ae75ad
# Adapted from https://github.com/sgl-project/sglang/pull/2575
from
typing
import
List
,
Optional
,
Tuple
import
torch
import
triton
import
triton.language
as
tl
def apply_w8a8_block_fp8_linear(
    input: torch.Tensor,
    weight: torch.Tensor,
    block_size: List[int],
    weight_scale: torch.Tensor,
    input_scale: Optional[torch.Tensor] = None,
    bias: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Run a linear layer with block-quantized fp8 weights.

    The activation is flattened to 2D, quantized on the fly with
    `per_token_group_quant_fp8` (group size `block_size[1]`), multiplied
    with `weight` through `w8a8_block_fp8_matmul`, optionally offset by
    `bias`, and finally reshaped so that the last dimension becomes
    `weight.shape[0]`.

    Args:
        input: The activation tensor; its last dimension is the reduction
            dimension.
        weight: The quantized weight; `weight.shape[0]` is used as the
            output feature size.
        block_size: The `[block_n, block_k]` quantization block size.
        weight_scale: The block-wise scale that belongs to `weight`.
        input_scale: Must be None; static activation scales are not
            accepted by this function.
        bias: Optional bias added to the matmul result.

    Returns:
        torch.Tensor: The result in `input.dtype` with shape
        `(*input.shape[:-1], weight.shape[0])`.
    """
    assert input_scale is None

    result_dtype = input.dtype
    leading_dims = input.shape[:-1]
    out_features = weight.shape[0]

    # The fp8 helpers operate on a 2D matrix.
    flat_input = input.view(-1, input.shape[-1])

    quantized, act_scale = per_token_group_quant_fp8(flat_input,
                                                     block_size[1])
    result = w8a8_block_fp8_matmul(quantized,
                                   weight,
                                   act_scale,
                                   weight_scale,
                                   block_size,
                                   output_dtype=result_dtype)

    if bias is not None:
        result = result + bias
    return result.to(dtype=result_dtype).view(*leading_dims, out_features)
def
input_to_float8
(
x
:
torch
.
Tensor
,
dtype
:
torch
.
dtype
=
torch
.
float8_e4m3fn
)
->
Tuple
[
torch
.
Tensor
,
torch
.
Tensor
]:
"""This function quantizes input values to float8 values "
"with tensor-wise quantization."""
finfo
=
torch
.
finfo
(
dtype
)
min_val
,
max_val
=
x
.
aminmax
()
amax
=
torch
.
maximum
(
min_val
.
abs
(),
max_val
.
abs
()).
clamp
(
min
=
1e-12
)
scale
=
finfo
.
max
/
amax
x_scl_sat
=
(
x
*
scale
).
clamp
(
min
=
finfo
.
min
,
max
=
finfo
.
max
)
return
x_scl_sat
.
to
(
dtype
).
contiguous
(),
scale
.
float
().
reciprocal
()
def block_quant_to_tensor_quant(
    x_q_block: torch.Tensor,
    x_s: torch.Tensor,
    block_size: List[int],
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Convert a block-wise quantized tensor to tensor-wise quantization.

    Every `block_size[0] x block_size[1]` tile of `x_q_block` is
    dequantized in float32 with its entry of `x_s`, then the whole tensor
    is re-quantized by `input_to_float8` using the dtype of `x_q_block`.
    Note only float8 is supported for now.

    Args:
        x_q_block: The 2D block-wise quantized tensor.
        x_s: The per-tile scales; its shape must equal the number of
            tiles along each dimension (asserted).
        block_size: The `[block_n, block_k]` tile size.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The tensor-wise quantized
        tensor and its tensor-wise scale.
    """
    block_n, block_k = block_size[0], block_size[1]
    n, k = x_q_block.shape
    n_tiles = (n + block_n - 1) // block_n
    k_tiles = (k + block_k - 1) // block_k
    assert n_tiles == x_s.shape[0]
    assert k_tiles == x_s.shape[1]

    x_dq_block = x_q_block.to(torch.float32)

    # Scale each tile in place; edge tiles are clipped to the tensor bounds.
    for col_tile in range(k_tiles):
        col_span = slice(col_tile * block_k, min((col_tile + 1) * block_k, k))
        for row_tile in range(n_tiles):
            row_span = slice(row_tile * block_n,
                             min((row_tile + 1) * block_n, n))
            x_dq_block[row_span, col_span] = (
                x_dq_block[row_span, col_span] * x_s[row_tile][col_tile])

    return input_to_float8(x_dq_block, dtype=x_q_block.dtype)
@triton.jit
def _per_token_group_quant_fp8(
    # Pointers to inputs and output
    y_ptr,
    y_q_ptr,
    y_s_ptr,
    # Stride of input
    y_stride,
    # Columns of input
    N,
    # Avoid to divide zero
    eps,
    # Information for float8
    fp8_min,
    fp8_max,
    # Meta-parameters
    BLOCK: tl.constexpr,
):
    """Triton kernel performing per-token-group quantization to float8.

    Each program instance handles one group: it reads up to `BLOCK`
    values (masked to the first `N`), derives the scale
    `max(|y|, eps) / fp8_max`, writes the clamped quotient cast to the
    element type of `y_q_ptr`, and writes the scale to
    `y_s_ptr[program_id]`.
    """
    # One program per group; move the pointers to this group's data.
    g_id = tl.program_id(0)
    y_ptr += g_id * y_stride
    y_q_ptr += g_id * y_stride
    y_s_ptr += g_id

    # N <= BLOCK, so lanes beyond N are masked out.
    offsets = tl.arange(0, BLOCK)
    in_bounds = offsets < N

    values = tl.load(y_ptr + offsets, mask=in_bounds,
                     other=0.0).to(tl.float32)

    # Quant
    abs_max = tl.maximum(tl.max(tl.abs(values)), eps)
    scale = abs_max / fp8_max
    quantized = tl.clamp(values / scale, fp8_min,
                         fp8_max).to(y_q_ptr.dtype.element_ty)

    tl.store(y_q_ptr + offsets, quantized, mask=in_bounds)
    tl.store(y_s_ptr, scale)
def per_token_group_quant_fp8(
    x: torch.Tensor,
    group_size: int,
    eps: float = 1e-10,
    dtype: torch.dtype = torch.float8_e4m3fn,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Perform per-token-group quantization on the input tensor `x`.

    Consecutive runs of `group_size` elements along the last dimension
    are quantized to float8 independently, each with its own scale.

    Args:
        x: The input tensor. It must be contiguous and its last dimension
            must be divisible by `group_size` (both are asserted).
        group_size: The group size used for quantization.
        eps: The lower bound applied to a group's absolute maximum to
            avoid dividing by zero.
        dtype: The dtype of the output tensor. Note that only
            `torch.float8_e4m3fn` is supported for now.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The quantized tensor (same
        shape as `x`) and the float32 scaling factors of shape
        `x.shape[:-1] + (x.shape[-1] // group_size,)`.
    """
    assert (x.shape[-1] % group_size == 0), (
        f"the last dimension of `x` {x.shape[-1]} must be divisible "
        f"by `group_size` {group_size}")
    assert x.is_contiguous(), "`x` must be contiguous"

    fp8_limits = torch.finfo(dtype)

    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
    scale_shape = x.shape[:-1] + (x.shape[-1] // group_size, )
    x_s = torch.empty(scale_shape, device=x.device, dtype=torch.float32)

    # One kernel program per group of `group_size` elements.
    num_groups = x.numel() // group_size
    block = triton.next_power_of_2(group_size)
    # heuristics for number of warps
    warps = min(max(block // 256, 1), 8)

    _per_token_group_quant_fp8[(num_groups, )](
        x,
        x_q,
        x_s,
        group_size,
        group_size,
        eps,
        fp8_min=fp8_limits.min,
        fp8_max=fp8_limits.max,
        BLOCK=block,
        num_warps=warps,
        num_stages=1,
    )

    return x_q, x_s
@triton.jit
def _w8a8_block_fp8_matmul(
    # Pointers to inputs and output
    A,
    B,
    C,
    As,
    Bs,
    # Shape for matmul
    M,
    N,
    K,
    # Block size for block-wise quantization
    group_n,
    group_k,
    # Stride for inputs and output
    stride_am,
    stride_ak,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_As_m,
    stride_As_k,
    stride_Bs_k,
    stride_Bs_n,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    """Triton kernel for a matmul of block-wise quantized `A` and `B`.

    Each program computes one `BLOCK_SIZE_M x BLOCK_SIZE_N` tile of `C`.
    For every K-step the partial dot product is multiplied by the
    matching scales loaded from `As` and `Bs` before it is added to a
    float32 accumulator; the accumulator is cast to the element type of
    `C` when it is stored.
    """
    # Map the 1D program id onto a (pid_m, pid_n) tile; programs are
    # arranged in groups of GROUP_SIZE_M tile rows.
    pid = tl.program_id(axis=0)
    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    # Row/column offsets wrap modulo M/N so that loads stay in range;
    # the store below masks the out-of-range part of the tile.
    offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    a_ptrs = A + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
    b_ptrs = B + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)

    # Scale pointers: one per row of the A tile, and one per column of
    # the B tile selected by the column's quantization group.
    As_ptrs = As + offs_am * stride_As_m
    offs_bsn = offs_bn // group_n
    Bs_ptrs = Bs + offs_bsn * stride_Bs_n

    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        # Mask the tail of the K dimension on the last step.
        a = tl.load(a_ptrs,
                    mask=offs_k[None, :] < K - k * BLOCK_SIZE_K,
                    other=0.0)
        b = tl.load(b_ptrs,
                    mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
                    other=0.0)

        # Index of the quantization group along K for this step.
        k_start = k * BLOCK_SIZE_K
        offs_ks = k_start // group_k
        a_s = tl.load(As_ptrs + offs_ks * stride_As_k)
        b_s = tl.load(Bs_ptrs + offs_ks * stride_Bs_k)

        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk

    # Cast the accumulator to the element type of the output.
    if C.dtype.element_ty == tl.bfloat16:
        c = accumulator.to(tl.bfloat16)
    elif C.dtype.element_ty == tl.float16:
        c = accumulator.to(tl.float16)
    else:
        c = accumulator.to(tl.float32)

    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = C + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
    tl.store(c_ptrs, c, mask=c_mask)
def w8a8_block_fp8_matmul(
    A: torch.Tensor,
    B: torch.Tensor,
    As: torch.Tensor,
    Bs: torch.Tensor,
    block_size: List[int],
    output_dtype: torch.dtype = torch.float16,
) -> torch.Tensor:
    """Perform matrix multiplication with block-wise quantization.

    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
    The output is returned in the specified `output_dtype`.

    Args:
        A: The input tensor, e.g., activation. Must be contiguous.
        B: The input tensor, e.g., weight. Must be 2D and contiguous;
            its shape is read as `(N, K)`.
        As: The per-token-group quantization scale for `A`.
        Bs: The per-block quantization scale for `B`. Must be 2D.
        block_size: The block size for per-block quantization. It should
            be 2-dim, e.g., [128, 128].
        output_dtype: The dtype of the returned tensor.

    Returns:
        torch.Tensor: The result of matmul, with shape
        `A.shape[:-1] + (N,)`.
    """
    assert len(block_size) == 2
    block_n, block_k = block_size[0], block_size[1]

    # Shape checks for the activation and its scales.
    assert A.shape[-1] == B.shape[-1]
    assert A.shape[:-1] == As.shape[:-1] and A.is_contiguous()
    assert triton.cdiv(A.shape[-1], block_k) == As.shape[-1]
    M = A.numel() // A.shape[-1]

    # Shape checks for the weight and its scales.
    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
    N, K = B.shape
    assert triton.cdiv(N, block_n) == Bs.shape[0]
    assert triton.cdiv(K, block_k) == Bs.shape[1]

    C = A.new_empty(A.shape[:-1] + (N, ), dtype=output_dtype)

    # TODO:
    # BLOCK_SIZE_M, BLOCK_SIZE_K, BLOCK_SIZE_N can be optimized.
    # BLOCK_SIZE_K must be divisible by block_k
    # BLOCK_SIZE_N and BLOCK_SIZE_M has no requirements
    BLOCK_SIZE_M = 128
    if M < BLOCK_SIZE_M:
        # Shrink the tile for small M, but never below 16 rows.
        BLOCK_SIZE_M = max(triton.next_power_of_2(M), 16)
    BLOCK_SIZE_K = block_k
    assert block_k % BLOCK_SIZE_K == 0
    BLOCK_SIZE_N = block_n

    def grid(META):
        # One program per output tile.
        tiles_m = triton.cdiv(M, META["BLOCK_SIZE_M"])
        tiles_n = triton.cdiv(N, META["BLOCK_SIZE_N"])
        return (tiles_m * tiles_n, )

    _w8a8_block_fp8_matmul[grid](
        A,
        B,
        C,
        As,
        Bs,
        M,
        N,
        K,
        block_n,
        block_k,
        A.stride(-2),
        A.stride(-1),
        # B is stored as (N, K); the kernel receives its K stride first.
        B.stride(1),
        B.stride(0),
        C.stride(-2),
        C.stride(-1),
        As.stride(-2),
        As.stride(-1),
        Bs.stride(1),
        Bs.stride(0),
        BLOCK_SIZE_M=BLOCK_SIZE_M,
        BLOCK_SIZE_N=BLOCK_SIZE_N,
        BLOCK_SIZE_K=BLOCK_SIZE_K,
        GROUP_SIZE_M=8,
    )

    return C
vllm/model_executor/layers/quantization/utils/w8a8_utils.py
View file @
96ae75ad
...
@@ -12,9 +12,18 @@ TORCH_DEVICE_IDENTITY = torch.ones(1, dtype=torch.float32)
...
@@ -12,9 +12,18 @@ TORCH_DEVICE_IDENTITY = torch.ones(1, dtype=torch.float32)
W8A8_TRITONJSON
=
W8a8GetCacheJSON
()
W8A8_TRITONJSON
=
W8a8GetCacheJSON
()
def sparse_cutlass_supported() -> bool:
    """Return whether the sparse CUTLASS scaled-mm op reports support.

    Non-CUDA platforms are rejected outright. Otherwise the device
    capability is converted to an int (-1 when no capability is
    reported) and passed to `ops.cutlass_sparse_scaled_mm_supported`.
    """
    if not current_platform.is_cuda():
        return False

    capability_tuple = current_platform.get_device_capability()
    if capability_tuple is None:
        capability = -1
    else:
        capability = capability_tuple.to_int()

    return ops.cutlass_sparse_scaled_mm_supported(capability)
def
cutlass_fp8_supported
()
->
bool
:
def
cutlass_fp8_supported
()
->
bool
:
# cutlass is not supported on Rocm
if
not
current_platform
.
is_cuda
():
if
current_platform
.
is_rocm
():
return
False
return
False
capability_tuple
=
current_platform
.
get_device_capability
()
capability_tuple
=
current_platform
.
get_device_capability
()
...
...
vllm/model_executor/layers/rejection_sampler.py
View file @
96ae75ad
from
functools
import
cached_property
from
functools
import
cached_property
from
importlib.util
import
find_spec
from
importlib.util
import
find_spec
from
typing
import
Dict
,
List
,
Optional
,
Tuple
from
typing
import
Dict
,
Optional
,
Tuple
import
torch
import
torch
import
torch.jit
import
torch.jit
...
@@ -386,16 +386,12 @@ def _multinomial(
...
@@ -386,16 +386,12 @@ def _multinomial(
if
not
seeded_seqs
:
if
not
seeded_seqs
:
q
.
exponential_
(
1.0
)
q
.
exponential_
(
1.0
)
else
:
else
:
non_seeded_indices
:
List
[
int
]
=
[]
start
=
0
start
=
0
for
idx
in
range
(
len
(
q
)
//
k
):
for
idx
in
range
(
len
(
q
)
//
k
):
end
=
start
+
k
end
=
start
+
k
generator
=
seeded_seqs
.
get
(
idx
)
generator
=
seeded_seqs
.
get
(
idx
)
if
generator
is
None
:
# Note: generator might be None for non seeded
non_seeded_indices
.
extend
(
list
(
range
(
start
,
end
)))
q
[
start
:
end
].
exponential_
(
1.0
,
generator
=
generator
)
else
:
q
[
start
:
end
].
exponential_
(
1.0
,
generator
=
generator
)
start
=
end
start
=
end
q
[
non_seeded_indices
].
exponential_
(
1.0
)
return
probs
.
div_
(
q
).
argmax
(
dim
=
1
).
view
(
-
1
,
num_samples
)
return
probs
.
div_
(
q
).
argmax
(
dim
=
1
).
view
(
-
1
,
num_samples
)
vllm/model_executor/layers/sampler.py
View file @
96ae75ad
...
@@ -11,6 +11,7 @@ import torch
...
@@ -11,6 +11,7 @@ import torch
import
torch.nn
as
nn
import
torch.nn
as
nn
import
vllm.envs
as
envs
import
vllm.envs
as
envs
from
vllm.model_executor.layers.utils
import
apply_penalties
from
vllm.model_executor.sampling_metadata
import
(
SamplingMetadata
,
from
vllm.model_executor.sampling_metadata
import
(
SamplingMetadata
,
SamplingTensors
,
SamplingTensors
,
SequenceGroupToSample
)
SequenceGroupToSample
)
...
@@ -270,11 +271,11 @@ class Sampler(nn.Module):
...
@@ -270,11 +271,11 @@ class Sampler(nn.Module):
# Apply presence and frequency penalties.
# Apply presence and frequency penalties.
if
do_penalties
:
if
do_penalties
:
logits
=
_
apply_penalties
(
logits
,
sampling_tensors
.
prompt_tokens
,
logits
=
apply_penalties
(
logits
,
sampling_tensors
.
prompt_tokens
,
sampling_tensors
.
output_tokens
,
sampling_tensors
.
output_tokens
,
sampling_tensors
.
presence_penalties
,
sampling_tensors
.
presence_penalties
,
sampling_tensors
.
frequency_penalties
,
sampling_tensors
.
frequency_penalties
,
sampling_tensors
.
repetition_penalties
)
sampling_tensors
.
repetition_penalties
)
# Use float32 to apply temperature scaling.
# Use float32 to apply temperature scaling.
# Use in-place division to avoid creating a new tensor.
# Use in-place division to avoid creating a new tensor.
...
@@ -349,23 +350,6 @@ class Sampler(nn.Module):
...
@@ -349,23 +350,6 @@ class Sampler(nn.Module):
return
self
.
should_modify_greedy_probs_inplace
return
self
.
should_modify_greedy_probs_inplace
def _get_bin_counts_and_mask(
    tokens: torch.Tensor,
    vocab_size: int,
    num_seqs: int,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Count token occurrences per sequence.

    Returns a `(num_seqs, vocab_size)` tensor of counts together with a
    boolean mask of the entries whose count is positive.
    """
    # Compute the bin counts for the tokens.
    # vocab_size + 1 for padding.
    padded_counts = torch.zeros((num_seqs, vocab_size + 1),
                                dtype=torch.long,
                                device=tokens.device)
    padded_counts.scatter_add_(1, tokens, torch.ones_like(tokens))

    # Drop the extra padding column again.
    counts = padded_counts[:, :vocab_size]
    return counts, counts > 0
def
_apply_min_tokens_penalty
(
def
_apply_min_tokens_penalty
(
logits
:
torch
.
Tensor
,
logits
:
torch
.
Tensor
,
sampling_metadata
:
SamplingMetadata
,
sampling_metadata
:
SamplingMetadata
,
...
@@ -413,29 +397,6 @@ def _apply_min_tokens_penalty(
...
@@ -413,29 +397,6 @@ def _apply_min_tokens_penalty(
return
logits
return
logits
def _apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
                     output_tokens_tensor: torch.Tensor,
                     presence_penalties: torch.Tensor,
                     frequency_penalties: torch.Tensor,
                     repetition_penalties: torch.Tensor) -> torch.Tensor:
    """Apply repetition, frequency and presence penalties to `logits`.

    Args:
        logits: The logits; its shape is read as (num_seqs, vocab_size).
        prompt_tokens_tensor: The prompt token ids, passed to
            `_get_bin_counts_and_mask`.
        output_tokens_tensor: The output token ids, passed to
            `_get_bin_counts_and_mask`.
        presence_penalties: One presence penalty per sequence.
        frequency_penalties: One frequency penalty per sequence.
        repetition_penalties: One repetition penalty per sequence.

    Returns:
        torch.Tensor: The penalized logits.
    """
    num_seqs, vocab_size = logits.shape
    _, prompt_mask = _get_bin_counts_and_mask(prompt_tokens_tensor, vocab_size,
                                              num_seqs)
    output_bin_counts, output_mask = _get_bin_counts_and_mask(
        output_tokens_tensor, vocab_size, num_seqs)

    # Tokens that appear in neither prompt nor output get a neutral
    # penalty of 1.0.
    repetition_penalties = repetition_penalties[:, None].repeat(1, vocab_size)
    repetition_penalties[~(prompt_mask | output_mask)] = 1.0
    logits = torch.where(logits > 0, logits / repetition_penalties,
                         logits * repetition_penalties)

    # We follow the definition in OpenAI API.
    # Refer to https://platform.openai.com/docs/api-reference/parameter-details
    # Out-of-place unsqueeze: the in-place variant would permanently
    # reshape the caller's penalty tensors.
    logits -= frequency_penalties.unsqueeze(dim=1) * output_bin_counts
    logits -= presence_penalties.unsqueeze(dim=1) * output_mask
    return logits
def
_apply_top_k_top_p
(
def
_apply_top_k_top_p
(
logits
:
torch
.
Tensor
,
logits
:
torch
.
Tensor
,
p
:
torch
.
Tensor
,
p
:
torch
.
Tensor
,
...
...
vllm/model_executor/layers/utils.py
0 → 100644
View file @
96ae75ad
"""Utility methods for model layers."""
from
typing
import
Tuple
import
torch
def get_token_bin_counts_and_mask(
    tokens: torch.Tensor,
    vocab_size: int,
    num_seqs: int,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Count token occurrences per sequence.

    Args:
        tokens: Integer token ids indexed along dimension 1; the id
            `vocab_size` lands in an extra column that is discarded.
        vocab_size: The number of count columns to keep.
        num_seqs: The number of rows of the count tensor.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: The `(num_seqs, vocab_size)`
        long tensor of counts and a boolean mask that is True where the
        count is positive.
    """
    # Compute the bin counts for the tokens.
    # vocab_size + 1 for padding.
    padded_counts = torch.zeros((num_seqs, vocab_size + 1),
                                dtype=torch.long,
                                device=tokens.device)
    padded_counts.scatter_add_(1, tokens, torch.ones_like(tokens))

    # Drop the extra padding column again.
    counts = padded_counts[:, :vocab_size]
    return counts, counts > 0
def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor,
                    output_tokens_tensor: torch.Tensor,
                    presence_penalties: torch.Tensor,
                    frequency_penalties: torch.Tensor,
                    repetition_penalties: torch.Tensor) -> torch.Tensor:
    """
    Applies penalties in place to the logits tensor.

    Only `logits` is modified; the penalty tensors are left untouched.

    logits : The input logits tensor of shape [num_seqs, vocab_size]
    prompt_tokens_tensor: A tensor containing the prompt tokens. The prompts
        are padded to the maximum prompt length within the batch using
        `vocab_size` as the padding value. The value `vocab_size` is used
        for padding because it does not correspond to any valid token ID
        in the vocabulary.
    output_tokens_tensor: The output tokens tensor.
    presence_penalties: The presence penalties of shape (num_seqs, )
    frequency_penalties: The frequency penalties of shape (num_seqs, )
    repetition_penalties: The repetition penalties of shape (num_seqs, )

    Returns the same `logits` tensor that was passed in.
    """
    num_seqs, vocab_size = logits.shape
    _, prompt_mask = get_token_bin_counts_and_mask(prompt_tokens_tensor,
                                                   vocab_size, num_seqs)
    output_bin_counts, output_mask = get_token_bin_counts_and_mask(
        output_tokens_tensor, vocab_size, num_seqs)

    # Tokens seen in the prompt or the output get the sequence's
    # repetition penalty; every other token gets a neutral 1.0.
    # `unsqueeze` is out-of-place on purpose: the in-place variant would
    # permanently reshape the caller's tensor, and a second call with the
    # same tensor would then fail.
    penalties = torch.where(prompt_mask | output_mask,
                            repetition_penalties.unsqueeze(dim=1), 1.0)

    # Classify the sign once, on the unmodified logits: positive logits
    # are divided by the penalty, the rest are multiplied by it.
    penalized = torch.where(logits > 0, logits / penalties,
                            logits * penalties)
    logits.copy_(penalized)

    # We follow the definition in OpenAI API.
    # Refer to https://platform.openai.com/docs/api-reference/parameter-details
    logits -= frequency_penalties.unsqueeze(dim=1) * output_bin_counts
    logits -= presence_penalties.unsqueeze(dim=1) * output_mask
    return logits
vllm/model_executor/model_loader/loader.py
View file @
96ae75ad
...
@@ -45,9 +45,11 @@ from vllm.model_executor.model_loader.weight_utils import (
...
@@ -45,9 +45,11 @@ from vllm.model_executor.model_loader.weight_utils import (
filter_duplicate_safetensors_files
,
filter_files_not_needed_for_inference
,
filter_duplicate_safetensors_files
,
filter_files_not_needed_for_inference
,
get_gguf_extra_tensor_names
,
gguf_quant_weights_iterator
,
get_gguf_extra_tensor_names
,
gguf_quant_weights_iterator
,
initialize_dummy_weights
,
np_cache_weights_iterator
,
pt_weights_iterator
,
initialize_dummy_weights
,
np_cache_weights_iterator
,
pt_weights_iterator
,
safetensors_weights_iterator
)
runai_safetensors_weights_iterator
,
safetensors_weights_iterator
)
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.model_executor.utils
import
set_weight_attrs
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.transformers_utils.s3_utils
import
glob
as
s3_glob
from
vllm.transformers_utils.utils
import
is_s3
from
vllm.utils
import
is_pin_memory_available
from
vllm.utils
import
is_pin_memory_available
...
@@ -1234,6 +1236,108 @@ class GGUFModelLoader(BaseModelLoader):
...
@@ -1234,6 +1236,108 @@ class GGUFModelLoader(BaseModelLoader):
return
model
return
model
class RunaiModelStreamerLoader(BaseModelLoader):
    """
    Model loader that streams safetensors files from the local
    filesystem or from an S3 bucket.
    """

    def __init__(self, load_config: LoadConfig):
        """Export the streamer options of `load_config` as env variables."""
        super().__init__(load_config)
        if load_config.model_loader_extra_config:
            extra_config = load_config.model_loader_extra_config

            # Integer options are forwarded through environment variables.
            for option, env_name in (
                ("concurrency", "RUNAI_STREAMER_CONCURRENCY"),
                ("memory_limit", "RUNAI_STREAMER_MEMORY_LIMIT"),
            ):
                if (option in extra_config
                        and isinstance(extra_config.get(option), int)):
                    os.environ[env_name] = str(extra_config.get(option))

            # Fall back to the AWS endpoint when no streamer-specific
            # endpoint is set.
            # NOTE(review): this fallback only runs when extra config is
            # provided - confirm that this nesting is intended.
            runai_streamer_s3_endpoint = os.getenv(
                'RUNAI_STREAMER_S3_ENDPOINT')
            aws_endpoint_url = os.getenv('AWS_ENDPOINT_URL')
            if (runai_streamer_s3_endpoint is None
                    and aws_endpoint_url is not None):
                os.environ["RUNAI_STREAMER_S3_ENDPOINT"] = aws_endpoint_url

    def _prepare_weights(self, model_name_or_path: str,
                         revision: Optional[str]) -> List[str]:
        """Return the safetensors files of the model.

        A model that is neither a local directory nor an S3 path is
        downloaded first.

        Raises:
            RuntimeError: If no safetensors file is found.
        """
        is_s3_path = is_s3(model_name_or_path)
        is_local = os.path.isdir(model_name_or_path)
        needs_download = not is_local and not is_s3_path
        safetensors_pattern = "*.safetensors"
        index_file = SAFE_WEIGHTS_INDEX_NAME

        if needs_download:
            hf_folder = download_weights_from_hf(
                model_name_or_path,
                self.load_config.download_dir,
                [safetensors_pattern],
                revision,
                ignore_patterns=self.load_config.ignore_patterns,
            )
        else:
            hf_folder = model_name_or_path

        if is_s3_path:
            hf_weights_files = s3_glob(path=hf_folder,
                                       allow_pattern=[safetensors_pattern])
        else:
            hf_weights_files = glob.glob(
                os.path.join(hf_folder, safetensors_pattern))

        if needs_download:
            download_safetensors_index_file_from_hf(
                model_name_or_path, index_file, self.load_config.download_dir,
                revision)

        if not hf_weights_files:
            raise RuntimeError(
                f"Cannot find any safetensors model weights with "
                f"`{model_name_or_path}`")

        return hf_weights_files

    def _get_weights_iterator(
            self, model_or_path: str,
            revision: str) -> Generator[Tuple[str, torch.Tensor], None, None]:
        """Return an iterator over the (name, tensor) pairs of the model."""
        weight_files = self._prepare_weights(model_or_path, revision)
        return runai_safetensors_weights_iterator(weight_files)

    def download_model(self, model_config: ModelConfig) -> None:
        """Download the model weights if necessary."""
        self._prepare_weights(model_config.model, model_config.revision)

    def load_model(self, vllm_config: VllmConfig) -> nn.Module:
        """Build the model, stream its weights in and return it in eval mode."""
        device_config = vllm_config.device_config
        model_config = vllm_config.model_config

        target_device = torch.device(device_config.device)
        with set_default_torch_dtype(model_config.dtype):
            with target_device:
                model = _initialize_model(vllm_config=vllm_config)

            # Prefer an explicit `model_weights` location when the config
            # carries one.
            if hasattr(model_config, "model_weights"):
                model_weights = model_config.model_weights
            else:
                model_weights = model_config.model
            model.load_weights(
                self._get_weights_iterator(model_weights,
                                           model_config.revision))

            # Give every quantization method the chance to post-process
            # its module's weights.
            for _, module in model.named_modules():
                quant_method = getattr(module, "quant_method", None)
                if quant_method is None:
                    continue
                with device_loading_context(module, target_device):
                    quant_method.process_weights_after_loading(module)
        return model.eval()
def
get_model_loader
(
load_config
:
LoadConfig
)
->
BaseModelLoader
:
def
get_model_loader
(
load_config
:
LoadConfig
)
->
BaseModelLoader
:
"""Get a model loader based on the load format."""
"""Get a model loader based on the load format."""
...
@@ -1255,4 +1359,7 @@ def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
...
@@ -1255,4 +1359,7 @@ def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
if
load_config
.
load_format
==
LoadFormat
.
GGUF
:
if
load_config
.
load_format
==
LoadFormat
.
GGUF
:
return
GGUFModelLoader
(
load_config
)
return
GGUFModelLoader
(
load_config
)
if
load_config
.
load_format
==
LoadFormat
.
RUNAI_STREAMER
:
return
RunaiModelStreamerLoader
(
load_config
)
return
DefaultModelLoader
(
load_config
)
return
DefaultModelLoader
(
load_config
)
vllm/model_executor/model_loader/tensorizer.py
View file @
96ae75ad
...
@@ -19,9 +19,7 @@ from vllm.engine.llm_engine import LLMEngine
...
@@ -19,9 +19,7 @@ from vllm.engine.llm_engine import LLMEngine
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
VocabParallelEmbedding
)
VocabParallelEmbedding
)
from
vllm.utils
import
FlexibleArgumentParser
from
vllm.utils
import
FlexibleArgumentParser
,
PlaceholderModule
tensorizer_error_msg
=
None
try
:
try
:
from
tensorizer
import
(
DecryptionParams
,
EncryptionParams
,
from
tensorizer
import
(
DecryptionParams
,
EncryptionParams
,
...
@@ -34,8 +32,19 @@ try:
...
@@ -34,8 +32,19 @@ try:
open_stream
,
open_stream
,
mode
=
mode
,
mode
=
mode
,
)
for
mode
in
(
"rb"
,
"wb+"
))
)
for
mode
in
(
"rb"
,
"wb+"
))
except
ImportError
as
e
:
except
ImportError
:
tensorizer_error_msg
=
str
(
e
)
tensorizer
=
PlaceholderModule
(
"tensorizer"
)
DecryptionParams
=
tensorizer
.
placeholder_attr
(
"DecryptionParams"
)
EncryptionParams
=
tensorizer
.
placeholder_attr
(
"EncryptionParams"
)
TensorDeserializer
=
tensorizer
.
placeholder_attr
(
"TensorDeserializer"
)
TensorSerializer
=
tensorizer
.
placeholder_attr
(
"TensorSerializer"
)
open_stream
=
tensorizer
.
placeholder_attr
(
"stream_io.open_stream"
)
convert_bytes
=
tensorizer
.
placeholder_attr
(
"utils.convert_bytes"
)
get_mem_usage
=
tensorizer
.
placeholder_attr
(
"utils.get_mem_usage"
)
no_init_or_tensor
=
tensorizer
.
placeholder_attr
(
"utils.no_init_or_tensor"
)
_read_stream
=
tensorizer
.
placeholder_attr
(
"_read_stream"
)
_write_stream
=
tensorizer
.
placeholder_attr
(
"_write_stream"
)
__all__
=
[
__all__
=
[
'EncryptionParams'
,
'DecryptionParams'
,
'TensorDeserializer'
,
'EncryptionParams'
,
'DecryptionParams'
,
'TensorDeserializer'
,
...
@@ -267,12 +276,6 @@ class TensorizerAgent:
...
@@ -267,12 +276,6 @@ class TensorizerAgent:
"""
"""
def
__init__
(
self
,
tensorizer_config
:
TensorizerConfig
,
vllm_config
):
def
__init__
(
self
,
tensorizer_config
:
TensorizerConfig
,
vllm_config
):
if
tensorizer_error_msg
is
not
None
:
raise
ImportError
(
"Tensorizer is not installed. Please install tensorizer "
"to use this feature with `pip install vllm[tensorizer]`. "
"Error message: {}"
.
format
(
tensorizer_error_msg
))
self
.
tensorizer_config
=
tensorizer_config
self
.
tensorizer_config
=
tensorizer_config
self
.
tensorizer_args
=
(
self
.
tensorizer_args
=
(
self
.
tensorizer_config
.
_construct_tensorizer_args
())
self
.
tensorizer_config
.
_construct_tensorizer_args
())
...
...
vllm/model_executor/model_loader/utils.py
View file @
96ae75ad
...
@@ -8,8 +8,9 @@ from torch import nn
...
@@ -8,8 +8,9 @@ from torch import nn
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
from
vllm.model_executor.models
import
ModelRegistry
from
vllm.model_executor.models
import
ModelRegistry
from
vllm.model_executor.models.adapters
import
(
as_classification_model
,
from
vllm.model_executor.models.adapters
import
as_embedding_model
as_embedding_model
,
as_reward_model
)
@
contextlib
.
contextmanager
@
contextlib
.
contextmanager
...
@@ -69,8 +70,12 @@ def get_model_architecture(
...
@@ -69,8 +70,12 @@ def get_model_architecture(
architectures
=
[
"QuantMixtralForCausalLM"
]
architectures
=
[
"QuantMixtralForCausalLM"
]
model_cls
,
arch
=
ModelRegistry
.
resolve_model_cls
(
architectures
)
model_cls
,
arch
=
ModelRegistry
.
resolve_model_cls
(
architectures
)
if
model_config
.
runner_type
==
"pooling
"
:
if
model_config
.
task
==
"embed
"
:
model_cls
=
as_embedding_model
(
model_cls
)
model_cls
=
as_embedding_model
(
model_cls
)
elif
model_config
.
task
==
"classify"
:
model_cls
=
as_classification_model
(
model_cls
)
elif
model_config
.
task
==
"reward"
:
model_cls
=
as_reward_model
(
model_cls
)
return
model_cls
,
arch
return
model_cls
,
arch
...
...
vllm/model_executor/model_loader/weight_utils.py
View file @
96ae75ad
...
@@ -25,7 +25,15 @@ from vllm.model_executor.layers.quantization import (QuantizationConfig,
...
@@ -25,7 +25,15 @@ from vllm.model_executor.layers.quantization import (QuantizationConfig,
get_quantization_config
)
get_quantization_config
)
from
vllm.model_executor.layers.quantization.schema
import
QuantParamSchema
from
vllm.model_executor.layers.quantization.schema
import
QuantParamSchema
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils
import
print_warning_once
from
vllm.utils
import
PlaceholderModule
,
print_warning_once
try
:
from
runai_model_streamer
import
SafetensorsStreamer
except
ImportError
:
runai_model_streamer
=
PlaceholderModule
(
"runai_model_streamer"
)
# type: ignore[assignment]
SafetensorsStreamer
=
runai_model_streamer
.
placeholder_attr
(
"SafetensorsStreamer"
)
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
...
@@ -410,6 +418,23 @@ def safetensors_weights_iterator(
...
@@ -410,6 +418,23 @@ def safetensors_weights_iterator(
yield
name
,
param
yield
name
,
param
def runai_safetensors_weights_iterator(
    hf_weights_files: List[str]
) -> Generator[Tuple[str, torch.Tensor], None, None]:
    """Stream the weights of safetensors files through ``SafetensorsStreamer``.

    One streamer is opened for the whole call. Each path in
    ``hf_weights_files`` is passed to ``stream_file`` and everything produced
    by ``get_tensors()`` is yielded before the next file is requested.

    Args:
        hf_weights_files: Paths of the safetensors files to read, in order.

    Yields:
        The items produced by ``streamer.get_tensors()`` (annotated as
        ``(name, tensor)`` pairs).
    """
    # The progress bar is shown when torch.distributed is not initialised,
    # and otherwise only on rank 0.
    if torch.distributed.is_initialized():
        show_progress = torch.distributed.get_rank() == 0
    else:
        show_progress = True

    with SafetensorsStreamer() as streamer:
        progress = tqdm(
            hf_weights_files,
            desc="Loading safetensors using Runai Model Streamer",
            disable=not show_progress,
            bar_format=_BAR_FORMAT,
        )
        for weights_file in progress:
            streamer.stream_file(weights_file)
            yield from streamer.get_tensors()
def
pt_weights_iterator
(
def
pt_weights_iterator
(
hf_weights_files
:
List
[
str
]
hf_weights_files
:
List
[
str
]
)
->
Generator
[
Tuple
[
str
,
torch
.
Tensor
],
None
,
None
]:
)
->
Generator
[
Tuple
[
str
,
torch
.
Tensor
],
None
,
None
]:
...
...
vllm/model_executor/models/adapters.py
View file @
96ae75ad
from
collections.abc
import
Iterable
from
collections.abc
import
Iterable
from
typing
import
Any
,
TypeVar
from
typing
import
TYPE_CHECKING
,
Any
,
Optional
,
TypeVar
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
from
.interfaces_base
import
VllmModelForPooling
,
is_pooling_model
from
.interfaces_base
import
VllmModelForPooling
,
is_pooling_model
if
TYPE_CHECKING
:
from
vllm.model_executor.layers.pooler
import
PoolingType
_T
=
TypeVar
(
"_T"
,
bound
=
type
[
nn
.
Module
])
_T
=
TypeVar
(
"_T"
,
bound
=
type
[
nn
.
Module
])
# Class-name suffixes that `_get_pooling_model_name` strips (in this order)
# from the original model name before appending a pooling-specific suffix.
_GENERATE_SUFFIXES = [
    "ForCausalLM",
    "ForConditionalGeneration",
    "ChatModel",
    "LMHeadModel",
]
def
as_embedding_model
(
cls
:
_T
)
->
_T
:
"""Subclass an existing vLLM model to support embeddings."""
# Avoid modifying existing embedding models
if
is_pooling_model
(
cls
):
return
cls
def _get_pooling_model_name(orig_model_name: str, pooling_suffix: str) -> str:
    """Build the class name used for a pooling variant of a model.

    Each entry of ``_GENERATE_SUFFIXES`` is removed from the end of the name
    in list order (an entry that is not a suffix leaves the name unchanged),
    and ``pooling_suffix`` is then appended.

    Args:
        orig_model_name: ``__name__`` of the original model class.
        pooling_suffix: Suffix to append, e.g. ``"ForEmbedding"``.

    Returns:
        The stripped name followed by ``pooling_suffix``.
    """
    base_name = orig_model_name
    for suffix in _GENERATE_SUFFIXES:
        base_name = base_name.removesuffix(suffix)

    pooling_name = base_name + pooling_suffix
    return pooling_name
def
_create_pooling_model_cls
(
orig_cls
:
_T
,
*
,
default_pooling_type
:
"PoolingType"
,
default_normalize
:
bool
,
default_softmax
:
bool
,
)
->
_T
:
# Lazy import
# Lazy import
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.pooler
import
(
Pooler
,
PoolerOutput
,
from
vllm.model_executor.layers.pooler
import
Pooler
,
PoolerOutput
PoolingType
)
from
vllm.model_executor.pooling_metadata
import
PoolingMetadata
from
vllm.model_executor.pooling_metadata
import
PoolingMetadata
from
.utils
import
AutoWeightsLoader
,
WeightsMapper
from
.utils
import
AutoWeightsLoader
,
WeightsMapper
class
ModelFor
Embedding
(
cls
,
VllmModelForPooling
):
class
ModelFor
Pooling
(
orig_
cls
,
VllmModelForPooling
):
def
__init__
(
def
__init__
(
self
,
self
,
...
@@ -34,7 +53,7 @@ def as_embedding_model(cls: _T) -> _T:
...
@@ -34,7 +53,7 @@ def as_embedding_model(cls: _T) -> _T:
)
->
None
:
)
->
None
:
super
().
__init__
(
vllm_config
=
vllm_config
,
prefix
=
prefix
,
**
kwargs
)
super
().
__init__
(
vllm_config
=
vllm_config
,
prefix
=
prefix
,
**
kwargs
)
# These are not used in
embedd
ing models
# These are not used in
pool
ing models
for
attr
in
(
"lm_head"
,
"logits_processor"
):
for
attr
in
(
"lm_head"
,
"logits_processor"
):
if
hasattr
(
self
,
attr
):
if
hasattr
(
self
,
attr
):
delattr
(
self
,
attr
)
delattr
(
self
,
attr
)
...
@@ -46,9 +65,9 @@ def as_embedding_model(cls: _T) -> _T:
...
@@ -46,9 +65,9 @@ def as_embedding_model(cls: _T) -> _T:
if
not
getattr
(
self
,
"_pooler"
,
None
):
if
not
getattr
(
self
,
"_pooler"
,
None
):
self
.
_pooler
=
Pooler
.
from_config_with_defaults
(
self
.
_pooler
=
Pooler
.
from_config_with_defaults
(
pooler_config
,
pooler_config
,
pooling_type
=
P
ooling
T
ype
.
LAST
,
pooling_type
=
default_p
ooling
_t
ype
,
normalize
=
Tru
e
,
normalize
=
default_normaliz
e
,
softmax
=
False
,
softmax
=
default_softmax
,
)
)
def
pooler
(
def
pooler
(
...
@@ -82,17 +101,148 @@ def as_embedding_model(cls: _T) -> _T:
...
@@ -82,17 +101,148 @@ def as_embedding_model(cls: _T) -> _T:
return
return
# For most other models
# For most other models
if
hasattr
(
cls
,
"load_weights"
):
if
hasattr
(
orig_
cls
,
"load_weights"
):
cls
.
load_weights
(
self
,
weights
)
# type: ignore
orig_
cls
.
load_weights
(
self
,
weights
)
# type: ignore
# Fallback
# Fallback
else
:
else
:
loader
=
AutoWeightsLoader
(
self
)
loader
=
AutoWeightsLoader
(
self
)
loader
.
load_weights
(
weights
)
loader
.
load_weights
(
weights
)
ModelForEmbedding
.
__name__
=
cls
.
__name__
\
return
ModelForPooling
# type: ignore
.
removesuffix
(
"ForCausalLM"
)
\
.
removesuffix
(
"ForConditionalGeneration"
)
\
.
removesuffix
(
"ChatModel"
)
\
def
as_embedding_model
(
cls
:
_T
)
->
_T
:
.
removesuffix
(
"LMHeadModel"
)
+
"ForEmbedding"
"""
Subclass an existing vLLM model to support embeddings.
By default, the embeddings of the whole prompt are extracted from the
normalized hidden state corresponding to the last token.
Note:
We assume that no extra layers are added to the original model;
please implement your own model if this is not the case.
"""
# Avoid modifying existing embedding models
if
is_pooling_model
(
cls
):
return
cls
# Lazy import
from
vllm.model_executor.layers.pooler
import
PoolingType
ModelForEmbedding
=
_create_pooling_model_cls
(
cls
,
default_pooling_type
=
PoolingType
.
LAST
,
default_normalize
=
True
,
default_softmax
=
False
,
)
ModelForEmbedding
.
__name__
=
\
_get_pooling_model_name
(
cls
.
__name__
,
"ForEmbedding"
)
return
ModelForEmbedding
# type: ignore
return
ModelForEmbedding
# type: ignore
def as_classification_model(cls: _T) -> _T:
    """
    Derive a classification model class from an existing vLLM model class.

    The returned class is built on ``_create_pooling_model_cls`` with
    last-token pooling, no normalization and softmax enabled as defaults, and
    adds a bias-free ``RowParallelLinear`` head stored as ``score`` that maps
    ``hidden_size`` to ``num_labels``.

    Note:
        The classification head is assumed to be a single linear layer
        stored as the attribute `score` of the top-level model;
        implement your own model if this is not the case.

    Args:
        cls: The model class to adapt.

    Returns:
        ``cls`` itself if it is already a pooling model, otherwise the new
        subclass, renamed via ``_get_pooling_model_name`` with the
        ``"ForClassification"`` suffix.
    """
    # Classes that are already pooling models are returned untouched.
    if is_pooling_model(cls):
        return cls

    # Imported lazily, inside the function rather than at module level.
    from vllm.attention import AttentionMetadata
    from vllm.config import VllmConfig
    from vllm.model_executor.layers.linear import RowParallelLinear
    from vllm.model_executor.layers.pooler import PoolingType
    from vllm.sequence import IntermediateTensors

    from .utils import maybe_prefix

    pooling_base_cls = _create_pooling_model_cls(
        cls,
        default_pooling_type=PoolingType.LAST,
        default_normalize=False,
        default_softmax=True,
    )

    class ModelForClassification(pooling_base_cls):

        def __init__(
            self,
            *,
            vllm_config: "VllmConfig",
            prefix: str = "",
            **kwargs: Any,
        ) -> None:
            super().__init__(vllm_config=vllm_config,
                             prefix=prefix,
                             **kwargs)

            hf_config = vllm_config.model_config.hf_config

            # Classification head: hidden_size -> num_labels, no bias.
            self.score = RowParallelLinear(
                hf_config.hidden_size,
                hf_config.num_labels,
                quant_config=vllm_config.quant_config,
                input_is_parallel=False,
                bias=False,
                prefix=maybe_prefix(prefix, "score"),
            )

        def forward(
            self,
            input_ids: torch.Tensor,
            positions: torch.Tensor,
            kv_caches: list[torch.Tensor],
            attn_metadata: AttentionMetadata,
            intermediate_tensors: Optional[IntermediateTensors] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
        ) -> torch.Tensor:
            # Run the wrapped model, then project its output with `score`.
            backbone_output = super().forward(
                input_ids,
                positions,
                kv_caches,
                attn_metadata,
                intermediate_tensors,
                inputs_embeds,
            )
            # `score` returns a pair; only the first element is used.
            logits, _ = self.score(backbone_output)
            return logits

    ModelForClassification.__name__ = _get_pooling_model_name(
        cls.__name__, "ForClassification")

    return ModelForClassification  # type: ignore
def as_reward_model(cls: _T) -> _T:
    """
    Derive a reward model class from an existing vLLM model class.

    The returned class is built on ``_create_pooling_model_cls`` with
    ``PoolingType.ALL`` and with normalization and softmax both disabled as
    defaults, i.e. the hidden states of each token are returned directly.

    Note:
        No extra layers are assumed to be added to the original model;
        implement your own model if this is not the case.

    Args:
        cls: The model class to adapt.

    Returns:
        ``cls`` itself if it is already a pooling model, otherwise the new
        subclass, renamed via ``_get_pooling_model_name`` with the
        ``"ForReward"`` suffix.
    """
    # Classes that are already pooling models are returned untouched.
    if is_pooling_model(cls):
        return cls

    # Imported lazily, inside the function rather than at module level.
    from vllm.model_executor.layers.pooler import PoolingType

    reward_cls = _create_pooling_model_cls(
        cls,
        default_pooling_type=PoolingType.ALL,
        default_normalize=False,
        default_softmax=False,
    )
    reward_cls.__name__ = _get_pooling_model_name(cls.__name__, "ForReward")

    return reward_cls  # type: ignore
vllm/model_executor/models/aria.py
View file @
96ae75ad
...
@@ -521,6 +521,15 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
...
@@ -521,6 +521,15 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
This model combines a vision tower, a multi-modal projector, and a language
This model combines a vision tower, a multi-modal projector, and a language
model to perform tasks that involve both image and text inputs.
model to perform tasks that involve both image and text inputs.
"""
"""
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"language_model.model"
:
"language_model"
,
"language_model.lm_head"
:
"lm_head"
,
},
orig_to_new_suffix
=
{
"router.weight"
:
"router_weight"
,
},
)
def
__init__
(
def
__init__
(
self
,
self
,
...
@@ -662,15 +671,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
...
@@ -662,15 +671,6 @@ class AriaForConditionalGeneration(nn.Module, SupportsMultiModal):
return
next_tokens
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]]):
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]]):
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"language_model.model"
:
"language_model"
,
"language_model.lm_head"
:
"lm_head"
,
},
orig_to_new_suffix
=
{
"router.weight"
:
"router_weight"
,
},
)
loader
=
AutoWeightsLoader
(
self
)
loader
=
AutoWeightsLoader
(
self
)
loader
.
load_weights
(
weights
,
mapper
=
hf_to_vllm_mapper
)
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
vllm/model_executor/models/bert.py
View file @
96ae75ad
...
@@ -409,6 +409,7 @@ class BertEmbeddingModel(nn.Module):
...
@@ -409,6 +409,7 @@ class BertEmbeddingModel(nn.Module):
model: An instance of BertModel used for forward operations.
model: An instance of BertModel used for forward operations.
_pooler: An instance of Pooler used for pooling operations.
_pooler: An instance of Pooler used for pooling operations.
"""
"""
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"model."
:
""
})
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
super
().
__init__
()
...
@@ -441,8 +442,7 @@ class BertEmbeddingModel(nn.Module):
...
@@ -441,8 +442,7 @@ class BertEmbeddingModel(nn.Module):
return
self
.
_pooler
(
hidden_states
,
pooling_metadata
)
return
self
.
_pooler
(
hidden_states
,
pooling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]]):
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]]):
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"model."
:
""
})
weights
=
self
.
hf_to_vllm_mapper
.
apply
(
weights
)
weights
=
hf_to_vllm_mapper
.
apply
(
weights
)
weights
=
((
name
,
data
)
for
name
,
data
in
weights
weights
=
((
name
,
data
)
for
name
,
data
in
weights
if
not
name
.
startswith
(
"lm_head."
))
if
not
name
.
startswith
(
"lm_head."
))
self
.
model
.
load_weights
(
weights
)
self
.
model
.
load_weights
(
weights
)
...
...
vllm/model_executor/models/deepseek_v3.py
0 → 100644
View file @
96ae75ad
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2023 DeepSeek-AI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only DeepseekV3 model."""
from
typing
import
Any
,
Dict
,
Iterable
,
List
,
Optional
,
Set
,
Tuple
,
Union
import
torch
from
torch
import
nn
from
transformers
import
PretrainedConfig
from
vllm.attention
import
Attention
,
AttentionMetadata
from
vllm.config
import
CacheConfig
,
VllmConfig
from
vllm.distributed
import
(
get_pp_group
,
get_tensor_model_parallel_world_size
,
tensor_model_parallel_all_reduce
)
from
vllm.model_executor.layers.activation
import
SiluAndMul
from
vllm.model_executor.layers.fused_moe
import
FusedMoE
from
vllm.model_executor.layers.layernorm
import
RMSNorm
from
vllm.model_executor.layers.linear
import
(
ColumnParallelLinear
,
MergedColumnParallelLinear
,
ReplicatedLinear
,
RowParallelLinear
)
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
ParallelLMHead
,
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
.interfaces
import
SupportsPP
from
.utils
import
(
PPMissingLayer
,
is_pp_missing_parameter
,
make_empty_intermediate_tensors_factory
,
make_layers
,
maybe_prefix
)
class DeepseekV3MLP(nn.Module):
    """Gated feed-forward block: merged gate/up projection, SiLU-and-mul
    activation, then a down projection.

    Args:
        hidden_size: Input and output feature size.
        intermediate_size: Size of each of the two merged gate/up outputs.
        hidden_act: Activation name; only ``"silu"`` is accepted.
        quant_config: Quantization config forwarded to both linear layers.
        reduce_results: Forwarded to the ``RowParallelLinear`` down
            projection.
        prefix: Module-name prefix used to name the sub-layers.

    Raises:
        ValueError: If ``hidden_act`` is not ``"silu"``.
    """

    def __init__(
        self,
        hidden_size: int,
        intermediate_size: int,
        hidden_act: str,
        quant_config: Optional[QuantizationConfig] = None,
        reduce_results: bool = True,
        prefix: str = "",
    ) -> None:
        super().__init__()

        # Gate and up projections share one merged column-parallel layer.
        self.gate_up_proj = MergedColumnParallelLinear(
            hidden_size,
            [intermediate_size, intermediate_size],
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.gate_up_proj",
        )
        self.down_proj = RowParallelLinear(
            intermediate_size,
            hidden_size,
            bias=False,
            quant_config=quant_config,
            reduce_results=reduce_results,
            prefix=f"{prefix}.down_proj",
        )

        if hidden_act != "silu":
            raise ValueError(f"Unsupported activation: {hidden_act}. "
                             "Only silu is supported for now.")
        self.act_fn = SiluAndMul()

    def forward(self, x):
        """Apply gate/up projection, activation and down projection to ``x``."""
        # Both linear layers return a pair; only the first element is used.
        merged, _ = self.gate_up_proj(x)
        activated = self.act_fn(merged)
        projected, _ = self.down_proj(activated)
        return projected
class DeepseekV3MoE(nn.Module):
    """Mixture-of-experts block: a replicated router (``gate``), fused routed
    experts and, when configured, an additional shared-experts MLP.

    Args:
        config: HF model config. The fields read are ``routed_scaling_factor``,
            ``n_shared_experts``, ``n_routed_experts``, ``hidden_act``,
            ``hidden_size``, ``topk_method``, ``num_experts_per_tok``,
            ``moe_intermediate_size``, ``norm_topk_prob``, ``n_group``,
            ``topk_group`` and ``scoring_func``.
        quant_config: Quantization config for the experts and shared experts
            (the router is always built with ``quant_config=None``).
        prefix: Module-name prefix used to name the sub-layers.

    Raises:
        ValueError: If the tensor-parallel size exceeds the number of routed
            experts, or if ``config.hidden_act`` is not ``"silu"``.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__()
        self.tp_size = get_tensor_model_parallel_world_size()
        self.routed_scaling_factor = config.routed_scaling_factor
        self.n_shared_experts = config.n_shared_experts
        if self.tp_size > config.n_routed_experts:
            raise ValueError(
                f"Tensor parallel size {self.tp_size} is greater than "
                f"the number of experts {config.n_routed_experts}.")

        if config.hidden_act != "silu":
            raise ValueError(f"Unsupported activation: {config.hidden_act}. "
                             "Only silu is supported for now.")

        # Router: never quantized.
        self.gate = ReplicatedLinear(config.hidden_size,
                                     config.n_routed_experts,
                                     bias=False,
                                     quant_config=None,
                                     prefix=f"{prefix}.gate")
        # The correction bias is stored on the gate module and shared with
        # FusedMoE below; it only exists for the "noaux_tc" top-k method.
        if config.topk_method == "noaux_tc":
            self.gate.e_score_correction_bias = nn.Parameter(
                torch.empty(config.n_routed_experts))
        else:
            self.gate.e_score_correction_bias = None

        self.experts = FusedMoE(
            num_experts=config.n_routed_experts,
            top_k=config.num_experts_per_tok,
            hidden_size=config.hidden_size,
            intermediate_size=config.moe_intermediate_size,
            # Reduction is done once in forward(), after shared experts
            # have been added.
            reduce_results=False,
            renormalize=config.norm_topk_prob,
            quant_config=quant_config,
            use_grouped_topk=True,
            num_expert_group=config.n_group,
            topk_group=config.topk_group,
            prefix=f"{prefix}.experts",
            scoring_func=config.scoring_func,
            e_score_correction_bias=self.gate.e_score_correction_bias)

        if config.n_shared_experts is not None:
            intermediate_size = (config.moe_intermediate_size *
                                 config.n_shared_experts)
            self.shared_experts = DeepseekV3MLP(
                hidden_size=config.hidden_size,
                intermediate_size=intermediate_size,
                hidden_act=config.hidden_act,
                quant_config=quant_config,
                reduce_results=False,
                # Named like every other sub-layer so that its linear layers
                # carry their full module path instead of an empty prefix.
                prefix=f"{prefix}.shared_experts",
            )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Route ``hidden_states`` through the experts.

        The input must be 2-D (it is unpacked as ``(num_tokens,
        hidden_dim)``); the result has the same shape.
        """
        num_tokens, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)

        # Must be bound even when there are no shared experts; otherwise the
        # check further down raises UnboundLocalError.
        shared_output = None
        if self.n_shared_experts is not None:
            shared_output = self.shared_experts(hidden_states)

        # router_logits: (num_tokens, n_experts)
        router_logits, _ = self.gate(hidden_states)
        final_hidden_states = self.experts(
            hidden_states=hidden_states,
            router_logits=router_logits) * self.routed_scaling_factor
        if shared_output is not None:
            final_hidden_states = final_hidden_states + shared_output
        # Both branches were built with reduce_results=False, so reduce the
        # summed output once here.
        if self.tp_size > 1:
            final_hidden_states = tensor_model_parallel_all_reduce(
                final_hidden_states)

        return final_hidden_states.view(num_tokens, hidden_dim)
def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
    """Return the YaRN magnitude-scaling factor.

    Args:
        scale: Scaling factor; values ``<= 1`` disable the correction.
        mscale: Multiplier applied to the logarithmic term.

    Returns:
        ``1.0`` when ``scale <= 1``, otherwise
        ``0.1 * mscale * ln(scale) + 1.0``.
    """
    import math

    return 1.0 if scale <= 1 else 0.1 * mscale * math.log(scale) + 1.0
class DeepseekV3Attention(nn.Module):
    """Attention layer with low-rank query / key-value projections.

    Queries are produced either by ``q_a_proj -> q_a_layernorm -> q_b_proj``
    (when ``q_lora_rank`` is not None) or by a single ``q_proj``. Keys and
    values come from ``kv_a_proj_with_mqa -> kv_a_layernorm -> kv_b_proj``;
    the last ``qk_rope_head_dim`` channels of the ``kv_a_proj_with_mqa``
    output are used as the rotary part of the key for every head.

    The ``Attention`` backend is constructed with a fixed head size
    (``_PADDED_HEAD_SIZE``), so q/k/v are zero-padded up to that size before
    the call and the output is sliced back to ``v_head_dim`` afterwards.

    Args:
        config: HF model config; only ``rms_norm_eps`` is read here.
        hidden_size: Model hidden size.
        num_heads: Total number of attention heads (must be divisible by the
            tensor-parallel world size).
        qk_nope_head_dim: Per-head size of the non-rotary q/k part.
        qk_rope_head_dim: Per-head size of the rotary q/k part.
        v_head_dim: Per-head value size.
        q_lora_rank: Rank of the query low-rank projection, or None to use a
            single full projection.
        kv_lora_rank: Rank of the key/value low-rank projection.
        rope_theta: Base passed to ``get_rope``.
        rope_scaling: Optional rope-scaling dict. When given, its
            ``"rope_type"`` entry is overwritten in place with
            ``"deepseek_yarn"`` and ``"factor"`` / ``"mscale_all_dim"`` are
            used to rescale the softmax scaling.
        max_position_embeddings: Maximum position passed to ``get_rope``.
        cache_config: Forwarded to ``Attention``.
        quant_config: Forwarded to the linear layers and ``Attention``.
        prefix: Module-name prefix used to name the sub-layers.

    Raises:
        ValueError: If the q/k or v head size exceeds ``_PADDED_HEAD_SIZE``.
    """

    # Head size the attention backend is built with.
    # TODO, support head_size 192
    _PADDED_HEAD_SIZE = 256

    def __init__(
        self,
        config: PretrainedConfig,
        hidden_size: int,
        num_heads: int,
        qk_nope_head_dim: int,
        qk_rope_head_dim: int,
        v_head_dim: int,
        q_lora_rank: Optional[int],
        kv_lora_rank: int,
        rope_theta: float = 10000,
        rope_scaling: Optional[Dict[str, Any]] = None,
        max_position_embeddings: int = 8192,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.qk_nope_head_dim = qk_nope_head_dim
        self.qk_rope_head_dim = qk_rope_head_dim
        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.q_lora_rank = q_lora_rank
        self.kv_lora_rank = kv_lora_rank
        self.num_heads = num_heads
        tp_size = get_tensor_model_parallel_world_size()
        assert num_heads % tp_size == 0
        self.num_local_heads = num_heads // tp_size
        self.scaling = self.qk_head_dim**-0.5
        self.rope_theta = rope_theta
        self.max_position_embeddings = max_position_embeddings

        # A negative pad width would silently crop the heads in forward(),
        # so reject head sizes that do not fit the padded size up front.
        padded = self._PADDED_HEAD_SIZE
        if self.qk_head_dim > padded or self.v_head_dim > padded:
            raise ValueError(
                f"Head sizes (qk={self.qk_head_dim}, v={self.v_head_dim}) "
                f"must not exceed the padded head size {padded}.")

        if self.q_lora_rank is not None:
            self.q_a_proj = ReplicatedLinear(self.hidden_size,
                                             self.q_lora_rank,
                                             bias=False,
                                             quant_config=quant_config,
                                             prefix=f"{prefix}.q_a_proj")
            self.q_a_layernorm = RMSNorm(self.q_lora_rank,
                                         eps=config.rms_norm_eps)
            self.q_b_proj = ColumnParallelLinear(q_lora_rank,
                                                 self.num_heads *
                                                 self.qk_head_dim,
                                                 bias=False,
                                                 quant_config=quant_config,
                                                 prefix=f"{prefix}.q_b_proj")
        else:
            self.q_proj = ColumnParallelLinear(self.hidden_size,
                                               self.num_heads *
                                               self.qk_head_dim,
                                               bias=False,
                                               quant_config=quant_config,
                                               prefix=f"{prefix}.q_proj")

        self.kv_a_proj_with_mqa = ReplicatedLinear(
            self.hidden_size,
            self.kv_lora_rank + self.qk_rope_head_dim,
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.kv_a_proj_with_mqa")
        self.kv_a_layernorm = RMSNorm(self.kv_lora_rank,
                                      eps=config.rms_norm_eps)
        self.kv_b_proj = ColumnParallelLinear(
            self.kv_lora_rank,
            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
            bias=False,
            quant_config=quant_config,
            prefix=f"{prefix}.kv_b_proj")
        # O projection.
        self.o_proj = RowParallelLinear(self.num_heads * self.v_head_dim,
                                        self.hidden_size,
                                        bias=False,
                                        quant_config=quant_config,
                                        prefix=f"{prefix}.o_proj")

        # `rope_scaling` is optional: only tag it when it was provided
        # (indexing None would raise TypeError). The dict is updated in
        # place, as before.
        if rope_scaling is not None:
            rope_scaling["rope_type"] = 'deepseek_yarn'
        self.rotary_emb = get_rope(qk_rope_head_dim,
                                   rotary_dim=qk_rope_head_dim,
                                   max_position=max_position_embeddings,
                                   base=rope_theta,
                                   rope_scaling=rope_scaling,
                                   is_neox_style=False)

        if rope_scaling:
            # A missing "mscale_all_dim" becomes float(False) == 0.0.
            mscale_all_dim = rope_scaling.get("mscale_all_dim", False)
            scaling_factor = rope_scaling["factor"]
            mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
            self.scaling = self.scaling * mscale * mscale

        self.attn = Attention(self.num_local_heads,
                              padded,
                              self.scaling,
                              num_kv_heads=self.num_local_heads,
                              cache_config=cache_config,
                              quant_config=quant_config,
                              prefix=f"{prefix}.attn")

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: AttentionMetadata,
    ) -> torch.Tensor:
        """Compute attention for ``hidden_states`` and apply ``o_proj``.

        Args:
            positions: Positions passed to the rotary embedding.
            hidden_states: Input activations.
            kv_cache: KV cache passed through to ``Attention``.
            attn_metadata: Attention metadata passed through to ``Attention``.

        Returns:
            The first element of the ``o_proj`` output.
        """
        padded = self._PADDED_HEAD_SIZE

        # Queries, viewed as (-1, num_local_heads, qk_head_dim).
        if self.q_lora_rank is not None:
            q = self.q_a_proj(hidden_states)[0]
            q = self.q_a_layernorm(q)
            q = self.q_b_proj(q)[0].view(-1, self.num_local_heads,
                                         self.qk_head_dim)
        else:
            q = self.q_proj(hidden_states)[0].view(-1, self.num_local_heads,
                                                   self.qk_head_dim)
        # Only the rotary slice is needed separately; the non-rotary part
        # stays in place inside `q`.
        _, q_pe = q.split([self.qk_nope_head_dim, self.qk_rope_head_dim],
                          dim=-1)

        # Low-rank KV path: [kv_lora_rank | qk_rope_head_dim] on last dim.
        latent_cache = self.kv_a_proj_with_mqa(hidden_states)[0]
        kv_a, _ = latent_cache.split(
            [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
        latent_cache = latent_cache.unsqueeze(1)
        kv_a = self.kv_a_layernorm(kv_a.contiguous())
        kv = self.kv_b_proj(kv_a)[0]
        kv = kv.view(-1, self.num_local_heads,
                     self.qk_nope_head_dim + self.v_head_dim)
        k_nope, v = kv.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1)

        # The rotary key part has a singleton head dim (from unsqueeze(1))
        # and is broadcast to every head when written into `k` below.
        k_pe = latent_cache[:, :, self.kv_lora_rank:]
        q_pe, k_pe = self.rotary_emb(positions, q_pe, k_pe)
        q[..., self.qk_nope_head_dim:] = q_pe
        k = torch.empty_like(q)
        k[..., :self.qk_nope_head_dim] = k_nope
        k[..., self.qk_nope_head_dim:] = k_pe

        # Zero-pad every head up to the backend head size and flatten heads.
        q = torch.nn.functional.pad(q, [0, padded - self.qk_head_dim],
                                    value=0).view(
                                        -1, self.num_local_heads * padded)
        k = torch.nn.functional.pad(k, [0, padded - self.qk_head_dim],
                                    value=0).view(
                                        -1, self.num_local_heads * padded)
        v = torch.nn.functional.pad(v, [0, padded - self.v_head_dim],
                                    value=0).view(
                                        -1, self.num_local_heads * padded)
        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
        # Drop the padding channels again before the output projection.
        attn_output = attn_output.view(
            -1, self.num_local_heads, padded)[..., :self.v_head_dim].reshape(
                -1, self.num_local_heads * self.v_head_dim)
        output, _ = self.o_proj(attn_output)
        return output
class DeepseekV3DecoderLayer(nn.Module):
    """One decoder layer: self-attention followed by an MLP or MoE block,
    each preceded by an RMSNorm.

    Args:
        config: HF model config.
        prefix: Module-name prefix; its last dot-separated component must be
            the integer layer index.
        cache_config: Forwarded to the attention layer.
        quant_config: Forwarded to the attention and MLP/MoE blocks.
    """

    def __init__(
        self,
        config: PretrainedConfig,
        prefix: str,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ) -> None:
        super().__init__()
        self.hidden_size = config.hidden_size

        # DecoderLayers are created with `make_layers` which passes the prefix
        # with the layer's index.
        layer_idx = int(prefix.rsplit(".", 1)[-1])

        self.self_attn = DeepseekV3Attention(
            config=config,
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
            qk_nope_head_dim=config.qk_nope_head_dim,
            qk_rope_head_dim=config.qk_rope_head_dim,
            v_head_dim=config.v_head_dim,
            q_lora_rank=getattr(config, "q_lora_rank", None),
            kv_lora_rank=config.kv_lora_rank,
            rope_theta=getattr(config, "rope_theta", 10000),
            rope_scaling=getattr(config, "rope_scaling", None),
            max_position_embeddings=getattr(config,
                                            "max_position_embeddings", 8192),
            cache_config=cache_config,
            quant_config=quant_config,
            prefix=f"{prefix}.self_attn",
        )

        # MoE is used from layer `first_k_dense_replace` onwards, on every
        # `moe_layer_freq`-th layer; all other layers get a dense MLP.
        use_moe = (config.n_routed_experts is not None
                   and layer_idx >= config.first_k_dense_replace
                   and layer_idx % config.moe_layer_freq == 0)
        mlp_prefix = f"{prefix}.mlp"
        if use_moe:
            self.mlp = DeepseekV3MoE(
                config=config,
                quant_config=quant_config,
                prefix=mlp_prefix,
            )
        else:
            self.mlp = DeepseekV3MLP(
                hidden_size=config.hidden_size,
                intermediate_size=config.intermediate_size,
                hidden_act=config.hidden_act,
                quant_config=quant_config,
                prefix=mlp_prefix,
            )

        self.input_layernorm = RMSNorm(config.hidden_size,
                                       eps=config.rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(config.hidden_size,
                                                eps=config.rms_norm_eps)

    def forward(
        self,
        positions: torch.Tensor,
        hidden_states: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: AttentionMetadata,
        residual: Optional[torch.Tensor],
    ) -> torch.Tensor:
        """Run the layer and return ``(hidden_states, residual)``.

        When ``residual`` is None, the incoming ``hidden_states`` become the
        residual; otherwise the residual is passed to the input layernorm
        together with ``hidden_states``.
        """
        # Self Attention
        if residual is not None:
            hidden_states, residual = self.input_layernorm(
                hidden_states, residual)
        else:
            residual = hidden_states
            hidden_states = self.input_layernorm(hidden_states)
        attn_out = self.self_attn(
            positions=positions,
            hidden_states=hidden_states,
            kv_cache=kv_cache,
            attn_metadata=attn_metadata,
        )

        # Fully Connected
        normed, residual = self.post_attention_layernorm(attn_out, residual)
        mlp_out = self.mlp(normed)
        return mlp_out, residual
# TODO(simon): check whether we support torch compile for Deepseek V3
# @support_torch_compile
class DeepseekV3Model(nn.Module):
    """Embedding, decoder-layer stack and final norm of DeepseekV3.

    The token embedding is only built on the first pipeline rank and the
    final norm only on the last; other ranks get ``PPMissingLayer``
    placeholders.
    """

    fall_back_to_pt_during_load = False

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()

        hf_config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config

        self.padding_idx = hf_config.pad_token_id
        self.vocab_size = hf_config.vocab_size

        if get_pp_group().is_first_rank:
            self.embed_tokens = VocabParallelEmbedding(
                hf_config.vocab_size,
                hf_config.hidden_size,
            )
        else:
            self.embed_tokens = PPMissingLayer()

        # The parameter is deliberately named `prefix`, matching the lambda
        # this replaces, so it can still be called positionally or by keyword.
        def _build_layer(prefix):
            return DeepseekV3DecoderLayer(
                hf_config,
                prefix,
                cache_config=cache_config,
                quant_config=quant_config,
            )

        self.start_layer, self.end_layer, self.layers = make_layers(
            hf_config.num_hidden_layers,
            _build_layer,
            prefix=f"{prefix}.layers",
        )

        if get_pp_group().is_last_rank:
            self.norm = RMSNorm(hf_config.hidden_size,
                                eps=hf_config.rms_norm_eps)
        else:
            self.norm = PPMissingLayer()

        self.make_empty_intermediate_tensors = (
            make_empty_intermediate_tensors_factory(
                ["hidden_states", "residual"], hf_config.hidden_size))

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Return the token embeddings for ``input_ids``."""
        return self.embed_tokens(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors],
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run this rank's decoder layers.

        On the first pipeline rank the input is ``inputs_embeds`` if given,
        else the embedding of ``input_ids``; on other ranks it is read from
        ``intermediate_tensors``. Non-last ranks return ``IntermediateTensors``
        with ``hidden_states`` and ``residual``; the last rank returns the
        normed hidden states.
        """
        if not get_pp_group().is_first_rank:
            assert intermediate_tensors is not None
            hidden_states = intermediate_tensors["hidden_states"]
            residual = intermediate_tensors["residual"]
        else:
            if inputs_embeds is None:
                hidden_states = self.get_input_embeddings(input_ids)
            else:
                hidden_states = inputs_embeds
            residual = None

        # `kv_caches` is indexed relative to this rank's first layer.
        layer_ids = range(self.start_layer, self.end_layer)
        for cache_idx, layer_id in enumerate(layer_ids):
            hidden_states, residual = self.layers[layer_id](
                positions,
                hidden_states,
                kv_caches[cache_idx],
                attn_metadata,
                residual,
            )

        if get_pp_group().is_last_rank:
            hidden_states, _ = self.norm(hidden_states, residual)
            return hidden_states

        return IntermediateTensors({
            "hidden_states": hidden_states,
            "residual": residual
        })
class DeepseekV3ForCausalLM(nn.Module, SupportsPP):
    """DeepseekV3 model with an LM head, logits processor and sampler.

    Wraps ``DeepseekV3Model`` and implements logits computation, sampling
    and checkpoint weight loading.
    """

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__()
        config = vllm_config.model_config.hf_config
        quant_config = vllm_config.quant_config
        self.config = config
        self.quant_config = quant_config
        self.model = DeepseekV3Model(vllm_config=vllm_config,
                                     prefix=maybe_prefix(prefix, "model"))
        self.lm_head = ParallelLMHead(config.vocab_size,
                                      config.hidden_size,
                                      quant_config=quant_config)
        self.logits_processor = LogitsProcessor(config.vocab_size)
        self.sampler = get_sampler()
        # NOTE: this instance attribute shadows the method of the same name
        # defined further down in this class.
        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors)

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Return the token embeddings for ``input_ids``."""
        return self.model.get_input_embeddings(input_ids)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        """Run the wrapped ``DeepseekV3Model`` and return its output."""
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   attn_metadata, intermediate_tensors,
                                   inputs_embeds)
        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        """Compute logits from ``hidden_states`` using the LM head."""
        logits = self.logits_processor(self.lm_head, hidden_states,
                                       sampling_metadata)
        return logits

    def sample(
        self,
        logits: Optional[torch.Tensor],
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        """Sample next tokens from ``logits``."""
        next_tokens = self.sampler(logits, sampling_metadata)
        return next_tokens

    def make_empty_intermediate_tensors(
            self, batch_size: int, dtype: torch.dtype,
            device: torch.device) -> IntermediateTensors:
        """Return zero-filled ``hidden_states`` / ``residual`` tensors of
        shape ``(batch_size, hidden_size)``.

        NOTE: on instances, ``__init__`` assigns an attribute with this same
        name, which takes precedence over this method.
        """
        return IntermediateTensors({
            "hidden_states":
            torch.zeros((batch_size, self.config.hidden_size),
                        dtype=dtype,
                        device=device),
            "residual":
            torch.zeros((batch_size, self.config.hidden_size),
                        dtype=dtype,
                        device=device),
        })

    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        """Load checkpoint weights into this model.

        Each checkpoint entry is handled by the first matching path:

        1. stacked parameters (``gate_proj`` / ``up_proj`` loaded into
           ``gate_up_proj``),
        2. routed-expert parameters (mapping from
           ``FusedMoE.make_expert_params_mapping``),
        3. everything else, loaded by name with the parameter's
           ``weight_loader`` or ``default_weight_loader``.

        Rotary ``inv_freq`` buffers, the next-n prediction layer, extra
        ``.bias`` entries and parameters missing on this pipeline rank are
        skipped.

        Args:
            weights: Iterable of ``(name, tensor)`` checkpoint entries.

        Returns:
            The (mapped) names of the parameters that were loaded.

        Raises:
            KeyError: If a checkpoint entry does not correspond to any
                parameter of this model.
        """
        stacked_params_mapping = [
            # (param_name, shard_name, shard_id)
            ("gate_up_proj", "gate_proj", 0),
            ("gate_up_proj", "up_proj", 1),
        ]

        # Params for weights, fp8 weight scales, fp8 activation scales
        # (param_name, weight_name, expert_id, shard_id)
        expert_params_mapping = FusedMoE.make_expert_params_mapping(
            ckpt_gate_proj_name="gate_proj",
            ckpt_down_proj_name="down_proj",
            ckpt_up_proj_name="up_proj",
            num_experts=self.config.n_routed_experts)

        params_dict = dict(self.named_parameters())
        loaded_params: Set[str] = set()
        for name, loaded_weight in weights:
            if "rotary_emb.inv_freq" in name:
                continue

            # TODO(simon): support nextn predict layers
            if self.config.num_nextn_predict_layers > 0:
                assert self.config.num_nextn_predict_layers == 1
                # The extra prediction layer is stored right after the
                # regular decoder layers in the checkpoint; skip it.
                layer_idx = self.config.num_hidden_layers
                if name.startswith(f"model.layers.{layer_idx}"):
                    continue

            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                # Skip non-stacked layers and experts (experts handled below).
                if weight_name not in name:
                    continue
                # We have mlp.experts[0].gate_proj in the checkpoint.
                # Since we handle the experts below in expert_params_mapping,
                # we need to skip here BEFORE we update the name, otherwise
                # name will be updated to mlp.experts[0].gate_up_proj, which
                # will then be updated below in expert_params_mapping
                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
                if (("mlp.experts." in name) and name not in params_dict):
                    continue
                name = name.replace(weight_name, param_name)
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue

                if is_pp_missing_parameter(name, self):
                    continue

                param = params_dict[name]
                weight_loader = param.weight_loader
                weight_loader(param, loaded_weight, shard_id)
                break
            else:
                for mapping in expert_params_mapping:
                    param_name, weight_name, expert_id, shard_id = mapping
                    if weight_name not in name:
                        continue
                    name = name.replace(weight_name, param_name)

                    if is_pp_missing_parameter(name, self):
                        continue

                    param = params_dict[name]
                    weight_loader = param.weight_loader
                    weight_loader(param,
                                  loaded_weight,
                                  name,
                                  shard_id=shard_id,
                                  expert_id=expert_id)
                    break
                else:
                    # Skip loading extra bias for GPTQ models.
                    if name.endswith(".bias") and name not in params_dict:
                        continue

                    if is_pp_missing_parameter(name, self):
                        continue

                    # Fail with a descriptive error instead of dumping every
                    # parameter name to stdout before the lookup fails.
                    if name not in params_dict:
                        raise KeyError(
                            f"Checkpoint weight {name!r} does not match any "
                            "parameter of DeepseekV3ForCausalLM.")

                    param = params_dict[name]
                    weight_loader = getattr(param, "weight_loader",
                                            default_weight_loader)
                    weight_loader(param, loaded_weight)
            loaded_params.add(name)
        return loaded_params
vllm/model_executor/models/gemma2.py
View file @
96ae75ad
...
@@ -31,11 +31,14 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
...
@@ -31,11 +31,14 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
RowParallelLinear
)
RowParallelLinear
)
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization.compressed_tensors.utils
import
(
get_compressed_tensors_cache_scale
)
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.rotary_embedding
import
get_rope
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
VocabParallelEmbedding
)
VocabParallelEmbedding
)
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
(
default_weight_loader
,
maybe_remap_kv_scale_name
)
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
...
@@ -326,6 +329,15 @@ class Gemma2Model(nn.Module):
...
@@ -326,6 +329,15 @@ class Gemma2Model(nn.Module):
params_dict
=
dict
(
self
.
named_parameters
())
params_dict
=
dict
(
self
.
named_parameters
())
loaded_params
:
Set
[
str
]
=
set
()
loaded_params
:
Set
[
str
]
=
set
()
for
name
,
loaded_weight
in
weights
:
for
name
,
loaded_weight
in
weights
:
if
scale_name
:
=
get_compressed_tensors_cache_scale
(
name
):
# Loading kv cache scales for compressed-tensors quantization
param
=
params_dict
[
scale_name
]
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
loaded_weight
=
loaded_weight
[
0
]
weight_loader
(
param
,
loaded_weight
)
loaded_params
.
add
(
scale_name
)
continue
for
(
param_name
,
shard_name
,
shard_id
)
in
stacked_params_mapping
:
for
(
param_name
,
shard_name
,
shard_id
)
in
stacked_params_mapping
:
if
shard_name
not
in
name
:
if
shard_name
not
in
name
:
continue
continue
...
@@ -343,6 +355,10 @@ class Gemma2Model(nn.Module):
...
@@ -343,6 +355,10 @@ class Gemma2Model(nn.Module):
# Skip loading extra bias for GPTQ models.
# Skip loading extra bias for GPTQ models.
if
name
.
endswith
(
".bias"
)
and
name
not
in
params_dict
:
if
name
.
endswith
(
".bias"
)
and
name
not
in
params_dict
:
continue
continue
# Remapping the name of FP8 kv-scale.
name
=
maybe_remap_kv_scale_name
(
name
,
params_dict
)
if
name
is
None
:
continue
if
is_pp_missing_parameter
(
name
,
self
):
if
is_pp_missing_parameter
(
name
,
self
):
continue
continue
param
=
params_dict
[
name
]
param
=
params_dict
[
name
]
...
...
vllm/model_executor/models/jamba.py
View file @
96ae75ad
...
@@ -17,6 +17,7 @@ from vllm.model_executor.layers.linear import (QKVParallelLinear,
...
@@ -17,6 +17,7 @@ from vllm.model_executor.layers.linear import (QKVParallelLinear,
RowParallelLinear
)
RowParallelLinear
)
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.layers.mamba.mamba_mixer
import
MambaMixer
from
vllm.model_executor.layers.mamba.mamba_mixer
import
MambaMixer
from
vllm.model_executor.layers.pooler
import
Pooler
,
PoolingType
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
from
vllm.model_executor.layers.vocab_parallel_embedding
import
(
...
@@ -24,8 +25,9 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
...
@@ -24,8 +25,9 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.mamba_cache
import
(
MambaCacheManager
,
from
vllm.model_executor.models.mamba_cache
import
(
MambaCacheManager
,
MambaCacheParams
)
MambaCacheParams
)
from
vllm.model_executor.pooling_metadata
import
PoolingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
,
PoolerOutput
from
vllm.utils
import
LayerBlockType
from
vllm.utils
import
LayerBlockType
from
.interfaces
import
HasInnerState
,
IsHybrid
,
SupportsLoRA
,
SupportsPP
from
.interfaces
import
HasInnerState
,
IsHybrid
,
SupportsLoRA
,
SupportsPP
...
@@ -593,3 +595,35 @@ def _is_moe_layer(name: str):
...
@@ -593,3 +595,35 @@ def _is_moe_layer(name: str):
"experts"
,
"experts"
,
"router"
,
"router"
,
]])
]])
class JambaForSequenceClassification(JambaForCausalLM):
    """Jamba causal LM extended with a linear classification head.

    The head (``score``) projects hidden states of size
    ``hf_config.hidden_size`` to ``hf_config.num_labels`` logits.  The
    logits are then handed to a pooler whose defaults are last-token
    pooling with neither normalization nor softmax.
    """

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        """Build the base Jamba model, then attach the head and pooler.

        Args:
            vllm_config: Engine configuration; the HF config and the pooler
                config are both read from ``vllm_config.model_config``.
            prefix: Module-name prefix forwarded unchanged to the base class.
        """
        super().__init__(vllm_config=vllm_config, prefix=prefix)

        hf_config = vllm_config.model_config.hf_config
        label_count: int = hf_config.num_labels
        # ``score_bias`` is optional in the HF config; the head has no bias
        # unless the config asks for one.
        use_bias: bool = getattr(hf_config, 'score_bias', False)
        self.score = nn.Linear(
            hf_config.hidden_size,
            label_count,
            bias=use_bias,
        )

        self._pooler = Pooler.from_config_with_defaults(
            vllm_config.model_config.pooler_config,
            pooling_type=PoolingType.LAST,
            normalize=False,
            softmax=False,
        )

    def pooler(
        self,
        hidden_states: torch.Tensor,
        pooling_metadata: PoolingMetadata,
    ) -> Optional[PoolerOutput]:
        """Score the hidden states and pool the resulting logits.

        The hidden states are cast to float32 before the head is applied,
        matching the float32 head set up in ``load_weights``.
        """
        logits = self.score(hidden_states.float())
        return self._pooler(logits, pooling_metadata)

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        """Load weights through the base class, then make the head float32."""
        # TODO: The reward weights themselves have float32 accuracy data, we
        # would like to load them in fp32 to get that extra precision.
        super().load_weights(weights)
        # The cast happens only after loading, so the head is float32 from
        # here on regardless of the dtype it was loaded in.
        self.score = self.score.float()
vllm/model_executor/models/llava.py
View file @
96ae75ad
...
@@ -133,8 +133,8 @@ class LlavaMultiModalProcessor(BaseMultiModalProcessor):
...
@@ -133,8 +133,8 @@ class LlavaMultiModalProcessor(BaseMultiModalProcessor):
hf_processor
.
__is_patched__
=
True
# type: ignore
hf_processor
.
__is_patched__
=
True
# type: ignore
def
_get_hf_processor
(
self
)
->
Union
[
LlavaProcessor
,
PixtralProcessor
]:
def
_get_hf_processor
(
self
)
->
Union
[
LlavaProcessor
,
PixtralProcessor
]:
hf_processor
=
self
.
ctx
.
get_hf_processor
(
)
hf_processor
=
self
.
ctx
.
get_hf_processor
(
assert
isinstance
(
hf_processor
,
(
LlavaProcessor
,
PixtralProcessor
))
(
LlavaProcessor
,
PixtralProcessor
))
if
isinstance
(
hf_processor
,
PixtralProcessor
):
if
isinstance
(
hf_processor
,
PixtralProcessor
):
self
.
_patch_pixtral_processor
(
hf_processor
)
self
.
_patch_pixtral_processor
(
hf_processor
)
...
...
vllm/model_executor/models/molmo.py
View file @
96ae75ad
...
@@ -464,24 +464,27 @@ class MolmoAttention(nn.Module):
...
@@ -464,24 +464,27 @@ class MolmoAttention(nn.Module):
class
MolmoMLP
(
nn
.
Module
):
class
MolmoMLP
(
nn
.
Module
):
"""Molmo's LLM mlp."""
"""Molmo's LLM mlp."""
def
__init__
(
def
__init__
(
self
,
self
,
config
:
PretrainedConfig
,
config
:
PretrainedConfig
,
input_dim
:
Optional
[
int
]
=
None
,
input_dim
:
Optional
[
int
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
quant_config
:
Optional
[
QuantizationConfig
]
=
None
,
proj_name
:
str
=
"gate_up_proj"
)
->
None
:
)
->
None
:
super
().
__init__
()
super
().
__init__
()
self
.
hidden_size
=
config
.
hidden_size
self
.
hidden_size
=
config
.
hidden_size
self
.
intermediate_size
=
config
.
intermediate_size
//
2
self
.
intermediate_size
=
config
.
intermediate_size
//
2
# Feed-forward input projection.
# Molmo's LLM proj weights are already merged into the disk, while
self
.
gate_up_proj
=
MergedColumnParallelLinear
(
# image_projector proj is separate. If the same proj_name were used, it
input_dim
or
self
.
hidden_size
,
# would create ambiguity and make it difficult to support BNB and LoRA.
[
self
.
intermediate_size
]
*
2
,
self
.
proj_name
=
proj_name
bias
=
False
,
setattr
(
quant_config
=
quant_config
,
self
,
proj_name
,
)
MergedColumnParallelLinear
(
input_dim
or
self
.
hidden_size
,
[
self
.
intermediate_size
]
*
2
,
bias
=
False
,
quant_config
=
quant_config
,
))
# Activation function.
# Activation function.
self
.
act_fn
=
SiluAndMul
()
self
.
act_fn
=
SiluAndMul
()
...
@@ -497,7 +500,7 @@ class MolmoMLP(nn.Module):
...
@@ -497,7 +500,7 @@ class MolmoMLP(nn.Module):
self
,
self
,
x
:
torch
.
Tensor
,
x
:
torch
.
Tensor
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
gate_up
,
_
=
self
.
gate_up_proj
(
x
)
gate_up
,
_
=
getattr
(
self
,
self
.
proj_name
)
(
x
)
x
=
self
.
act_fn
(
gate_up
)
x
=
self
.
act_fn
(
gate_up
)
x
,
_
=
self
.
down_proj
(
x
)
x
,
_
=
self
.
down_proj
(
x
)
return
x
return
x
...
@@ -520,7 +523,9 @@ class MolmoDecoderLayer(nn.Module):
...
@@ -520,7 +523,9 @@ class MolmoDecoderLayer(nn.Module):
prefix
=
f
"
{
prefix
}
.self_attn"
)
prefix
=
f
"
{
prefix
}
.self_attn"
)
# MLP block.
# MLP block.
self
.
mlp
=
MolmoMLP
(
config
,
quant_config
=
quant_config
)
self
.
mlp
=
MolmoMLP
(
config
,
quant_config
=
quant_config
,
proj_name
=
"gate_up_proj"
)
# LayerNorm
# LayerNorm
assert
config
.
layer_norm_type
==
"rms"
assert
config
.
layer_norm_type
==
"rms"
...
@@ -616,6 +621,7 @@ class MolmoVisionBackbone(nn.Module):
...
@@ -616,6 +621,7 @@ class MolmoVisionBackbone(nn.Module):
config
,
config
,
input_dim
=
vision_config
.
image_emb_dim
,
input_dim
=
vision_config
.
image_emb_dim
,
quant_config
=
quant_config
,
quant_config
=
quant_config
,
proj_name
=
"merged_linear"
,
)
)
image_dim
=
vision_config
.
image_emb_dim
*
len
(
self
.
vit_layers
)
image_dim
=
vision_config
.
image_emb_dim
*
len
(
self
.
vit_layers
)
...
@@ -714,8 +720,8 @@ class MolmoVisionBackbone(nn.Module):
...
@@ -714,8 +720,8 @@ class MolmoVisionBackbone(nn.Module):
torch
.
Tensor
]])
->
Set
[
str
]:
torch
.
Tensor
]])
->
Set
[
str
]:
stacked_params_mapping
=
[
stacked_params_mapping
=
[
# (param_name, shard_name, shard_id)
# (param_name, shard_name, shard_id)
(
"
gate_up_proj
"
,
"gate_proj"
,
0
),
(
"
merged_linear
"
,
"gate_proj"
,
0
),
(
"
gate_up_proj
"
,
"up_proj"
,
1
),
(
"
merged_linear
"
,
"up_proj"
,
1
),
]
]
params_dict
=
dict
(
self
.
named_parameters
())
params_dict
=
dict
(
self
.
named_parameters
())
loaded_params
:
Set
[
str
]
=
set
()
loaded_params
:
Set
[
str
]
=
set
()
...
@@ -928,7 +934,11 @@ def image_input_mapper_for_molmo(
...
@@ -928,7 +934,11 @@ def image_input_mapper_for_molmo(
data
:
object
,
data
:
object
,
):
):
if
isinstance
(
data
,
list
):
if
isinstance
(
data
,
list
):
assert
len
(
data
)
==
1
,
"Molmo supports only one image per prompt."
data
=
data
[
0
]
data
=
data
[
0
]
# Remove unused dummy PIL image
data
.
pop
(
'raw_mm_data'
,
None
)
return
MultiModalKwargs
(
data
)
return
MultiModalKwargs
(
data
)
...
@@ -974,6 +984,7 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
...
@@ -974,6 +984,7 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
dummy_imgdata
=
{
dummy_imgdata
=
{
"images"
:
out
[
"images"
],
"images"
:
out
[
"images"
],
"image_input_idx"
:
out
[
"image_input_idx"
],
"image_input_idx"
:
out
[
"image_input_idx"
],
"raw_mm_data"
:
dummy_image
,
}
}
if
"image_masks"
in
out
:
if
"image_masks"
in
out
:
dummy_imgdata
[
"image_masks"
]
=
out
[
"image_masks"
]
dummy_imgdata
[
"image_masks"
]
=
out
[
"image_masks"
]
...
@@ -1118,6 +1129,34 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs):
...
@@ -1118,6 +1129,34 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs):
@
INPUT_REGISTRY
.
register_input_processor
(
input_processor_for_molmo
)
@
INPUT_REGISTRY
.
register_input_processor
(
input_processor_for_molmo
)
class
MolmoForCausalLM
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
class
MolmoForCausalLM
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_substr
=
{
# vision backbone mapping
"image_projector.w1."
:
"image_projector.gate_proj."
,
"image_projector.w3."
:
"image_projector.up_proj."
,
"image_projector.w2."
:
"image_projector.down_proj."
,
# language backbone mapping
"att_proj"
:
"self_attn.qkv_proj"
,
"attn_out"
:
"self_attn.o_proj"
,
"q_norm"
:
"self_attn.q_norm"
,
"k_norm"
:
"self_attn.k_norm"
,
"ff_proj"
:
"mlp.gate_up_proj"
,
"ff_out"
:
"mlp.down_proj"
,
"attn_norm"
:
"input_layernorm"
,
"ff_norm"
:
"post_attention_layernorm"
,
},
orig_to_new_prefix
=
{
# vision backbone mapping
"model.vision_backbone."
:
"vision_backbone."
,
# language backbone mapping
"model.transformer.blocks."
:
"model.layers."
,
"model.transformer.ln_f."
:
"model.norm."
,
# lm_head is renamed to model.transformer.mlp.down_proj firstly,
# we need to run a second renaming for it
"model.transformer.mlp.down_proj."
:
"lm_head."
,
},
)
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
config
=
vllm_config
.
model_config
.
hf_config
...
@@ -1293,36 +1332,10 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -1293,36 +1332,10 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
return
next_tokens
return
next_tokens
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]]):
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]]):
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_substr
=
{
# vision backbone mapping
"image_projector.w1."
:
"image_projector.gate_proj."
,
"image_projector.w3."
:
"image_projector.up_proj."
,
"image_projector.w2."
:
"image_projector.down_proj."
,
# language backbone mapping
"att_proj"
:
"self_attn.qkv_proj"
,
"attn_out"
:
"self_attn.o_proj"
,
"q_norm"
:
"self_attn.q_norm"
,
"k_norm"
:
"self_attn.k_norm"
,
"ff_proj"
:
"mlp.gate_up_proj"
,
"ff_out"
:
"mlp.down_proj"
,
"attn_norm"
:
"input_layernorm"
,
"ff_norm"
:
"post_attention_layernorm"
,
},
orig_to_new_prefix
=
{
# vision backbone mapping
"model.vision_backbone."
:
"vision_backbone."
,
# language backbone mapping
"model.transformer.blocks."
:
"model.layers."
,
"model.transformer.ln_f."
:
"model.norm."
,
# lm_head is renamed to model.transformer.mlp.down_proj firstly,
# we need to run a second renaming for it
"model.transformer.mlp.down_proj."
:
"lm_head."
,
},
)
loader
=
AutoWeightsLoader
(
self
)
loader
=
AutoWeightsLoader
(
self
)
weights
=
_get_weights_with_merged_embedding
(
weights
)
weights
=
_get_weights_with_merged_embedding
(
weights
)
return
loader
.
load_weights
(
weights
,
mapper
=
hf_to_vllm_mapper
)
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
def
_get_weights_with_merged_embedding
(
def
_get_weights_with_merged_embedding
(
...
...
vllm/model_executor/models/phi3v.py
View file @
96ae75ad
...
@@ -34,7 +34,6 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
...
@@ -34,7 +34,6 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
from
vllm.multimodal.inputs
import
NestedTensors
from
vllm.multimodal.inputs
import
NestedTensors
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
from
vllm.multimodal.processing
import
(
BaseMultiModalProcessor
,
MultiModalDataDict
,
MultiModalDataItems
,
ProcessorInputs
,
MultiModalDataItems
,
ProcessorInputs
,
PromptReplacement
)
PromptReplacement
)
from
vllm.sequence
import
IntermediateTensors
from
vllm.sequence
import
IntermediateTensors
...
@@ -302,11 +301,18 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
...
@@ -302,11 +301,18 @@ class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase):
return
image_features_hd_newline
return
image_features_hd_newline
def
get_max_phi3v_image_tokens
(
ctx
:
InputContext
)
->
int
:
def
get_max_phi3v_image_tokens
(
processor
=
ctx
.
get_hf_processor
()
ctx
:
InputContext
,
image_processor
=
processor
.
image_processor
# type: ignore
*
,
num_crops
:
Optional
[
int
]
=
None
,
)
->
int
:
mm_processor_kwargs
=
{}
if
num_crops
:
mm_processor_kwargs
[
"num_crops"
]
=
num_crops
return
image_processor
.
calc_num_image_tokens_from_image_size
(
processor
=
ctx
.
get_hf_processor
(
**
mm_processor_kwargs
)
return
processor
.
calc_num_image_tokens_from_image_size
(
width
=
MAX_IMAGE_FEATURE_SIZE_WIDTH
,
width
=
MAX_IMAGE_FEATURE_SIZE_WIDTH
,
height
=
MAX_IMAGE_FEATURE_SIZE_HEIGHT
,
height
=
MAX_IMAGE_FEATURE_SIZE_HEIGHT
,
)
)
...
@@ -323,20 +329,27 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor):
...
@@ -323,20 +329,27 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor):
return
self
.
ctx
.
get_hf_processor
(
num_crops
=
num_crops
)
return
self
.
ctx
.
get_hf_processor
(
num_crops
=
num_crops
)
return
self
.
ctx
.
get_hf_processor
()
return
self
.
ctx
.
get_hf_processor
()
def
_
apply
_hf_processor
(
def
_
call
_hf_processor
(
self
,
self
,
hf_processor
:
ProcessorMixin
,
prompt
:
str
,
prompt
:
str
,
mm_data
:
MultiModalDataDi
ct
,
processor_data
:
Mapping
[
str
,
obje
ct
]
,
mm_processor_kwargs
:
Mapping
[
str
,
object
],
mm_processor_kwargs
:
Mapping
[
str
,
object
],
)
->
BatchFeature
:
)
->
BatchFeature
:
processed_outputs
=
super
().
_apply_hf_processor
(
processed_outputs
=
super
().
_call_hf_processor
(
prompt
,
mm_data
,
mm_processor_kwargs
)
hf_processor
,
prompt
=
prompt
,
processor_data
=
processor_data
,
mm_processor_kwargs
=
mm_processor_kwargs
,
)
# Phi3v processor has inserted -1, -2 etc as placeholder in prompt_ids,
# Phi3v processor has inserted -1, -2 etc as placeholder in prompt_ids,
# which will cause OverflowError when decoding the prompt_ids.
# which will cause OverflowError when decoding the prompt_ids.
# Therefore, we need to do an early replacement here
# Therefore, we need to do an early replacement here
token_ids
=
processed_outputs
[
'input_ids'
]
token_ids
=
processed_outputs
[
'input_ids'
]
token_ids
[
token_ids
<
0
]
=
_IMAGE_TOKEN_ID
token_ids
[
token_ids
<
0
]
=
_IMAGE_TOKEN_ID
processed_outputs
[
'input_ids'
]
=
token_ids
processed_outputs
[
'input_ids'
]
=
token_ids
return
processed_outputs
return
processed_outputs
def
_get_prompt_replacements
(
def
_get_prompt_replacements
(
...
@@ -395,6 +408,13 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor):
...
@@ -395,6 +408,13 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor):
@
MULTIMODAL_REGISTRY
.
register_max_image_tokens
(
get_max_phi3v_image_tokens
)
@
MULTIMODAL_REGISTRY
.
register_max_image_tokens
(
get_max_phi3v_image_tokens
)
@
MULTIMODAL_REGISTRY
.
register_processor
(
Phi3VMultiModalProcessor
)
@
MULTIMODAL_REGISTRY
.
register_processor
(
Phi3VMultiModalProcessor
)
class
Phi3VForCausalLM
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
class
Phi3VForCausalLM
(
nn
.
Module
,
SupportsMultiModal
,
SupportsPP
):
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"model.vision_embed_tokens.wte"
:
"embed_tokens"
,
"model.vision_embed_tokens."
:
"vision_embed_tokens."
,
"lm_head."
:
"language_model.lm_head."
,
"model."
:
"language_model.model."
,
})
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
super
().
__init__
()
...
@@ -603,17 +623,10 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
...
@@ -603,17 +623,10 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]])
->
Set
[
str
]:
torch
.
Tensor
]])
->
Set
[
str
]:
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"model.vision_embed_tokens.wte"
:
"embed_tokens"
,
"model.vision_embed_tokens."
:
"vision_embed_tokens."
,
"lm_head."
:
"language_model.lm_head."
,
"model."
:
"language_model.model."
,
})
loader
=
AutoWeightsLoader
(
self
)
loader
=
AutoWeightsLoader
(
self
)
autoloaded_weights
=
loader
.
load_weights
(
weights
,
autoloaded_weights
=
loader
.
load_weights
(
weights
,
mapper
=
hf_to_vllm_mapper
)
mapper
=
self
.
hf_to_vllm_mapper
)
# The HF config doesn't specify whether these are tied,
# The HF config doesn't specify whether these are tied,
# so we detect it this way
# so we detect it this way
...
...
vllm/model_executor/models/pixtral.py
View file @
96ae75ad
...
@@ -10,12 +10,12 @@ from mistral_common.protocol.instruct.messages import ImageChunk
...
@@ -10,12 +10,12 @@ from mistral_common.protocol.instruct.messages import ImageChunk
from
PIL
import
Image
from
PIL
import
Image
from
transformers
import
PixtralVisionConfig
from
transformers
import
PixtralVisionConfig
from
transformers.models.pixtral.image_processing_pixtral
import
(
from
transformers.models.pixtral.image_processing_pixtral
import
(
_num_image_tokens
)
_num_image_tokens
as
_get_pixtral_hf_num_image_tokens
)
from
transformers.models.pixtral.modeling_pixtral
import
(
from
transformers.models.pixtral.modeling_pixtral
import
(
PixtralRotaryEmbedding
,
apply_rotary_pos_emb
,
position_ids_in_meshgrid
)
PixtralRotaryEmbedding
,
apply_rotary_pos_emb
,
position_ids_in_meshgrid
)
from
vllm.attention
import
AttentionMetadata
from
vllm.attention
import
AttentionMetadata
from
vllm.config
import
ModelConfig
,
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.distributed
import
divide
,
get_tensor_model_parallel_world_size
from
vllm.distributed
import
divide
,
get_tensor_model_parallel_world_size
from
vllm.inputs
import
(
INPUT_REGISTRY
,
DecoderOnlyInputs
,
DummyData
,
from
vllm.inputs
import
(
INPUT_REGISTRY
,
DecoderOnlyInputs
,
DummyData
,
InputContext
,
token_inputs
)
InputContext
,
token_inputs
)
...
@@ -27,7 +27,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
...
@@ -27,7 +27,6 @@ from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.layers.sampler
import
SamplerOutput
,
get_sampler
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.model_loader.weight_utils
import
default_weight_loader
from
vllm.model_executor.models.utils
import
merge_multimodal_embeddings
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.model_executor.sampling_metadata
import
SamplingMetadata
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargs
from
vllm.multimodal
import
MULTIMODAL_REGISTRY
,
MultiModalKwargs
from
vllm.multimodal.inputs
import
NestedTensors
,
PlaceholderRange
from
vllm.multimodal.inputs
import
NestedTensors
,
PlaceholderRange
...
@@ -35,11 +34,10 @@ from vllm.multimodal.utils import (cached_get_tokenizer,
...
@@ -35,11 +34,10 @@ from vllm.multimodal.utils import (cached_get_tokenizer,
consecutive_placeholder_ranges
,
consecutive_placeholder_ranges
,
resolve_visual_encoder_outputs
)
resolve_visual_encoder_outputs
)
from
vllm.sequence
import
IntermediateTensors
,
SequenceData
from
vllm.sequence
import
IntermediateTensors
,
SequenceData
from
vllm.transformers_utils.processor
import
cached_get_processor
from
vllm.utils
import
is_list_of
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.interfaces
import
SupportsMultiModal
,
SupportsPP
from
.utils
import
init_vllm_registered_model
,
maybe_prefix
from
.utils
import
(
init_vllm_registered_model
,
maybe_prefix
,
merge_multimodal_embeddings
)
try
:
try
:
from
xformers
import
ops
as
xops
from
xformers
import
ops
as
xops
...
@@ -47,8 +45,12 @@ try:
...
@@ -47,8 +45,12 @@ try:
except
ImportError
:
except
ImportError
:
USE_XFORMERS_OPS
=
False
USE_XFORMERS_OPS
=
False
PIXTRAL_IMAGE_BREAK_ID
=
12
# These token ids cannot be retrieved from model config
PIXTRAL_IMAGE_END_ID
=
13
# so we hardcode them here.
PIXTRAL_12B_IMAGE_BREAK_ID
=
12
PIXTRAL_12B_IMAGE_END_ID
=
13
PIXTRAL_LARGE_IMAGE_BREAK_ID
=
14
PIXTRAL_LARGE_IMAGE_END_ID
=
15
def
get_max_pixtral_image_tokens
(
ctx
:
InputContext
):
def
get_max_pixtral_image_tokens
(
ctx
:
InputContext
):
...
@@ -120,8 +122,7 @@ def input_mapper_for_pixtral(ctx: InputContext,
...
@@ -120,8 +122,7 @@ def input_mapper_for_pixtral(ctx: InputContext,
for
image_data
in
data_list
:
for
image_data
in
data_list
:
image
=
ImageChunk
(
image
=
image_data
)
image
=
ImageChunk
(
image
=
image_data
)
encoding
=
tokenizer
.
instruct
.
mm_encoder
(
image
)
encoding
=
tokenizer
.
instruct
.
mm_encoder
(
image
)
image
=
torch
.
from_numpy
(
encoding
.
image
).
to
(
device
=
"cuda"
,
image
=
torch
.
from_numpy
(
encoding
.
image
).
to
(
dtype
=
torch
.
float16
)
dtype
=
torch
.
float16
)
images
.
append
(
image
)
images
.
append
(
image
)
image_tokens_list
.
append
(
encoding
.
tokens
)
image_tokens_list
.
append
(
encoding
.
tokens
)
...
@@ -239,8 +240,9 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -239,8 +240,9 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
# NOTE: Image embeddings are split into separate tensors for each image
# NOTE: Image embeddings are split into separate tensors for each image
# by the indices of `[IMG_END]` token.
# by the indices of `[IMG_END]` token.
split_indices
=
torch
.
where
(
image_end_condition
=
(
image_tokens
==
PIXTRAL_12B_IMAGE_END_ID
)
|
(
image_tokens
==
PIXTRAL_IMAGE_END_ID
)[
0
]
+
1
image_tokens
==
PIXTRAL_LARGE_IMAGE_END_ID
)
split_indices
=
torch
.
where
(
image_end_condition
)[
0
]
+
1
if
len
(
split_indices
)
<=
1
:
if
len
(
split_indices
)
<=
1
:
# Do not split, return as tensor of shape [1, fs, hs]
# Do not split, return as tensor of shape [1, fs, hs]
return
image_embeds
.
unsqueeze
(
0
)
return
image_embeds
.
unsqueeze
(
0
)
...
@@ -262,8 +264,11 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
...
@@ -262,8 +264,11 @@ class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal,
if
multimodal_embeddings
is
not
None
:
if
multimodal_embeddings
is
not
None
:
inputs_embeds
=
merge_multimodal_embeddings
(
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
multimodal_embeddings
,
[
input_ids
,
inputs_embeds
,
multimodal_embeddings
,
[
self
.
vision_args
.
image_token_id
,
PIXTRAL_IMAGE_END_ID
,
self
.
vision_args
.
image_token_id
,
PIXTRAL_IMAGE_BREAK_ID
PIXTRAL_12B_IMAGE_END_ID
,
PIXTRAL_12B_IMAGE_BREAK_ID
,
PIXTRAL_LARGE_IMAGE_BREAK_ID
,
PIXTRAL_LARGE_IMAGE_END_ID
,
])
])
return
inputs_embeds
return
inputs_embeds
...
@@ -699,37 +704,14 @@ def get_pixtral_hf_num_patches(*, image_size: int, patch_size: int) -> int:
...
@@ -699,37 +704,14 @@ def get_pixtral_hf_num_patches(*, image_size: int, patch_size: int) -> int:
return
grid_length
*
grid_length
return
grid_length
*
grid_length
def
get_max_pixtral_hf_image_feature_size
(
hf_config
:
PixtralVisionConfig
)
->
int
:
return
get_pixtral_hf_num_patches
(
image_size
=
hf_config
.
image_size
,
patch_size
=
hf_config
.
patch_size
)
def
get_max_pixtral_hf_image_tokens
(
hf_config
:
PixtralVisionConfig
)
->
int
:
def
get_max_pixtral_hf_image_tokens
(
hf_config
:
PixtralVisionConfig
)
->
int
:
return
get_max_pixtral_hf_image_feature_size
(
hf_config
)
grid_length
=
get_pixtral_hf_patch_grid_length
(
image_size
=
hf_config
.
image_size
,
patch_size
=
hf_config
.
patch_size
,
)
# Consider the image_break_token
def
dummy_seq_data_for_pixtral_hf
(
return
(
grid_length
+
1
)
*
grid_length
hf_config
:
PixtralVisionConfig
,
seq_len
:
int
,
num_images
:
int
,
*
,
image_token_id
:
int
,
image_feature_size_override
:
Optional
[
int
]
=
None
,
mm_key
:
str
=
"image"
):
if
image_feature_size_override
is
None
:
image_feature_size
=
get_max_pixtral_hf_image_feature_size
(
hf_config
)
else
:
image_feature_size
=
image_feature_size_override
return
SequenceData
.
from_prompt_token_counts
(
(
image_token_id
,
image_feature_size
*
num_images
),
(
0
,
seq_len
-
image_feature_size
*
num_images
),
),
{
mm_key
:
consecutive_placeholder_ranges
(
num_items
=
num_images
,
item_size
=
image_feature_size
)
}
def
dummy_image_for_pixtral_hf
(
def
dummy_image_for_pixtral_hf
(
...
@@ -763,116 +745,14 @@ def get_pixtral_hf_image_feature_size(hf_config: PixtralVisionConfig,
...
@@ -763,116 +745,14 @@ def get_pixtral_hf_image_feature_size(hf_config: PixtralVisionConfig,
image_width
=
int
(
numpy
.
ceil
(
image_width
/
ratio
))
image_width
=
int
(
numpy
.
ceil
(
image_width
/
ratio
))
image_height
=
int
(
numpy
.
ceil
(
image_height
/
ratio
))
image_height
=
int
(
numpy
.
ceil
(
image_height
/
ratio
))
num_height_tokens
,
num_width_tokens
=
_num_image_tokens
(
num_height_tokens
,
num_width_tokens
=
_get_pixtral_hf_num_image_tokens
(
(
image_height
,
image_width
),
(
patch_height
,
patch_width
))
(
image_height
,
image_width
),
(
patch_height
,
patch_width
),
)
return
num_width_tokens
,
num_height_tokens
return
num_width_tokens
,
num_height_tokens
def input_processor_for_pixtral_hf(
    model_config: ModelConfig,
    hf_config: PixtralVisionConfig,
    inputs: DecoderOnlyInputs,
    *,
    image_token_id: int,
    image_feature_size_override: Optional[Union[int, List[int]]] = None,
) -> DecoderOnlyInputs:
    """Expand each image placeholder of a Pixtral-HF prompt into its grid.

    Every image placeholder (in the prompt text, if present, and in the
    prompt token ids) is replaced by ``num_height_tokens`` rows, each made
    of ``num_width_tokens`` image tokens followed by one image-break token.
    The very last token of the expansion is the image-end token instead of
    a break.  The grid size of each image comes from
    ``get_pixtral_hf_image_feature_size``.

    Args:
        model_config: Only ``model_config.model`` is read, to look up the
            cached HF processor.
        hf_config: Vision config forwarded to
            ``get_pixtral_hf_image_feature_size``.
        inputs: Decoder-only inputs.  Returned untouched when they carry no
            ``"image"`` entry in ``multi_modal_data``.
        image_token_id: Kept for interface compatibility.  The id actually
            matched against the prompt is re-derived from the processor's
            tokenizer, so the passed value is not used.
        image_feature_size_override: Must be ``None``; overrides are not
            supported for Pixtral.

    Returns:
        New token inputs with the expanded prompt / token ids and one
        ``PlaceholderRange`` per image under ``"image"``.  The caller's
        ``prompt_token_ids`` list is left unmodified.

    Raises:
        TypeError: If the image data is neither a PIL image nor a list of
            PIL images.
        AssertionError: If an override is given, or if the number of image
            placeholders does not match the number of images.
    """
    assert image_feature_size_override is None, (
        "image_feature_size_override is not supported for Pixtral")

    multi_modal_data = inputs.get("multi_modal_data")
    if multi_modal_data is None or "image" not in multi_modal_data:
        return inputs

    processor = cached_get_processor(model_config.model)

    image_data = multi_modal_data["image"]
    if isinstance(image_data, Image.Image):
        image_data = [image_data]
    elif not is_list_of(image_data, Image.Image):
        raise TypeError(f"Invalid image type: {type(image_data)}")

    image_token = processor.image_token
    image_break_token = processor.image_break_token
    image_end_token = processor.image_end_token

    def expand(fill, row_break, end, grid):
        # One row is `num_width_tokens` fills plus a break; the final break
        # of the last row is overwritten with the end marker.
        num_width_tokens, num_height_tokens = grid
        row = [fill] * num_width_tokens + [row_break]
        expanded = row * num_height_tokens
        expanded[-1] = end
        return expanded

    new_prompt = inputs.get("prompt")
    parts = None
    if new_prompt:
        parts = new_prompt.split(image_token)
        assert len(parts) - 1 == len(image_data)

    # Resolve the tokenizer ids.  NOTE: this deliberately shadows the
    # `image_token_id` keyword argument, as the original code did.
    convert_tokens_to_ids = processor.tokenizer.convert_tokens_to_ids
    image_token_id = convert_tokens_to_ids(image_token)
    image_break_id = convert_tokens_to_ids(image_break_token)
    image_end_id = convert_tokens_to_ids(image_end_token)

    # Work from a snapshot so the caller's list is never mutated.
    original_token_ids = list(inputs["prompt_token_ids"])

    # Find all image token indices at once
    placeholder_indices = [
        idx for idx, token_id in enumerate(original_token_ids)
        if token_id == image_token_id
    ]
    assert len(placeholder_indices) == len(image_data)

    # The grid of each image is needed for both the text and the id
    # expansion, so compute it once per image.
    grids = []
    for image in image_data:
        w, h = image.size
        grids.append(
            get_pixtral_hf_image_feature_size(hf_config,
                                              image_width=w,
                                              image_height=h))

    # Update new_prompt if present
    if parts is not None:
        new_parts = [parts[0]]  # Start with the part before any image tokens
        for grid, next_part in zip(grids, parts[1:]):
            new_parts.extend(
                expand(image_token, image_break_token, image_end_token, grid))
            new_parts.append(next_part)
        new_prompt = "".join(new_parts)

    # Rebuild the id sequence front to back.  Each placeholder's offset is
    # simply the output length at the moment its expansion is appended,
    # which avoids the off-by-one of measuring from the end before the
    # placeholder token itself has been removed.
    new_token_ids: List[int] = []
    placeholder_ranges: List[PlaceholderRange] = []
    cursor = 0
    for placeholder_idx, grid in zip(placeholder_indices, grids):
        new_token_ids.extend(original_token_ids[cursor:placeholder_idx])
        replace_tokens = expand(image_token_id, image_break_id, image_end_id,
                                grid)
        placeholder_ranges.append(
            PlaceholderRange(
                offset=len(new_token_ids),
                length=len(replace_tokens),
            ))
        new_token_ids.extend(replace_tokens)
        cursor = placeholder_idx + 1  # skip the single placeholder token
    new_token_ids.extend(original_token_ids[cursor:])

    return token_inputs(prompt_token_ids=new_token_ids,
                        prompt=new_prompt,
                        multi_modal_data=multi_modal_data,
                        multi_modal_placeholders={"image": placeholder_ranges})
class
PixtralHFMLP
(
nn
.
Module
):
class
PixtralHFMLP
(
nn
.
Module
):
def
__init__
(
def
__init__
(
...
...
vllm/model_executor/models/qwen2.py
View file @
96ae75ad
...
@@ -663,6 +663,8 @@ class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP):
...
@@ -663,6 +663,8 @@ class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP):
embedding_modules
=
{}
embedding_modules
=
{}
embedding_padding_modules
=
[]
embedding_padding_modules
=
[]
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"model."
:
""
})
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
()
super
().
__init__
()
config
=
vllm_config
.
model_config
.
hf_config
config
=
vllm_config
.
model_config
.
hf_config
...
@@ -677,8 +679,8 @@ class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP):
...
@@ -677,8 +679,8 @@ class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP):
self
.
model
=
Qwen2Model
(
vllm_config
=
vllm_config
,
self
.
model
=
Qwen2Model
(
vllm_config
=
vllm_config
,
prefix
=
maybe_prefix
(
prefix
,
"model"
))
prefix
=
maybe_prefix
(
prefix
,
"model"
))
# TODO: Replace this model class with for_embedding(Qwen2ForCausalLM),
# TODO: Replace this model class with as_embedding_model(
# after changing the default pooling method
# Qwen2ForCausalLM) after changing the default pooling method
if
pooler_config
.
pooling_type
is
None
:
if
pooler_config
.
pooling_type
is
None
:
logger
.
warning
(
logger
.
warning
(
"This embedding model will default to last-token pooling in "
"This embedding model will default to last-token pooling in "
...
@@ -711,8 +713,7 @@ class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP):
...
@@ -711,8 +713,7 @@ class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP):
return
self
.
_pooler
(
hidden_states
,
pooling_metadata
)
return
self
.
_pooler
(
hidden_states
,
pooling_metadata
)
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]]):
def
load_weights
(
self
,
weights
:
Iterable
[
Tuple
[
str
,
torch
.
Tensor
]]):
hf_to_vllm_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"model."
:
""
})
weights
=
self
.
hf_to_vllm_mapper
.
apply
(
weights
)
weights
=
hf_to_vllm_mapper
.
apply
(
weights
)
weights
=
((
name
,
data
)
for
name
,
data
in
weights
weights
=
((
name
,
data
)
for
name
,
data
in
weights
if
not
name
.
startswith
(
"lm_head."
))
if
not
name
.
startswith
(
"lm_head."
))
self
.
model
.
load_weights
(
weights
)
self
.
model
.
load_weights
(
weights
)
Prev
1
…
12
13
14
15
16
17
18
19
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment