Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
53076d70
Commit
53076d70
authored
Mar 24, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.2' into v0.8.2-ori
parents
322a0be6
9c5c81b0
Changes
219
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
334 additions
and
367 deletions
+334
-367
vllm/model_executor/models/gemma3_mm.py
vllm/model_executor/models/gemma3_mm.py
+1
-1
vllm/model_executor/models/h2ovl.py
vllm/model_executor/models/h2ovl.py
+6
-16
vllm/model_executor/models/internvl.py
vllm/model_executor/models/internvl.py
+104
-69
vllm/model_executor/models/llava.py
vllm/model_executor/models/llava.py
+7
-1
vllm/model_executor/models/llava_onevision.py
vllm/model_executor/models/llava_onevision.py
+38
-45
vllm/model_executor/models/mllama.py
vllm/model_executor/models/mllama.py
+1
-1
vllm/model_executor/models/nvlm_d.py
vllm/model_executor/models/nvlm_d.py
+8
-13
vllm/model_executor/models/qwen2_5_vl.py
vllm/model_executor/models/qwen2_5_vl.py
+23
-8
vllm/model_executor/models/qwen2_vl.py
vllm/model_executor/models/qwen2_vl.py
+12
-6
vllm/model_executor/models/registry.py
vllm/model_executor/models/registry.py
+8
-5
vllm/model_executor/models/teleflm.py
vllm/model_executor/models/teleflm.py
+79
-0
vllm/multimodal/processing.py
vllm/multimodal/processing.py
+4
-4
vllm/outputs.py
vllm/outputs.py
+6
-1
vllm/platforms/__init__.py
vllm/platforms/__init__.py
+0
-17
vllm/platforms/cuda.py
vllm/platforms/cuda.py
+20
-12
vllm/platforms/interface.py
vllm/platforms/interface.py
+3
-5
vllm/platforms/openvino.py
vllm/platforms/openvino.py
+0
-152
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+3
-2
vllm/platforms/tpu.py
vllm/platforms/tpu.py
+4
-0
vllm/spec_decode/spec_decode_worker.py
vllm/spec_decode/spec_decode_worker.py
+7
-9
No files found.
vllm/model_executor/models/gemma3_mm.py
View file @
53076d70
...
...
@@ -183,7 +183,7 @@ class Gemma3ProcessingInfo(BaseProcessingInfo):
image_width
:
int
,
image_height
:
int
,
processor
:
Optional
[
Gemma3Processor
],
)
->
PromptUpdateDetails
:
)
->
PromptUpdateDetails
[
str
]
:
if
processor
is
None
:
processor
=
self
.
get_hf_processor
()
...
...
vllm/model_executor/models/h2ovl.py
View file @
53076d70
...
...
@@ -249,20 +249,15 @@ class H2OVLProcessor(BaseInternVLProcessor):
def
image_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
get_vocab
()[
IMG_CONTEXT
]
def
get_image_repl
_features
(
def
get_image_repl
(
self
,
feature_size
:
int
,
num_patches
:
Optional
[
int
],
)
->
str
:
return
IMG_CONTEXT
*
feature_size
)
->
PromptUpdateDetails
[
str
]:
repl_features
=
IMG_CONTEXT
*
feature_size
repl_full
=
IMG_START
+
repl_features
+
IMG_END
def
get_image_repl_full
(
self
,
feature_size
:
int
,
num_patches
:
Optional
[
int
],
)
->
str
:
features
=
self
.
get_image_repl_features
(
feature_size
,
num_patches
)
return
IMG_START
+
features
+
IMG_END
return
PromptUpdateDetails
(
full
=
repl_full
,
features
=
repl_features
)
def
resolve_min_max_num
(
self
,
...
...
@@ -501,12 +496,7 @@ class H2OVLMultiModalProcessor(InternVLMultiModalProcessor[H2OVLProcessingInfo]
if
num_patches
is
not
None
:
assert
isinstance
(
num_patches
,
int
)
return
PromptUpdateDetails
(
full
=
hf_processor
.
get_image_repl_full
(
feature_size
,
num_patches
),
features
=
hf_processor
.
get_image_repl_features
(
feature_size
,
num_patches
),
)
return
hf_processor
.
get_image_repl
(
feature_size
,
num_patches
)
return
[
PromptReplacement
(
...
...
vllm/model_executor/models/internvl.py
View file @
53076d70
...
...
@@ -9,14 +9,13 @@
from
abc
import
ABC
,
abstractmethod
from
collections.abc
import
Iterable
,
Mapping
,
Sequence
from
functools
import
cached_property
from
typing
import
(
List
,
Literal
,
Optional
,
Set
,
Tuple
,
TypedDict
,
TypeVar
,
Union
)
from
typing
import
Literal
,
Optional
,
Set
,
Tuple
,
TypedDict
,
TypeVar
,
Union
import
torch
import
torch.nn
as
nn
import
torchvision.transforms
as
T
from
PIL
import
Image
from
transformers
import
Batch
Feature
,
PretrainedConfig
,
TensorType
from
transformers
import
Batch
Encoding
,
PretrainedConfig
,
TensorType
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.quantization
import
QuantizationConfig
...
...
@@ -36,10 +35,12 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
from
vllm.multimodal.profiling
import
BaseDummyInputsBuilder
,
ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
vllm.utils
import
flatten_2d_lists
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
from
.utils
import
(
AutoWeightsLoader
,
flatten_bn
,
init_vllm_registered_model
,
maybe_prefix
,
merge_multimodal_embeddings
)
from
.vision
import
scatter_patch_features
,
select_patch_features
IMG_START
=
'<img>'
IMG_END
=
'</img>'
...
...
@@ -51,16 +52,26 @@ IMAGENET_STD = (0.229, 0.224, 0.225)
class
InternVLImagePixelInputs
(
TypedDict
):
type
:
Literal
[
"pixel_values"
]
d
at
a
:
torch
.
Tensor
pixel_values_fl
at
:
torch
.
Tensor
"""
Shape:
`(batch_size * num_images * (1 + num_patches), num_channels, height, width)`
"""
patches_per_image
:
List
[
int
]
num_patches
:
torch
.
Tensor
"""Shape: `(batch_size * num_images)`"""
embed_is_patch
:
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]]
"""
List of number of total patches for each image in the batch.
A boolean mask indicating which image embeddings correspond
to patch tokens.
Shape: `(batch_size, num_images, num_embeds)`
"""
num_embeds
:
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]]
"""Shape: `(batch_size, num_images)`"""
class
InternVLImageEmbeddingInputs
(
TypedDict
):
type
:
Literal
[
"image_embeds"
]
...
...
@@ -286,19 +297,11 @@ class BaseInternVLProcessor(ABC):
raise
NotImplementedError
@
abstractmethod
def
get_image_repl
_features
(
def
get_image_repl
(
self
,
feature_size
:
int
,
num_patches
:
Optional
[
int
],
)
->
str
:
raise
NotImplementedError
@
abstractmethod
def
get_image_repl_full
(
self
,
feature_size
:
int
,
num_patches
:
Optional
[
int
],
)
->
str
:
)
->
PromptUpdateDetails
[
str
]:
raise
NotImplementedError
def
resolve_min_max_num
(
...
...
@@ -394,7 +397,7 @@ class BaseInternVLProcessor(ABC):
max_dynamic_patch
:
Optional
[
int
]
=
None
,
dynamic_image_size
:
Optional
[
bool
]
=
None
,
return_tensors
:
Optional
[
Union
[
str
,
TensorType
]]
=
None
,
)
->
BatchFeature
:
)
->
Mapping
[
str
,
NestedTensors
]
:
if
text
is
None
:
text
=
[]
if
not
isinstance
(
text
,
list
):
...
...
@@ -413,28 +416,41 @@ class BaseInternVLProcessor(ABC):
max_dynamic_patch
=
max_dynamic_patch
,
dynamic_image_size
=
dynamic_image_size
,
)
image_inputs
=
{
"pixel_values_flat"
:
torch
.
cat
(
pixel_values_lst
),
"image_num_patches"
:
list
(
map
(
len
,
pixel_values_lst
)),
image_inputs
:
dict
[
str
,
NestedTensors
]
=
{
"pixel_values_flat"
:
torch
.
cat
(
pixel_values_lst
),
"image_num_patches"
:
torch
.
tensor
([
len
(
item
)
for
item
in
pixel_values_lst
]),
}
tokenizer
=
self
.
tokenizer
image_token_id
=
self
.
image_token_id
num_embeds
=
list
[
int
]()
embed_is_patch
=
list
[
torch
.
Tensor
]()
for
pixel_values
in
pixel_values_lst
:
num_patches
=
pixel_values
.
shape
[
0
]
feature_size
=
num_patches
*
self
.
num_image_token
image_repl
=
self
.
get_image_repl_full
(
feature_size
,
num_patches
)
text
=
[
t
.
replace
(
'<image>'
,
image_repl
,
1
)
for
t
in
text
]
image_repl
=
self
.
get_image_repl
(
feature_size
,
num_patches
)
feature_tokens
=
tokenizer
.
encode
(
image_repl
.
features
,
add_special_tokens
=
False
)
text
=
[
t
.
replace
(
'<image>'
,
image_repl
.
full
,
1
)
for
t
in
text
]
num_embeds
.
append
(
len
(
feature_tokens
))
embed_is_patch
.
append
(
torch
.
tensor
(
feature_tokens
)
==
image_token_id
)
image_inputs
[
"num_embeds"
]
=
torch
.
tensor
(
num_embeds
)
image_inputs
[
"embed_is_patch"
]
=
embed_is_patch
text_inputs
=
self
.
tokenizer
(
text
)
return
BatchFeature
(
{
**
text_inputs
,
**
image_inputs
,
},
tensor_type
=
return_tensors
,
)
return
{
**
BatchEncoding
(
text_inputs
,
tensor_type
=
return_tensors
),
**
image_inputs
,
}
class
InternVLProcessor
(
BaseInternVLProcessor
):
...
...
@@ -443,20 +459,15 @@ class InternVLProcessor(BaseInternVLProcessor):
def
image_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
get_vocab
()[
IMG_CONTEXT
]
def
get_image_repl
_features
(
def
get_image_repl
(
self
,
feature_size
:
int
,
num_patches
:
Optional
[
int
],
)
->
str
:
return
IMG_CONTEXT
*
feature_size
)
->
PromptUpdateDetails
[
str
]:
repl_features
=
IMG_CONTEXT
*
feature_size
repl_full
=
IMG_START
+
repl_features
+
IMG_END
def
get_image_repl_full
(
self
,
feature_size
:
int
,
num_patches
:
Optional
[
int
],
)
->
str
:
features
=
self
.
get_image_repl_features
(
feature_size
,
num_patches
)
return
IMG_START
+
features
+
IMG_END
return
PromptUpdateDetails
(
full
=
repl_full
,
features
=
repl_features
)
class
BaseInternVLProcessingInfo
(
BaseProcessingInfo
):
...
...
@@ -566,16 +577,15 @@ class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
prompt
:
str
,
mm_data
:
Mapping
[
str
,
object
],
mm_kwargs
:
Mapping
[
str
,
object
],
)
->
BatchFeature
:
)
->
Mapping
[
str
,
NestedTensors
]
:
processed_outputs
=
super
().
_call_hf_processor
(
prompt
=
prompt
,
mm_data
=
mm_data
,
mm_kwargs
=
mm_kwargs
,
)
image_token_id
=
self
.
info
.
get_hf_processor
(
**
mm_kwargs
).
image_token_id
image_data
=
mm_data
.
get
(
"images"
,
[])
assert
isinstance
(
image_data
,
list
)
hf_processor
=
self
.
info
.
get_hf_processor
(
**
mm_kwargs
)
image_token_id
=
hf_processor
.
image_token_id
# Since there may be extra tokens in the feature placeholders,
# we need to pass the image token ID to the model to select the
...
...
@@ -586,7 +596,7 @@ class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
def
_get_mm_fields_config
(
self
,
hf_inputs
:
BatchFeature
,
hf_inputs
:
Mapping
[
str
,
NestedTensors
]
,
hf_processor_mm_kwargs
:
Mapping
[
str
,
object
],
)
->
Mapping
[
str
,
MultiModalFieldConfig
]:
image_num_patches
=
hf_inputs
.
get
(
"image_num_patches"
,
torch
.
empty
(
0
))
...
...
@@ -596,6 +606,8 @@ class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
pixel_values_flat
=
MultiModalFieldConfig
.
flat_from_sizes
(
"image"
,
image_num_patches
),
image_num_patches
=
MultiModalFieldConfig
.
batched
(
"image"
),
embed_is_patch
=
MultiModalFieldConfig
.
batched
(
"image"
),
num_embeds
=
MultiModalFieldConfig
.
batched
(
"image"
),
image_embeds
=
MultiModalFieldConfig
.
batched
(
"image"
),
image_token_id
=
MultiModalFieldConfig
.
shared
(
"image"
,
num_images
),
)
...
...
@@ -637,12 +649,7 @@ class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
if
num_patches
is
not
None
:
assert
isinstance
(
num_patches
,
int
)
return
PromptUpdateDetails
(
full
=
hf_processor
.
get_image_repl_full
(
feature_size
,
num_patches
),
features
=
hf_processor
.
get_image_repl_features
(
feature_size
,
num_patches
),
)
return
hf_processor
.
get_image_repl
(
feature_size
,
num_patches
)
return
[
PromptReplacement
(
...
...
@@ -832,6 +839,8 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
self
,
**
kwargs
:
object
)
->
Optional
[
InternVLImageInputs
]:
pixel_values_flat
=
kwargs
.
pop
(
"pixel_values_flat"
,
None
)
image_num_patches
=
kwargs
.
pop
(
"image_num_patches"
,
None
)
embed_is_patch
=
kwargs
.
pop
(
"embed_is_patch"
,
None
)
num_embeds
=
kwargs
.
pop
(
"num_embeds"
,
None
)
image_embeds
=
kwargs
.
pop
(
"image_embeds"
,
None
)
if
pixel_values_flat
is
None
and
image_embeds
is
None
:
...
...
@@ -858,35 +867,47 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
if
not
isinstance
(
image_num_patches
,
(
torch
.
Tensor
,
list
)):
raise
ValueError
(
"Incorrect type of image_num_patches. "
f
"Got type:
{
type
(
pixel_values_flat
)
}
"
)
f
"Got type:
{
type
(
image_num_patches
)
}
"
)
if
not
isinstance
(
embed_is_patch
,
(
torch
.
Tensor
,
list
)):
raise
ValueError
(
"Incorrect type of embed_is_patch. "
f
"Got type:
{
type
(
embed_is_patch
)
}
"
)
if
not
isinstance
(
num_embeds
,
(
torch
.
Tensor
,
list
)):
raise
ValueError
(
"Incorrect type of num_embeds. "
f
"Got type:
{
type
(
num_embeds
)
}
"
)
pixel_values_flat
=
flatten_bn
(
pixel_values_flat
,
concat
=
True
)
image_num_patches
=
flatten_bn
(
image_num_patches
,
concat
=
True
)
return
InternVLImagePixelInputs
(
type
=
"pixel_values"
,
data
=
self
.
_validate_pixel_values
(
flatten_bn
(
pixel_values_flat
,
concat
=
True
)),
patches_per_image
=
flatten_bn
(
image_num_patches
,
concat
=
True
).
tolist
())
pixel_values_flat
=
self
.
_validate_pixel_values
(
pixel_values_flat
),
num_patches
=
image_num_patches
,
embed_is_patch
=
embed_is_patch
,
num_embeds
=
num_embeds
,
)
raise
AssertionError
(
"This line should be unreachable."
)
def
_process_image_input
(
self
,
image_input
:
InternVLImageInputs
,
)
->
tuple
[
torch
.
Tensor
,
...]:
)
->
Union
[
torch
.
Tensor
,
tuple
[
torch
.
Tensor
,
...]
]
:
if
image_input
[
"type"
]
==
"image_embeds"
:
return
image_input
[
"data"
]
assert
self
.
vision_model
is
not
None
image_embeds
=
self
.
extract_feature
(
image_input
[
"
d
at
a
"
])
image_embeds
=
self
.
extract_feature
(
image_input
[
"
pixel_values_fl
at"
])
patches
_per_image
=
image_input
[
"patches
_per_image
"
]
num_
patches
=
image_input
[
"
num_
patches"
]
# Only one image in the current batch
if
len
(
patches
_per_image
)
==
1
:
image_embeds
=
image_embeds
.
view
(
if
len
(
num_
patches
)
==
1
:
return
image_embeds
.
view
(
-
1
,
self
.
config
.
text_config
.
hidden_size
).
unsqueeze
(
0
)
return
image_embeds
# NOTE: Image embeddings are split into separate tensors for each image
# by the size of each embedding.
...
...
@@ -894,10 +915,9 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
image_embeds
=
image_embeds
.
view
(
-
1
,
self
.
config
.
text_config
.
hidden_size
)
image_feature_sizes
=
[
num_patches
*
feature_size
for
num_patches
in
patches
_per_image
num_patches
*
feature_size
for
num_patches
in
num_
patches
]
image_embeds
=
image_embeds
.
split
(
image_feature_sizes
)
return
image_embeds
return
image_embeds
.
split
(
image_feature_sizes
)
def
_set_visual_token_mask
(
self
,
input_ids
:
torch
.
Tensor
)
->
None
:
if
self
.
is_mono
:
...
...
@@ -911,8 +931,19 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
image_input
=
self
.
_parse_and_validate_image_input
(
**
kwargs
)
if
image_input
is
None
:
return
None
vision_embeddings
=
self
.
_process_image_input
(
image_input
)
return
vision_embeddings
image_features
=
self
.
_process_image_input
(
image_input
)
if
(
kwargs
.
get
(
"v0_path"
,
False
)
or
image_input
[
"type"
]
!=
"pixel_values"
):
return
image_features
return
flatten_2d_lists
(
scatter_patch_features
(
*
args
)
for
args
in
zip
(
image_features
,
image_input
[
"num_embeds"
],
image_input
[
"embed_is_patch"
],
))
def
get_input_embeddings
(
self
,
...
...
@@ -924,8 +955,11 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
assert
self
.
img_context_token_id
is
not
None
self
.
_set_visual_token_mask
(
input_ids
)
inputs_embeds
=
merge_multimodal_embeddings
(
input_ids
,
inputs_embeds
,
multimodal_embeddings
,
self
.
img_context_token_id
)
input_ids
,
inputs_embeds
,
select_patch_features
(
multimodal_embeddings
),
self
.
img_context_token_id
,
)
return
inputs_embeds
def
forward
(
...
...
@@ -944,6 +978,7 @@ class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP):
# NOTE: In v1, inputs_embeds is always generated at model runner, this
# condition is for v0 compatibility.
elif
inputs_embeds
is
None
:
kwargs
.
update
({
"v0_path"
:
True
})
vision_embeddings
=
self
.
get_multimodal_embeddings
(
**
kwargs
)
inputs_embeds
=
self
.
get_input_embeddings
(
input_ids
,
vision_embeddings
)
...
...
vllm/model_executor/models/llava.py
View file @
53076d70
...
...
@@ -233,7 +233,13 @@ class LlavaDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
class
LlavaProcessingInfo
(
BaseLlavaProcessingInfo
):
def
get_hf_processor
(
self
,
**
kwargs
:
object
):
return
self
.
ctx
.
get_hf_processor
(
LlavaProcessor
,
**
kwargs
)
hf_processor
=
self
.
ctx
.
get_hf_processor
(
LlavaProcessor
,
**
kwargs
)
# In case patch_size is omitted from `processor_config.json`
# e.g. for E5-V: https://huggingface.co/royokong/e5-v
if
hf_processor
.
patch_size
is
None
:
patch_size
=
self
.
get_vision_encoder_info
().
get_patch_size
()
hf_processor
.
patch_size
=
patch_size
return
hf_processor
class
BaseLlavaMultiModalProcessor
(
BaseMultiModalProcessor
[
_I
]):
...
...
vllm/model_executor/models/llava_onevision.py
View file @
53076d70
...
...
@@ -25,7 +25,6 @@ from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
from
vllm.multimodal.processing
import
PromptReplacement
,
PromptUpdate
from
vllm.multimodal.profiling
import
ProcessorInputs
from
vllm.sequence
import
IntermediateTensors
from
vllm.utils
import
is_list_of
from
.clip
import
CLIPVisionModel
from
.interfaces
import
MultiModalEmbeddings
,
SupportsMultiModal
,
SupportsPP
...
...
@@ -44,7 +43,7 @@ class LlavaOnevisionVideoPixelInputs(TypedDict):
type
:
Literal
[
"pixel_values_videos"
]
pixel_values_videos
:
Union
[
torch
.
Tensor
,
list
[
torch
.
Tensor
]]
"""
Shape: `(batch_size
,
num_videos, num_frames, num_channels, height, width)`
Shape: `(batch_size
*
num_videos, num_frames, num_channels, height, width)`
Note that `num_videos` may be different for each batch, and 'num_frames'
may be different for each video, in which case the data is passed as a
...
...
@@ -580,7 +579,7 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
return
LlavaOnevisionVideoPixelInputs
(
type
=
"pixel_values_videos"
,
pixel_values_videos
=
pixel_values_videos
,
pixel_values_videos
=
flatten_bn
(
pixel_values_videos
)
,
)
def
_parse_and_validate_multimodal_inputs
(
self
,
**
kwargs
:
object
)
->
dict
:
...
...
@@ -768,22 +767,6 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
for
i
,
patch_features_batch
in
enumerate
(
patch_embeddings
)
]
def
_add_image_newline
(
self
,
video_features
:
torch
.
Tensor
,
videos
:
int
=
1
,
frames
:
int
=
1
,
strategy
:
str
=
"one_token"
,
)
->
torch
.
Tensor
:
if
strategy
==
"one_token"
:
video_features
=
video_features
.
reshape
(
videos
,
frames
*
video_features
.
shape
[
1
],
-
1
)
image_newline
=
self
.
image_newline
[
None
,
None
,
:].
repeat
(
videos
,
1
,
1
).
to
(
video_features
.
device
)
video_features
=
torch
.
cat
((
video_features
,
image_newline
),
dim
=
1
)
return
video_features
raise
ValueError
(
f
"Unexpected video newline strategy:
{
strategy
}
"
)
def
_video_pixels_to_features
(
self
,
vision_tower
:
Union
[
CLIPVisionModel
,
SiglipVisionModel
],
...
...
@@ -807,33 +790,43 @@ class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal,
video_pixels
=
inputs
[
"pixel_values_videos"
]
if
isinstance
(
video_pixels
,
torch
.
Tensor
):
b
,
num_videos
,
frames
,
c
,
h
,
w
=
video_pixels
.
shape
pixel_values
=
video_pixels
.
view
(
b
*
num_videos
*
frames
,
c
,
h
,
w
)
stacked_embeddings
=
self
.
_video_pixels_to_features
(
self
.
vision_tower
,
pixel_values
)
stacked_embeddings
=
self
.
_add_image_newline
(
stacked_embeddings
,
videos
=
b
*
num_videos
,
frames
=
frames
,
strategy
=
"one_token"
)
return
stacked_embeddings
elif
is_list_of
(
video_pixels
,
torch
.
Tensor
):
stacked_embeddings
=
[]
for
video_pixel
in
video_pixels
:
num_videos
,
frames
,
c
,
h
,
w
=
video_pixel
.
shape
pixel_values
=
video_pixel
.
view
(
num_videos
*
frames
,
c
,
h
,
w
)
embeddings
=
self
.
_video_pixels_to_features
(
self
.
vision_tower
,
pixel_values
)
embeddings
=
self
.
_add_image_newline
(
embeddings
,
videos
=
num_videos
,
frames
=
frames
,
strategy
=
"one_token"
)
stacked_embeddings
.
append
(
embeddings
)
return
stacked_embeddings
else
:
raise
ValueError
(
f
"Unsupported type of video input
{
type
(
video_pixels
)
}
"
)
total_videos
,
frames
,
c
,
h
,
w
=
video_pixels
.
shape
video_pixels_flat
=
video_pixels
.
view
(
total_videos
*
frames
,
c
,
h
,
w
)
embeddings_flat
=
self
.
_video_pixels_to_features
(
self
.
vision_tower
,
video_pixels_flat
)
embeddings_flat
=
embeddings_flat
.
reshape
(
total_videos
,
frames
*
embeddings_flat
.
shape
[
1
],
-
1
)
image_newline
=
self
.
image_newline
[
None
,
None
,
:].
expand
(
total_videos
,
-
1
,
-
1
)
return
torch
.
cat
((
embeddings_flat
,
image_newline
),
dim
=
1
)
frames_per_video
=
[
len
(
video
)
for
video
in
video_pixels
]
video_pixels_flat
=
torch
.
cat
(
video_pixels
)
embeddings_flat
=
self
.
_video_pixels_to_features
(
self
.
vision_tower
,
video_pixels_flat
)
image_newline
=
self
.
image_newline
[
None
,
None
,
:]
return
[
torch
.
cat
(
(
embeds
.
reshape
(
1
,
num_frame
*
embeddings_flat
.
shape
[
1
],
-
1
),
image_newline
,
),
dim
=
1
,
)
for
num_frame
,
embeds
in
zip
(
frames_per_video
,
torch
.
split
(
embeddings_flat
,
frames_per_video
),
)
]
def
apply_pooling
(
self
,
image_features
,
stride
=
2
):
def
apply_pooling
(
self
,
image_features
:
torch
.
Tensor
,
stride
:
int
=
2
):
vision_config
=
self
.
config
.
vision_config
height
=
width
=
vision_config
.
image_size
//
vision_config
.
patch_size
batch_frames
,
_
,
dim
=
image_features
.
shape
...
...
vllm/model_executor/models/mllama.py
View file @
53076d70
...
...
@@ -1368,7 +1368,7 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal,
full_text_row_masked_out_mask
=
(
attn_metadata
.
encoder_seq_lens_tensor
!=
0
).
reshape
(
-
1
,
1
).
to
(
input_ids
.
device
)
skip_cross_attention
=
max
(
attn_metadata
.
encoder_seq_len
s
)
==
0
skip_cross_attention
=
attn_metadata
.
max_
encoder_seq_len
==
0
# For image-present prefill.
else
:
...
...
vllm/model_executor/models/nvlm_d.py
View file @
53076d70
...
...
@@ -36,11 +36,11 @@ class NVLMProcessor(BaseInternVLProcessor):
def
image_token_id
(
self
)
->
int
:
return
self
.
tokenizer
.
get_vocab
()[
IMG_PAD
]
def
get_image_repl
_features
(
def
get_image_repl
(
self
,
feature_size
:
int
,
num_patches
:
Optional
[
int
],
)
->
str
:
)
->
PromptUpdateDetails
[
str
]
:
if
num_patches
is
None
:
raise
NotImplementedError
(
"Embedding inputs are not supported"
)
...
...
@@ -55,14 +55,9 @@ class NVLMProcessor(BaseInternVLProcessor):
# We include the start and end as well because "<Image><tile" is
# tokenized as ["<Image", "><", "tile"], resulting in assertion error
# when trying to find "<tile" as a subsequence of "<Image><tile"
re
turn
"<Image>"
+
features
+
"</Image>"
re
pl
=
"<Image>"
+
features
+
"</Image>"
def
get_image_repl_full
(
self
,
feature_size
:
int
,
num_patches
:
Optional
[
int
],
)
->
str
:
return
self
.
get_image_repl_features
(
feature_size
,
num_patches
)
return
PromptUpdateDetails
(
full
=
repl
,
features
=
repl
)
class
NVLMProcessingInfo
(
BaseInternVLProcessingInfo
):
...
...
@@ -180,11 +175,11 @@ class NVLMMultiModalProcessor(InternVLMultiModalProcessor[NVLMProcessingInfo]):
if
num_patches
is
not
None
:
assert
isinstance
(
num_patches
,
int
)
repl
=
hf_processor
.
get_image_repl
(
feature_size
,
num_patches
)
return
PromptUpdateDetails
(
full
=
hf_processor
.
get_image_repl_full
(
feature_size
,
num_patches
)
+
"
\n
"
,
features
=
hf_processor
.
get_image_repl_features
(
feature_size
,
num_patches
)
+
"
\n
"
,
full
=
repl
.
full
+
"
\n
"
,
features
=
repl
.
features
+
"
\n
"
,
)
# See note in dummy data regarding why we have the extra newline
...
...
vllm/model_executor/models/qwen2_5_vl.py
View file @
53076d70
...
...
@@ -608,6 +608,17 @@ class Qwen2_5_VisionTransformer(nn.Module):
window_index
=
torch
.
cat
(
window_index
,
dim
=
0
)
return
window_index
,
cu_window_seqlens
def
compute_attn_mask_seqlen
(
self
,
cu_seqlens
:
torch
.
Tensor
,
)
->
tuple
[
Optional
[
int
],
Optional
[
list
[
int
]]]:
max_seqlen
,
seqlens
=
None
,
None
if
self
.
attn_backend
==
_Backend
.
FLASH_ATTN
:
max_seqlen
=
(
cu_seqlens
[
1
:]
-
cu_seqlens
[:
-
1
]).
max
().
item
()
elif
self
.
attn_backend
==
_Backend
.
XFORMERS
:
seqlens
=
(
cu_seqlens
[
1
:]
-
cu_seqlens
[:
-
1
]).
tolist
()
return
max_seqlen
,
seqlens
def
forward
(
self
,
x
:
torch
.
Tensor
,
...
...
@@ -645,23 +656,27 @@ class Qwen2_5_VisionTransformer(nn.Module):
# transformers
hidden_states
=
hidden_states
.
unsqueeze
(
1
)
max_seqlen
=
None
seqlens
=
None
if
self
.
attn_backend
==
_Backend
.
FLASH_ATTN
:
max_seqlen
=
(
cu_seqlens
[
1
:]
-
cu_seqlens
[:
-
1
]).
max
().
item
()
elif
self
.
attn_backend
==
_Backend
.
XFORMERS
:
seqlens
=
(
cu_seqlens
[
1
:]
-
cu_seqlens
[:
-
1
]).
tolist
()
# pre-compute seqlens for window/full attn to reduce cuMemcpy operations
max_seqlen_full
,
seqlens_full
=
self
.
compute_attn_mask_seqlen
(
cu_seqlens
)
max_seqlen_window
,
seqlens_window
=
self
.
compute_attn_mask_seqlen
(
cu_window_seqlens
)
for
layer_num
,
blk
in
enumerate
(
self
.
blocks
):
if
layer_num
in
self
.
fullatt_block_indexes
:
cu_seqlens_now
=
cu_seqlens
max_seqlen_now
=
max_seqlen_full
seqlens_now
=
seqlens_full
else
:
cu_seqlens_now
=
cu_window_seqlens
max_seqlen_now
=
max_seqlen_window
seqlens_now
=
seqlens_window
hidden_states
=
blk
(
hidden_states
,
cu_seqlens
=
cu_seqlens_now
,
rotary_pos_emb
=
rotary_pos_emb
,
max_seqlen
=
max_seqlen
,
seqlens
=
seqlens
,
max_seqlen
=
max_seqlen
_now
,
seqlens
=
seqlens
_now
,
)
# For Qwen2.5-VL-3B, float16 will overflow at last block
...
...
vllm/model_executor/models/qwen2_vl.py
View file @
53076d70
...
...
@@ -617,6 +617,16 @@ class Qwen2VisionTransformer(nn.Module):
rotary_pos_emb
=
rotary_pos_emb_full
[
pos_ids
].
flatten
(
1
)
return
rotary_pos_emb
def
compute_attn_mask_seqlen
(
self
,
cu_seqlens
:
torch
.
Tensor
)
->
tuple
[
Optional
[
int
],
Optional
[
list
[
int
]]]:
max_seqlen
,
seqlens
=
None
,
None
if
self
.
attn_backend
==
_Backend
.
FLASH_ATTN
:
max_seqlen
=
(
cu_seqlens
[
1
:]
-
cu_seqlens
[:
-
1
]).
max
().
item
()
elif
self
.
attn_backend
==
_Backend
.
XFORMERS
:
seqlens
=
(
cu_seqlens
[
1
:]
-
cu_seqlens
[:
-
1
]).
tolist
()
return
max_seqlen
,
seqlens
def
forward
(
self
,
x
:
torch
.
Tensor
,
...
...
@@ -638,12 +648,8 @@ class Qwen2VisionTransformer(nn.Module):
# transformers
x
=
x
.
unsqueeze
(
1
)
max_seqlen
=
None
seqlens
=
None
if
self
.
attn_backend
==
_Backend
.
FLASH_ATTN
:
max_seqlen
=
(
cu_seqlens
[
1
:]
-
cu_seqlens
[:
-
1
]).
max
().
item
()
elif
self
.
attn_backend
==
_Backend
.
XFORMERS
:
seqlens
=
(
cu_seqlens
[
1
:]
-
cu_seqlens
[:
-
1
]).
tolist
()
# pre-compute seqlens for attn mask to reduce cuMemcpy operations
max_seqlen
,
seqlens
=
self
.
compute_attn_mask_seqlen
(
cu_seqlens
)
for
blk
in
self
.
blocks
:
x
=
blk
(
x
,
...
...
vllm/model_executor/models/registry.py
View file @
53076d70
...
...
@@ -104,6 +104,7 @@ _TEXT_GENERATION_MODELS = {
"Starcoder2ForCausalLM"
:
(
"starcoder2"
,
"Starcoder2ForCausalLM"
),
"SolarForCausalLM"
:
(
"solar"
,
"SolarForCausalLM"
),
"TeleChat2ForCausalLM"
:
(
"telechat2"
,
"TeleChat2ForCausalLM"
),
"TeleFLMForCausalLM"
:
(
"teleflm"
,
"TeleFLMForCausalLM"
),
"XverseForCausalLM"
:
(
"llama"
,
"LlamaForCausalLM"
),
"Zamba2ForCausalLM"
:
(
"zamba2"
,
"Zamba2ForCausalLM"
),
# [Encoder-decoder]
...
...
@@ -418,11 +419,13 @@ class _ModelRegistry:
if
not
architectures
:
logger
.
warning
(
"No model architectures are specified"
)
normalized_arch
=
[]
for
model
in
architectures
:
if
model
not
in
self
.
models
:
model
=
"TransformersModel"
normalized_arch
.
append
(
model
)
# filter out support architectures
normalized_arch
=
list
(
filter
(
lambda
model
:
model
in
self
.
models
,
architectures
))
# make sure Transformers fallback are put at the last
if
len
(
normalized_arch
)
!=
len
(
architectures
):
normalized_arch
.
append
(
"TransformersModel"
)
return
normalized_arch
def
inspect_model_cls
(
...
...
vllm/model_executor/models/teleflm.py
0 → 100644
View file @
53076d70
# SPDX-License-Identifier: Apache-2.0
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
typing
import
Type
import
torch
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.logits_processor
import
LogitsProcessor
from
vllm.model_executor.models.llama
import
(
LlamaDecoderLayer
,
LlamaForCausalLM
,
LlamaModel
)
class
TeleFLMModel
(
LlamaModel
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
,
layer_type
:
Type
[
LlamaDecoderLayer
]
=
LlamaDecoderLayer
,
):
super
().
__init__
(
vllm_config
=
vllm_config
,
prefix
=
prefix
,
layer_type
=
layer_type
)
"""
This implementation is based on the µScaling paper presented at
the ICLR 2025 Workshop:
NanoLM: An Affordable LLM Study Benchmark
\
via Accurate Loss Prediction across Scales
by Yiqun Yao et al.
Available at: https://openreview.net/forum?id=IwaPYg1SCA
arXiv preprint: https://arxiv.org/abs/2304.06875
"""
self
.
use_mup
=
self
.
config
.
use_mup
if
self
.
use_mup
:
self
.
input_mult
=
self
.
config
.
input_mult
def
get_input_embeddings
(
self
,
input_ids
:
torch
.
Tensor
)
->
torch
.
Tensor
:
embedding
=
self
.
embed_tokens
(
input_ids
)
if
self
.
use_mup
:
embedding
=
embedding
*
self
.
input_mult
return
embedding
class
TeleFLMForCausalLM
(
LlamaForCausalLM
):
def
__init__
(
self
,
*
,
vllm_config
:
VllmConfig
,
prefix
:
str
=
""
):
super
().
__init__
(
vllm_config
=
vllm_config
,
prefix
=
prefix
)
# mup
self
.
use_mup
=
self
.
config
.
use_mup
if
self
.
use_mup
:
self
.
mup_scale_factor
=
self
.
config
.
mup_scale_factor
self
.
output_mult
=
self
.
config
.
output_mult
/
self
.
mup_scale_factor
logit_scale
=
self
.
output_mult
self
.
logits_processor
=
LogitsProcessor
(
self
.
unpadded_vocab_size
,
self
.
config
.
vocab_size
,
logit_scale
)
vllm/multimodal/processing.py
View file @
53076d70
...
...
@@ -103,13 +103,13 @@ The token sequence or text to update.
@
dataclass
class
PromptUpdateDetails
:
class
PromptUpdateDetails
(
Generic
[
_S
])
:
"""Details about the token sequence or text that are part of the update."""
full
:
PromptSeq
full
:
_S
"""The full content."""
features
:
PromptSeq
features
:
_S
"""
The part of the content that corresponds to feature placeholders;
this will be replaced by the output of the vision encoder during model
...
...
@@ -117,7 +117,7 @@ class PromptUpdateDetails:
"""
@
staticmethod
def
from_seq
(
seq
:
PromptSeq
)
->
"PromptUpdateDetails"
:
def
from_seq
(
seq
:
_S
)
->
"PromptUpdateDetails
[_S]
"
:
return
PromptUpdateDetails
(
full
=
seq
,
features
=
seq
)
...
...
vllm/outputs.py
View file @
53076d70
...
...
@@ -223,7 +223,12 @@ class RequestOutput:
if
delta
:
# Slice logprobs delta if applicable
if
output_logprobs
:
output_logprobs
=
output_logprobs
[
-
num_output_tokens
:]
# num_output_tokens can be 0 when n > 1 and request finishes
# before the others
if
num_output_tokens
>
0
:
output_logprobs
=
output_logprobs
[
-
num_output_tokens
:]
else
:
output_logprobs
=
None
# Don't include prompt if this is after the first output
# containing decode token ids
if
include_prompt
and
seq
.
get_output_len
()
>
num_output_tokens
:
...
...
vllm/platforms/__init__.py
View file @
53076d70
...
...
@@ -2,7 +2,6 @@
import
logging
import
traceback
from
contextlib
import
suppress
from
itertools
import
chain
from
typing
import
TYPE_CHECKING
,
Optional
...
...
@@ -191,21 +190,6 @@ def neuron_platform_plugin() -> Optional[str]:
return
"vllm.platforms.neuron.NeuronPlatform"
if
is_neuron
else
None
def
openvino_platform_plugin
()
->
Optional
[
str
]:
is_openvino
=
False
logger
.
debug
(
"Checking if OpenVINO platform is available."
)
with
suppress
(
Exception
):
is_openvino
=
vllm_version_matches_substr
(
"openvino"
)
if
is_openvino
:
logger
.
debug
(
"Confirmed OpenVINO platform is available"
" because vLLM is built with OpenVINO."
)
if
not
is_openvino
:
logger
.
debug
(
"OpenVINO platform is not available because"
" vLLM is not built with OpenVINO."
)
return
"vllm.platforms.openvino.OpenVinoPlatform"
if
is_openvino
else
None
builtin_platform_plugins
=
{
'tpu'
:
tpu_platform_plugin
,
'cuda'
:
cuda_platform_plugin
,
...
...
@@ -214,7 +198,6 @@ builtin_platform_plugins = {
'xpu'
:
xpu_platform_plugin
,
'cpu'
:
cpu_platform_plugin
,
'neuron'
:
neuron_platform_plugin
,
'openvino'
:
openvino_platform_plugin
,
}
...
...
vllm/platforms/cuda.py
View file @
53076d70
...
...
@@ -14,6 +14,7 @@ from typing_extensions import ParamSpec
# import custom ops, trigger op registration
import
vllm._C
# noqa
import
vllm.envs
as
envs
from
vllm.fa_utils
import
get_flash_attn_version
from
vllm.logger
import
init_logger
from
vllm.utils
import
import_pynvml
...
...
@@ -212,9 +213,14 @@ class CudaPlatformBase(Platform):
return
(
"vllm.attention.backends."
"flashmla.FlashMLABackend"
)
if
use_v1
:
logger
.
info_once
(
"Using Flash Attention backend on V1 engine."
)
return
(
"vllm.v1.attention.backends.flash_attn."
"FlashAttentionBackend"
)
if
selected_backend
==
_Backend
.
TRITON_ATTN_VLLM_V1
:
logger
.
info_once
(
"Using Triton backend on V1 engine."
)
return
(
"vllm.v1.attention.backends."
"triton_attn.TritonAttentionBackend"
)
if
cls
.
has_device_capability
(
80
):
logger
.
info_once
(
"Using Flash Attention backend on V1 engine."
)
return
(
"vllm.v1.attention.backends."
"flash_attn.FlashAttentionBackend"
)
if
selected_backend
==
_Backend
.
FLASHINFER
:
logger
.
info
(
"Using FlashInfer backend."
)
return
"vllm.attention.backends.flashinfer.FlashInferBackend"
...
...
@@ -240,15 +246,6 @@ class CudaPlatformBase(Platform):
"Cannot use FlashAttention-2 backend for dtype other than "
"torch.float16 or torch.bfloat16."
)
target_backend
=
_Backend
.
XFORMERS
elif
kv_cache_dtype
is
not
None
and
\
kv_cache_dtype
.
startswith
(
"fp8"
):
logger
.
info
(
"Cannot use FlashAttention-2 backend for FP8 KV cache."
)
logger
.
warning
(
"Please use FlashInfer backend with FP8 KV Cache for "
"better performance by setting environment variable "
"VLLM_ATTENTION_BACKEND=FLASHINFER"
)
target_backend
=
_Backend
.
XFORMERS
elif
block_size
%
16
!=
0
:
logger
.
info
(
"Cannot use FlashAttention-2 backend for block size not "
...
...
@@ -270,6 +267,17 @@ class CudaPlatformBase(Platform):
"Cannot use FlashAttention-2 backend for head size %d."
,
head_size
)
target_backend
=
_Backend
.
XFORMERS
fp8_kv_cache
=
(
kv_cache_dtype
is
not
None
and
kv_cache_dtype
.
startswith
(
"fp8"
))
if
(
fp8_kv_cache
and
get_flash_attn_version
()
!=
3
):
logger
.
info
(
"Cannot use FlashAttention-2 backend for FP8 KV cache."
)
logger
.
warning
(
"Please use FlashInfer backend with FP8 KV Cache for "
"better performance by setting environment variable "
"VLLM_ATTENTION_BACKEND=FLASHINFER"
)
target_backend
=
_Backend
.
XFORMERS
except
ImportError
:
logger
.
info
(
"Cannot use FlashAttention-2 backend because the "
...
...
vllm/platforms/interface.py
View file @
53076d70
...
...
@@ -29,10 +29,10 @@ def in_wsl() -> bool:
class
_Backend
(
enum
.
Enum
):
FLASH_ATTN
=
enum
.
auto
()
FLASH_ATTN_VLLM_V1
=
enum
.
auto
()
TRITON_ATTN_VLLM_V1
=
enum
.
auto
()
XFORMERS
=
enum
.
auto
()
ROCM_FLASH
=
enum
.
auto
()
TORCH_SDPA
=
enum
.
auto
()
OPENVINO
=
enum
.
auto
()
FLASHINFER
=
enum
.
auto
()
TRITON_MLA
=
enum
.
auto
()
# Supported by V1
FLASHMLA
=
enum
.
auto
()
# Supported by V1
...
...
@@ -52,7 +52,6 @@ class PlatformEnum(enum.Enum):
XPU
=
enum
.
auto
()
CPU
=
enum
.
auto
()
NEURON
=
enum
.
auto
()
OPENVINO
=
enum
.
auto
()
OOT
=
enum
.
auto
()
UNSPECIFIED
=
enum
.
auto
()
...
...
@@ -112,6 +111,8 @@ class Platform:
supported_quantization
:
list
[
str
]
=
[]
additional_env_vars
:
list
[
str
]
=
[]
def
is_cuda
(
self
)
->
bool
:
return
self
.
_enum
==
PlatformEnum
.
CUDA
...
...
@@ -133,9 +134,6 @@ class Platform:
def
is_neuron
(
self
)
->
bool
:
return
self
.
_enum
==
PlatformEnum
.
NEURON
def
is_openvino
(
self
)
->
bool
:
return
self
.
_enum
==
PlatformEnum
.
OPENVINO
def
is_out_of_tree
(
self
)
->
bool
:
return
self
.
_enum
==
PlatformEnum
.
OOT
...
...
vllm/platforms/openvino.py
deleted
100644 → 0
View file @
322a0be6
# SPDX-License-Identifier: Apache-2.0
from
typing
import
TYPE_CHECKING
,
Optional
import
torch
import
vllm.envs
as
envs
from
vllm.logger
import
init_logger
from
.interface
import
Platform
,
PlatformEnum
,
_Backend
if
TYPE_CHECKING
:
from
vllm.config
import
VllmConfig
else
:
VllmConfig
=
None
logger
=
init_logger
(
__name__
)
try
:
import
openvino
as
ov
import
openvino.properties.hint
as
hints
except
ImportError
as
e
:
logger
.
warning
(
"Failed to import OpenVINO with %r"
,
e
)
class
OpenVinoPlatform
(
Platform
):
_enum
=
PlatformEnum
.
OPENVINO
device_name
:
str
=
"openvino"
device_type
:
str
=
"openvino"
dispatch_key
:
str
=
"CPU"
@
classmethod
def
get_attn_backend_cls
(
cls
,
selected_backend
:
_Backend
,
head_size
:
int
,
dtype
:
torch
.
dtype
,
kv_cache_dtype
:
Optional
[
str
],
block_size
:
int
,
use_v1
:
bool
,
use_mla
:
bool
)
->
str
:
if
selected_backend
!=
_Backend
.
OPENVINO
:
logger
.
info
(
"Cannot use %s backend on OpenVINO."
,
selected_backend
)
logger
.
info
(
"Using OpenVINO Attention backend."
)
return
"vllm.attention.backends.openvino.OpenVINOAttentionBackend"
@
classmethod
def
get_device_name
(
cls
,
device_id
:
int
=
0
)
->
str
:
return
"openvino"
@
classmethod
def
is_async_output_supported
(
cls
,
enforce_eager
:
Optional
[
bool
])
->
bool
:
return
False
@
classmethod
def
inference_mode
(
cls
):
return
torch
.
inference_mode
(
mode
=
True
)
@
classmethod
def
is_openvino_cpu
(
cls
)
->
bool
:
return
"CPU"
in
envs
.
VLLM_OPENVINO_DEVICE
@
classmethod
def
is_openvino_gpu
(
cls
)
->
bool
:
return
"GPU"
in
envs
.
VLLM_OPENVINO_DEVICE
@
classmethod
def
is_pin_memory_available
(
cls
)
->
bool
:
logger
.
warning
(
"Pin memory is not supported on OpenViNO."
)
return
False
@
classmethod
def
check_and_update_config
(
cls
,
vllm_config
:
VllmConfig
)
->
None
:
from
vllm.utils
import
GiB_bytes
parallel_config
=
vllm_config
.
parallel_config
assert
(
parallel_config
.
world_size
==
1
),
"OpenVINO only supports single CPU socket currently."
if
parallel_config
.
worker_cls
==
"auto"
:
parallel_config
.
worker_cls
=
\
"vllm.worker.openvino_worker.OpenVINOWorker"
# check and update model config
model_config
=
vllm_config
.
model_config
if
model_config
.
dtype
!=
torch
.
float32
:
logger
.
warning
(
f
"Only float32 dtype is supported on OpenVINO, casting from
{
model_config
.
dtype
}
."
# noqa: G004, E501
)
model_config
.
dtype
=
torch
.
float32
if
not
model_config
.
enforce_eager
:
logger
.
warning
(
"CUDA graph is not supported on OpenVINO backend, fallback to "
"the eager mode."
)
model_config
.
enforce_eager
=
True
# check and update cache config
ov_core
=
ov
.
Core
()
cache_config
=
vllm_config
.
cache_config
if
cache_config
and
cache_config
.
block_size
is
None
:
cache_config
.
block_size
=
16
if
envs
.
VLLM_OPENVINO_CPU_KV_CACHE_PRECISION
==
"u8"
:
if
not
OpenVinoPlatform
.
is_openvino_cpu
():
logger
.
info
(
"VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is "
"ignored for GPU, f16 data type will be used."
)
cache_config
.
cache_dtype
=
ov
.
Type
.
f16
else
:
logger
.
info
(
"KV cache type is overridden to u8 via "
"VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var."
)
cache_config
.
cache_dtype
=
ov
.
Type
.
u8
else
:
if
OpenVinoPlatform
.
is_openvino_cpu
():
ov_device
=
envs
.
VLLM_OPENVINO_DEVICE
inference_precision
=
ov_core
.
get_property
(
ov_device
,
hints
.
inference_precision
)
if
inference_precision
==
ov
.
Type
.
bf16
:
cache_config
.
cache_dtype
=
ov
.
Type
.
bf16
else
:
cache_config
.
cache_dtype
=
ov
.
Type
.
f16
else
:
cache_config
.
cache_dtype
=
ov
.
Type
.
f16
if
OpenVinoPlatform
.
is_openvino_cpu
():
if
cache_config
.
block_size
!=
32
:
logger
.
info
(
f
"OpenVINO CPU optimal block size is 32, overriding currently set
{
cache_config
.
block_size
}
"
# noqa: G004, E501
)
cache_config
.
block_size
=
32
else
:
if
cache_config
.
block_size
!=
16
:
logger
.
info
(
f
"OpenVINO GPU optimal block size is 16, overriding currently set
{
cache_config
.
block_size
}
"
# noqa: G004, E501
)
cache_config
.
block_size
=
16
kv_cache_space
=
envs
.
VLLM_OPENVINO_KVCACHE_SPACE
if
kv_cache_space
>=
0
:
if
kv_cache_space
==
0
and
OpenVinoPlatform
.
is_openvino_cpu
():
cache_config
.
openvino_kvcache_space_bytes
=
4
*
GiB_bytes
# type: ignore
logger
.
warning
(
"Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
"for OpenVINO backend is not set, using 4 by default."
)
else
:
cache_config
.
openvino_kvcache_space_bytes
=
(
# type: ignore
kv_cache_space
*
GiB_bytes
)
else
:
raise
RuntimeError
(
"Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
f
"
{
kv_cache_space
}
, expect a positive integer value."
)
assert
vllm_config
.
device_config
.
device_type
==
"openvino"
assert
vllm_config
.
lora_config
is
None
,
\
"OpenVINO backend doesn't support LoRA"
assert
cls
.
is_openvino_cpu
()
or
\
cls
.
is_openvino_gpu
(),
\
"OpenVINO backend supports only CPU and GPU devices"
vllm/platforms/rocm.py
View file @
53076d70
...
...
@@ -120,8 +120,9 @@ class RocmPlatform(Platform):
selected_backend
=
(
_Backend
.
ROCM_FLASH
if
selected_backend
==
_Backend
.
FLASH_ATTN
else
selected_backend
)
if
envs
.
VLLM_USE_V1
:
logger
.
info
(
"Using ROCm Attention backend on V1 engine."
)
return
"vllm.v1.attention.backends.rocm_attn.ROCmAttentionBackend"
logger
.
info
(
"Using Triton Attention backend on V1 engine."
)
return
(
"vllm.v1.attention.backends."
"triton_attn.TritonAttentionBackend"
)
if
selected_backend
==
_Backend
.
ROCM_FLASH
:
if
not
cls
.
has_device_capability
(
90
):
# not Instinct series GPUs.
...
...
vllm/platforms/tpu.py
View file @
53076d70
...
...
@@ -29,6 +29,10 @@ class TpuPlatform(Platform):
"tpu_int8"
,
"compressed-tensors"
,
"compressed_tensors"
]
additional_env_vars
:
list
[
str
]
=
[
"TPU_CHIPS_PER_HOST_BOUNDS"
,
"TPU_HOST_BOUNDS"
]
@
classmethod
def
get_attn_backend_cls
(
cls
,
selected_backend
:
_Backend
,
head_size
:
int
,
dtype
:
torch
.
dtype
,
kv_cache_dtype
:
Optional
[
str
],
...
...
vllm/spec_decode/spec_decode_worker.py
View file @
53076d70
...
...
@@ -92,22 +92,20 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
# Override draft-model specific worker args.
draft_worker_kwargs
.
update
(
vllm_config
=
draft_worker_config
,
ngram_prompt_lookup_max
=
speculative_config
.
ngram_
prompt_lookup_max
,
ngram_prompt_lookup_min
=
speculative_config
.
ngram_
prompt_lookup_min
,
ngram_prompt_lookup_max
=
speculative_config
.
prompt_lookup_max
,
ngram_prompt_lookup_min
=
speculative_config
.
prompt_lookup_min
,
)
spec_decode_worker
=
SpecDecodeWorker
.
create_worker
(
scorer_worker
=
target_worker
,
draft_worker_kwargs
=
draft_worker_kwargs
,
disable_mqa_scorer
=
speculative_config
.
speculative_disable_mqa_scorer
,
disable_by_batch_size
=
speculative_config
.
speculative_disable_by_batch_size
,
draft_token_acceptance_method
=
speculative_config
.
draft_token_acceptance_method
,
disable_mqa_scorer
=
speculative_config
.
disable_mqa_scorer
,
disable_by_batch_size
=
speculative_config
.
disable_by_batch_size
,
draft_token_acceptance_method
=
speculative_config
.
acceptance_method
,
typical_acceptance_sampler_posterior_threshold
=
speculative_config
.
typical_acceptance_sampler_
posterior_threshold
,
posterior_threshold
,
typical_acceptance_sampler_posterior_alpha
=
speculative_config
.
typical_acceptance_sampler_
posterior_alpha
,
posterior_alpha
,
disable_logprobs
=
speculative_config
.
disable_logprobs
,
disable_log_stats
=
speculative_config
.
disable_log_stats
,
num_speculative_tokens
=
speculative_config
.
num_speculative_tokens
,
...
...
Prev
1
…
5
6
7
8
9
10
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment