Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d76fc11e
Commit
d76fc11e
authored
Jan 28, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.15.0rc1' into v0.15.0rc1-dev
parents
38166ec4
58996f35
Changes
313
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
363 additions
and
24 deletions
+363
-24
vllm/model_executor/models/starcoder2.py
vllm/model_executor/models/starcoder2.py
+2
-2
vllm/model_executor/models/step3_text.py
vllm/model_executor/models/step3_text.py
+2
-2
vllm/model_executor/models/step3_vl.py
vllm/model_executor/models/step3_vl.py
+1
-1
vllm/model_executor/models/tarsier.py
vllm/model_executor/models/tarsier.py
+1
-1
vllm/model_executor/models/transformers/base.py
vllm/model_executor/models/transformers/base.py
+2
-2
vllm/model_executor/models/transformers/moe.py
vllm/model_executor/models/transformers/moe.py
+1
-1
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/ultravox.py
+1
-1
vllm/model_executor/models/voxtral.py
vllm/model_executor/models/voxtral.py
+1
-1
vllm/model_executor/models/voxtral_streaming.py
vllm/model_executor/models/voxtral_streaming.py
+1
-1
vllm/model_executor/models/whisper_causal.py
vllm/model_executor/models/whisper_causal.py
+40
-0
vllm/model_executor/models/zamba2.py
vllm/model_executor/models/zamba2.py
+2
-2
vllm/model_executor/warmup/deep_gemm_warmup.py
vllm/model_executor/warmup/deep_gemm_warmup.py
+3
-3
vllm/multimodal/inputs.py
vllm/multimodal/inputs.py
+26
-1
vllm/multimodal/parse.py
vllm/multimodal/parse.py
+19
-0
vllm/multimodal/video.py
vllm/multimodal/video.py
+21
-0
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+2
-1
vllm/plugins/lora_resolvers/filesystem_resolver.py
vllm/plugins/lora_resolvers/filesystem_resolver.py
+13
-3
vllm/plugins/lora_resolvers/hf_hub_resolver.py
vllm/plugins/lora_resolvers/hf_hub_resolver.py
+143
-0
vllm/reasoning/__init__.py
vllm/reasoning/__init__.py
+2
-2
vllm/reasoning/kimi_k2_reasoning_parser.py
vllm/reasoning/kimi_k2_reasoning_parser.py
+80
-0
No files found.
vllm/model_executor/models/starcoder2.py
View file @
d76fc11e
...
@@ -252,7 +252,7 @@ class Starcoder2Model(nn.Module):
...
@@ -252,7 +252,7 @@ class Starcoder2Model(nn.Module):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
,
intermediate_tensors
:
IntermediateTensors
|
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
@@ -336,7 +336,7 @@ class Starcoder2ForCausalLM(nn.Module, SupportsPP):
...
@@ -336,7 +336,7 @@ class Starcoder2ForCausalLM(nn.Module, SupportsPP):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/step3_text.py
View file @
d76fc11e
...
@@ -354,7 +354,7 @@ class Step3TextModel(nn.Module):
...
@@ -354,7 +354,7 @@ class Step3TextModel(nn.Module):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
@@ -419,7 +419,7 @@ class Step3TextForCausalLM(nn.Module, SupportsPP):
...
@@ -419,7 +419,7 @@ class Step3TextForCausalLM(nn.Module, SupportsPP):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/step3_vl.py
View file @
d76fc11e
...
@@ -1101,7 +1101,7 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
...
@@ -1101,7 +1101,7 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/tarsier.py
View file @
d76fc11e
...
@@ -585,7 +585,7 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
...
@@ -585,7 +585,7 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/transformers/base.py
View file @
d76fc11e
...
@@ -350,7 +350,7 @@ class Base(
...
@@ -350,7 +350,7 @@ class Base(
# vLLM does not support encoder-decoder models, so if any encoder layer is
# vLLM does not support encoder-decoder models, so if any encoder layer is
# found in a text only model, we assume the whole model is an encoder model
# found in a text only model, we assume the whole model is an encoder model
if
has_encoder
(
self
.
model
)
and
not
is_multimodal
(
self
.
config
):
if
has_encoder
(
self
.
model
)
and
not
is_multimodal
(
self
.
config
):
self
.
check_version
(
"5.0.0
.dev0
"
,
"encoder models support"
)
self
.
check_version
(
"5.0.0"
,
"encoder models support"
)
attn_type
=
AttentionType
.
ENCODER_ONLY
attn_type
=
AttentionType
.
ENCODER_ONLY
else
:
else
:
attn_type
=
AttentionType
.
DECODER
attn_type
=
AttentionType
.
DECODER
...
@@ -502,7 +502,7 @@ class Base(
...
@@ -502,7 +502,7 @@ class Base(
)
)
def
set_aux_hidden_state_layers
(
self
,
layers
:
tuple
[
int
,
...])
->
None
:
def
set_aux_hidden_state_layers
(
self
,
layers
:
tuple
[
int
,
...])
->
None
:
self
.
check_version
(
"5.0.0
.dev0
"
,
"Eagle3 support"
)
self
.
check_version
(
"5.0.0"
,
"Eagle3 support"
)
from
transformers.utils.generic
import
OutputRecorder
from
transformers.utils.generic
import
OutputRecorder
# The default value in PreTrainedModel is None
# The default value in PreTrainedModel is None
...
...
vllm/model_executor/models/transformers/moe.py
View file @
d76fc11e
...
@@ -118,7 +118,7 @@ direct_register_custom_op(
...
@@ -118,7 +118,7 @@ direct_register_custom_op(
class
MoEMixin
(
MixtureOfExperts
):
class
MoEMixin
(
MixtureOfExperts
):
def
__init__
(
self
,
*
,
vllm_config
:
"VllmConfig"
,
prefix
:
str
=
""
):
def
__init__
(
self
,
*
,
vllm_config
:
"VllmConfig"
,
prefix
:
str
=
""
):
self
.
check_version
(
"5.0.0
.dev0
"
,
"MoE models support"
)
self
.
check_version
(
"5.0.0"
,
"MoE models support"
)
# Skip MixtureOfExperts.__init__ and call the next class in MRO
# Skip MixtureOfExperts.__init__ and call the next class in MRO
super
(
MixtureOfExperts
,
self
).
__init__
(
vllm_config
=
vllm_config
,
prefix
=
prefix
)
super
(
MixtureOfExperts
,
self
).
__init__
(
vllm_config
=
vllm_config
,
prefix
=
prefix
)
...
...
vllm/model_executor/models/ultravox.py
View file @
d76fc11e
...
@@ -714,7 +714,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
...
@@ -714,7 +714,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
torch
.
Tensor
|
None
=
None
,
intermediate_tensors
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/voxtral.py
View file @
d76fc11e
...
@@ -397,7 +397,7 @@ class VoxtralForConditionalGeneration(
...
@@ -397,7 +397,7 @@ class VoxtralForConditionalGeneration(
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/voxtral_streaming.py
View file @
d76fc11e
...
@@ -173,7 +173,7 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration):
...
@@ -173,7 +173,7 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/whisper_causal.py
View file @
d76fc11e
...
@@ -105,6 +105,7 @@ def create_whisper_attention_backend_with_block_pooling(
...
@@ -105,6 +105,7 @@ def create_whisper_attention_backend_with_block_pooling(
)
->
type
[
AttentionBackend
]:
)
->
type
[
AttentionBackend
]:
prefix
=
"WhisperCausalAttentionWithBlockPooling_"
prefix
=
"WhisperCausalAttentionWithBlockPooling_"
underlying_builder
=
underlying_attn_backend
.
get_builder_cls
()
underlying_builder
=
underlying_attn_backend
.
get_builder_cls
()
underlying_impl
=
underlying_attn_backend
.
get_impl_cls
()
class
WhisperCausalAttentionWithBlockPoolingBuilder
(
underlying_builder
):
# type: ignore
class
WhisperCausalAttentionWithBlockPoolingBuilder
(
underlying_builder
):
# type: ignore
def
__init__
(
def
__init__
(
...
@@ -151,6 +152,43 @@ def create_whisper_attention_backend_with_block_pooling(
...
@@ -151,6 +152,43 @@ def create_whisper_attention_backend_with_block_pooling(
common_prefix_len
,
new_common_attn_metadata
,
fast_build
common_prefix_len
,
new_common_attn_metadata
,
fast_build
)
)
# NOTE: We need a custom impl so we can use the transformed slot_mapping
# computed by `WhisperCausalAttentionWithBlockPoolingBuilder` instead of
# the one from `forward_context.slot_mapping` (gpu_model_runner).
# This follows the same pattern as CrossAttentionImpl.
class WhisperCausalAttentionWithBlockPoolingImpl(underlying_impl):  # type: ignore[valid-type,misc]
    """Attention impl that writes the KV cache from the metadata's slot mapping.

    The slot mapping used for the cache write is the one carried by
    ``attn_metadata`` rather than one taken from the forward context; the
    attention computation itself is left to the wrapped implementation.
    """

    def forward(
        self,
        layer: torch.nn.Module,
        query: torch.Tensor,
        key: torch.Tensor,
        value: torch.Tensor,
        kv_cache: torch.Tensor,
        attn_metadata: AttentionMetadata,
        output: torch.Tensor | None = None,
        output_scale: torch.Tensor | None = None,
        output_block_scale: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Optionally update the KV cache, then run the wrapped forward pass.

        The cache write is performed here only when the wrapped backend does
        not already do it inside ``forward`` and metadata is available.
        """
        skip_cache_write = (
            underlying_attn_backend.forward_includes_kv_cache_update
            or attn_metadata is None
        )
        if not skip_cache_write:
            slot_mapping = attn_metadata.slot_mapping
            self.do_kv_cache_update(layer, key, value, kv_cache, slot_mapping)

        # Arguments are forwarded positionally, exactly as they were received.
        return super().forward(
            layer,
            query,
            key,
            value,
            kv_cache,
            attn_metadata,
            output,
            output_scale,
            output_block_scale,
        )
if
not
issubclass
(
underlying_attn_backend
,
FlashAttentionBackend
):
if
not
issubclass
(
underlying_attn_backend
,
FlashAttentionBackend
):
raise
NotImplementedError
(
raise
NotImplementedError
(
f
"
{
underlying_attn_backend
}
is not yet supported."
f
"
{
underlying_attn_backend
}
is not yet supported."
...
@@ -163,6 +201,7 @@ def create_whisper_attention_backend_with_block_pooling(
...
@@ -163,6 +201,7 @@ def create_whisper_attention_backend_with_block_pooling(
attention_backend_cls
=
underlying_attn_backend
,
attention_backend_cls
=
underlying_attn_backend
,
overrides
=
{
overrides
=
{
"get_builder_cls"
:
lambda
:
WhisperCausalAttentionWithBlockPoolingBuilder
,
"get_builder_cls"
:
lambda
:
WhisperCausalAttentionWithBlockPoolingBuilder
,
"get_impl_cls"
:
lambda
:
WhisperCausalAttentionWithBlockPoolingImpl
,
"get_kv_cache_shape"
:
lambda
num_blocks
,
"get_kv_cache_shape"
:
lambda
num_blocks
,
block_size
,
block_size
,
num_kv_heads
,
num_kv_heads
,
...
@@ -175,6 +214,7 @@ def create_whisper_attention_backend_with_block_pooling(
...
@@ -175,6 +214,7 @@ def create_whisper_attention_backend_with_block_pooling(
num_kv_heads
//
block_pool_size
,
num_kv_heads
//
block_pool_size
,
head_size
,
head_size
,
),
# TODO: generalize to other backends
),
# TODO: generalize to other backends
"forward_includes_kv_cache_update"
:
True
,
},
},
)
)
...
...
vllm/model_executor/models/zamba2.py
View file @
d76fc11e
...
@@ -771,7 +771,7 @@ class Zamba2Model(nn.Module):
...
@@ -771,7 +771,7 @@ class Zamba2Model(nn.Module):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
)
->
torch
.
Tensor
|
IntermediateTensors
:
)
->
torch
.
Tensor
|
IntermediateTensors
:
...
@@ -947,7 +947,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsMambaPrefixC
...
@@ -947,7 +947,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsMambaPrefixC
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
**
kwargs
:
Any
,
**
kwargs
:
Any
,
...
...
vllm/model_executor/warmup/deep_gemm_warmup.py
View file @
d76fc11e
...
@@ -14,7 +14,6 @@ from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank
...
@@ -14,7 +14,6 @@ from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank
from
vllm.model_executor.layers.fused_moe.deep_gemm_moe
import
DeepGemmExperts
from
vllm.model_executor.layers.fused_moe.deep_gemm_moe
import
DeepGemmExperts
from
vllm.model_executor.layers.fused_moe.deep_gemm_utils
import
compute_aligned_M
from
vllm.model_executor.layers.fused_moe.deep_gemm_utils
import
compute_aligned_M
from
vllm.model_executor.layers.fused_moe.layer
import
FusedMoE
,
FusedMoEModularMethod
from
vllm.model_executor.layers.fused_moe.layer
import
FusedMoE
,
FusedMoEModularMethod
from
vllm.model_executor.layers.fused_moe.modular_kernel
import
FusedMoEModularKernel
from
vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe
import
(
from
vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe
import
(
TritonOrDeepGemmExperts
,
TritonOrDeepGemmExperts
,
)
)
...
@@ -169,9 +168,10 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
...
@@ -169,9 +168,10 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
# modular kernels could invoke deep_gemm_moe_fp8
# modular kernels could invoke deep_gemm_moe_fp8
return
True
return
True
mk
:
FusedMoEModularKernel
=
module
.
quant_method
.
fused_experts
# Further check if the ModularKernel implementation uses the DeepGemmExperts
# Further check if the ModularKernel implementation uses the DeepGemmExperts
return
isinstance
(
mk
.
fused_experts
,
(
DeepGemmExperts
,
TritonOrDeepGemmExperts
))
return
isinstance
(
module
.
quant_method
.
moe_mk
,
(
DeepGemmExperts
,
TritonOrDeepGemmExperts
)
)
FP8_GEMM_NT_WARMUP_CACHE
:
set
[
torch
.
Size
]
=
set
()
FP8_GEMM_NT_WARMUP_CACHE
:
set
[
torch
.
Size
]
=
set
()
...
...
vllm/multimodal/inputs.py
View file @
d76fc11e
...
@@ -20,6 +20,7 @@ from typing import (
...
@@ -20,6 +20,7 @@ from typing import (
)
)
import
numpy
as
np
import
numpy
as
np
from
PIL.Image
import
Image
from
typing_extensions
import
NotRequired
,
TypeVar
from
typing_extensions
import
NotRequired
,
TypeVar
from
vllm.utils.collection_utils
import
full_groupby
,
is_list_of
from
vllm.utils.collection_utils
import
full_groupby
,
is_list_of
...
@@ -29,7 +30,6 @@ from vllm.utils.jsontree import json_map_leaves
...
@@ -29,7 +30,6 @@ from vllm.utils.jsontree import json_map_leaves
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
import
torch
import
torch
import
torch.types
import
torch.types
from
PIL.Image
import
Image
from
transformers.feature_extraction_utils
import
BatchFeature
from
transformers.feature_extraction_utils
import
BatchFeature
from
.media
import
MediaWithBytes
from
.media
import
MediaWithBytes
...
@@ -105,6 +105,28 @@ The number of data items allowed per modality is restricted by
...
@@ -105,6 +105,28 @@ The number of data items allowed per modality is restricted by
"""
"""
class VisionChunkImage(TypedDict):
    """A single image packaged as a vision chunk."""

    # Discriminator tag; always the literal "image" for this variant.
    type: Literal["image"]
    # The image payload.
    image: Image
    # NOTE(review): presumably an optional caller-supplied identifier — confirm.
    uuid: str | None


class VisionChunkVideo(TypedDict):
    """A chunk of video frames together with its metadata."""

    # Discriminator tag; always the literal "video_chunk" for this variant.
    type: Literal["video_chunk"]
    # The frames that make up this chunk.
    video_chunk: list[Image]
    # NOTE(review): presumably an optional caller-supplied identifier — confirm.
    uuid: str | None
    # NOTE(review): presumably the text prompt tied to this chunk — confirm.
    prompt: str
    # NOTE(review): presumably the index of the source video — confirm.
    video_idx: int


VisionChunk = VisionChunkImage | VisionChunkVideo
"""A vision chunk is either an image or a video chunk."""
@
final
@
final
class
MultiModalDataBuiltins
(
TypedDict
,
total
=
False
):
class
MultiModalDataBuiltins
(
TypedDict
,
total
=
False
):
"""Type annotations for modality types predefined by vLLM."""
"""Type annotations for modality types predefined by vLLM."""
...
@@ -118,6 +140,9 @@ class MultiModalDataBuiltins(TypedDict, total=False):
...
@@ -118,6 +140,9 @@ class MultiModalDataBuiltins(TypedDict, total=False):
audio
:
ModalityData
[
AudioItem
]
audio
:
ModalityData
[
AudioItem
]
"""The input audio(s)."""
"""The input audio(s)."""
vision_chunk
:
ModalityData
[
VisionChunk
]
"""The input visual atom(s) - unified modality for images and video chunks."""
MultiModalDataDict
:
TypeAlias
=
Mapping
[
str
,
ModalityData
[
Any
]]
MultiModalDataDict
:
TypeAlias
=
Mapping
[
str
,
ModalityData
[
Any
]]
"""
"""
...
...
vllm/multimodal/parse.py
View file @
d76fc11e
...
@@ -384,6 +384,13 @@ class VideoEmbeddingItems(EmbeddingItems):
...
@@ -384,6 +384,13 @@ class VideoEmbeddingItems(EmbeddingItems):
super
().
__init__
(
data
,
"video"
,
expected_hidden_size
)
super
().
__init__
(
data
,
"video"
,
expected_hidden_size
)
class VisionChunkProcessorItems(ProcessorBatchItems[Any]):
    """Batch of vision-chunk items (images and video chunks in one modality)."""

    def __init__(self, data: Sequence[Any]) -> None:
        """Wrap ``data`` under the ``"vision_chunk"`` modality name."""
        modality = "vision_chunk"
        super().__init__(data, modality)
_D
=
TypeVar
(
"_D"
,
bound
=
ModalityDataItems
[
Any
,
Any
])
_D
=
TypeVar
(
"_D"
,
bound
=
ModalityDataItems
[
Any
,
Any
])
...
@@ -652,11 +659,23 @@ class MultiModalDataParser:
...
@@ -652,11 +659,23 @@ class MultiModalDataParser:
return
VideoProcessorItems
(
new_videos
,
metadata
=
metadata_lst
)
return
VideoProcessorItems
(
new_videos
,
metadata
=
metadata_lst
)
def _parse_vision_chunk_data(
    self,
    data: ModalityData[Any],
) -> ModalityDataItems[Any, Any] | None:
    """Turn raw ``vision_chunk`` input into processor items.

    Returns:
        ``None`` when ``data`` is ``None`` or empty, otherwise a
        ``VisionChunkProcessorItems`` wrapping ``data``.

    Raises:
        ValueError: If ``data`` is recognised as embedding data, which this
            modality does not accept.
    """
    # `_is_empty` is only consulted when there is something to inspect.
    has_payload = data is not None and not self._is_empty(data)
    if not has_payload:
        return None

    if self.is_embeddings(data):
        raise ValueError("Do not support embedding data for vision_chunk right now")

    return VisionChunkProcessorItems(data)
def
_get_subparsers
(
self
)
->
Mapping
[
str
,
ModalityDataParser
]:
def
_get_subparsers
(
self
)
->
Mapping
[
str
,
ModalityDataParser
]:
return
{
return
{
"audio"
:
self
.
_parse_audio_data
,
"audio"
:
self
.
_parse_audio_data
,
"image"
:
self
.
_parse_image_data
,
"image"
:
self
.
_parse_image_data
,
"video"
:
self
.
_parse_video_data
,
"video"
:
self
.
_parse_video_data
,
"vision_chunk"
:
self
.
_parse_vision_chunk_data
,
}
}
def
parse_mm_data
(
self
,
mm_data
:
MultiModalDataDict
)
->
MultiModalDataItems
:
def
parse_mm_data
(
self
,
mm_data
:
MultiModalDataDict
)
->
MultiModalDataItems
:
...
...
vllm/multimodal/video.py
View file @
d76fc11e
...
@@ -235,6 +235,27 @@ class VideoLoader:
...
@@ -235,6 +235,27 @@ class VideoLoader:
VIDEO_LOADER_REGISTRY
=
ExtensionManager
()
VIDEO_LOADER_REGISTRY
=
ExtensionManager
()
@VIDEO_LOADER_REGISTRY.register("identity")
class IdentityVideoLoader(VideoLoader):
    """Video loader that hands back the raw bytes untouched.

    No decoding happens here, which leaves video decoding to the model
    processor. This is required for models like Kimi-K2.5 that need custom
    video chunk splitting.

    NOTE: This is temporary for Kimi-K2.5 testing. Remember to change back
    to opencv before release if needed.
    """

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        **kwargs: Any,
    ) -> tuple[Any, Any]:
        """Return ``(data, None)``; ``num_frames`` and ``kwargs`` are ignored."""
        passthrough = (data, None)
        return passthrough
@
VIDEO_LOADER_REGISTRY
.
register
(
"opencv"
)
@
VIDEO_LOADER_REGISTRY
.
register
(
"opencv"
)
class
OpenCVVideoBackend
(
VideoLoader
):
class
OpenCVVideoBackend
(
VideoLoader
):
def
get_cv2_video_api
(
self
):
def
get_cv2_video_api
(
self
):
...
...
vllm/platforms/rocm.py
View file @
d76fc11e
...
@@ -599,7 +599,8 @@ class RocmPlatform(Platform):
...
@@ -599,7 +599,8 @@ class RocmPlatform(Platform):
cls
,
device
:
torch
.
types
.
Device
|
None
=
None
cls
,
device
:
torch
.
types
.
Device
|
None
=
None
)
->
float
:
)
->
float
:
torch
.
cuda
.
reset_peak_memory_stats
(
device
)
torch
.
cuda
.
reset_peak_memory_stats
(
device
)
# return torch.cuda.mem_get_info(device)[1] - torch.cuda.mem_get_info(device)[0]
# free_mem, total_mem = torch.cuda.mem_get_info(device)
# return total_mem - free_mem
return
torch
.
cuda
.
max_memory_allocated
(
device
)
return
torch
.
cuda
.
max_memory_allocated
(
device
)
@
classmethod
@
classmethod
...
...
vllm/plugins/lora_resolvers/filesystem_resolver.py
View file @
d76fc11e
...
@@ -16,10 +16,20 @@ class FilesystemResolver(LoRAResolver):
...
@@ -16,10 +16,20 @@ class FilesystemResolver(LoRAResolver):
self
,
base_model_name
:
str
,
lora_name
:
str
self
,
base_model_name
:
str
,
lora_name
:
str
)
->
LoRARequest
|
None
:
)
->
LoRARequest
|
None
:
lora_path
=
os
.
path
.
join
(
self
.
lora_cache_dir
,
lora_name
)
lora_path
=
os
.
path
.
join
(
self
.
lora_cache_dir
,
lora_name
)
maybe_lora_request
=
await
self
.
_get_lora_req_from_path
(
lora_name
,
lora_path
,
base_model_name
)
return
maybe_lora_request
async
def
_get_lora_req_from_path
(
self
,
lora_name
:
str
,
lora_path
:
str
,
base_model_name
:
str
)
->
LoRARequest
|
None
:
"""Builds a LoraRequest pointing to the lora path if it's a valid
LoRA adapter and has a matching base_model_name.
"""
if
os
.
path
.
exists
(
lora_path
):
if
os
.
path
.
exists
(
lora_path
):
adapter_config_path
=
os
.
path
.
join
(
adapter_config_path
=
os
.
path
.
join
(
lora_path
,
"adapter_config.json"
)
self
.
lora_cache_dir
,
lora_name
,
"adapter_config.json"
)
if
os
.
path
.
exists
(
adapter_config_path
):
if
os
.
path
.
exists
(
adapter_config_path
):
with
open
(
adapter_config_path
)
as
file
:
with
open
(
adapter_config_path
)
as
file
:
adapter_config
=
json
.
load
(
file
)
adapter_config
=
json
.
load
(
file
)
...
...
vllm/plugins/lora_resolvers/hf_hub_resolver.py
0 → 100644
View file @
d76fc11e
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
os
from
huggingface_hub
import
HfApi
,
snapshot_download
import
vllm.envs
as
envs
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.resolver
import
LoRAResolverRegistry
from
vllm.plugins.lora_resolvers.filesystem_resolver
import
FilesystemResolver
logger
=
init_logger
(
__name__
)
class HfHubResolver(FilesystemResolver):
    """LoRA resolver that downloads adapters from an allow-list of HF Hub repos.

    A LoRA name has the form ``<org>/<repo>`` or ``<org>/<repo>/<subpath>``.
    When it matches a configured repo and the addressed directory contains an
    ``adapter_config.json``, that directory is fetched with
    ``snapshot_download`` and the local copy is turned into a request by the
    inherited ``_get_lora_req_from_path``.
    """

    def __init__(self, repo_list: list[str]):
        """Store the allowed repos and warn that remote downloads are enabled.

        The parent initializer is not invoked here.
        NOTE(review): presumably because the parent only configures a local
        cache directory that this resolver does not use — confirm.

        Args:
            repo_list: HF Hub repo ids (``<org>/<repo>``) adapters may be
                resolved from.
        """
        logger.warning(
            "LoRA is allowing resolution from the following repositories on"
            " HF Hub: %s please note that allowing remote downloads"
            " is not secure, and that this plugin is not intended for use in"
            " production environments.",
            repo_list,
        )
        self.repo_list: list[str] = repo_list
        # Per-repo cache of directories known to hold an adapter_config.json,
        # filled lazily the first time a repo is addressed.
        self.adapter_dirs: dict[str, set[str]] = {}

    async def resolve_lora(
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        """Resolve a LoRA request against the configured HF Hub repos.

        Args:
            base_model_name: Name of the base model the adapter must match.
            lora_name: ``<org>/<repo>`` or ``<org>/<repo>/<subpath>``.

        Returns:
            The request built from the downloaded adapter directory, or
            ``None`` when no configured repo matches, the addressed directory
            holds no adapter config, or the local adapter is rejected.
        """
        # If a LoRA name begins with the repository name, it's disambiguated
        maybe_repo = await self._resolve_repo(lora_name)
        if maybe_repo is None:
            return None

        # If we haven't inspected this repo before, save available adapter dirs
        if maybe_repo not in self.adapter_dirs:
            self.adapter_dirs[maybe_repo] = await self._get_adapter_dirs(maybe_repo)

        maybe_subpath = await self._resolve_repo_subpath(lora_name, maybe_repo)
        if maybe_subpath is None:
            return None

        # Only pull the files belonging to the addressed adapter directory.
        pattern = "*" if maybe_subpath == "." else f"{maybe_subpath}/*"
        repo_path = await asyncio.to_thread(
            snapshot_download,
            repo_id=maybe_repo,
            allow_patterns=pattern,
        )

        lora_path = os.path.join(repo_path, maybe_subpath)
        return await self._get_lora_req_from_path(
            lora_name, lora_path, base_model_name
        )

    async def _resolve_repo(self, lora_name: str) -> str | None:
        """Find the configured repo that ``lora_name`` points into, if any.

        Args:
            lora_name: Path to LoRA in HF Hub, e.g., <org>/<repo>/<subpath>;
                matches on <org>/<repo> exactly, or on <org>/<repo>/ as a
                prefix when the adapter may live in a subdirectory.

        Returns:
            The first matching entry of ``repo_list``, else ``None``.
        """
        for potential_repo in self.repo_list:
            # An empty entry would act as a prefix of every "/..." name.
            if not potential_repo:
                continue
            if lora_name == potential_repo or lora_name.startswith(
                potential_repo + "/"
            ):
                return potential_repo
        return None

    async def _resolve_repo_subpath(
        self, lora_name: str, maybe_repo: str | None
    ) -> str | None:
        """Return the adapter directory inside the repo addressed by the name.

        Args:
            lora_name: Path to LoRA in HF Hub, e.g., <org>/<repo>/<subpath>
            maybe_repo: Path to the repo to match against if one exists.

        Returns:
            ``"."`` for the repo root, the subpath without trailing slashes
            otherwise, or ``None`` when there is no repo or the directory is
            not known to contain an adapter config.
        """
        if maybe_repo is None:
            return None

        # Text after "<repo>/"; trailing slashes are dropped. An empty
        # remainder (bare repo name, or only slashes) means the repo root.
        adapter_dir = lora_name[len(maybe_repo) + 1 :].rstrip("/") or "."

        # Only download if the directory actually contains an adapter. A repo
        # that has not been scanned yet is treated as having none.
        known_dirs = self.adapter_dirs.get(maybe_repo, ())
        return adapter_dir if adapter_dir in known_dirs else None

    async def _get_adapter_dirs(self, repo_name: str) -> set[str]:
        """Gets the subpaths within a HF repo that contain an adapter config.

        Args:
            repo_name: Name of the HF hub repo to inspect.

        Returns:
            Directories (relative to the repo root, ``"."`` for the root
            itself) that directly contain a file named exactly
            ``adapter_config.json``.
        """
        repo_files = await asyncio.to_thread(
            HfApi().list_repo_files, repo_id=repo_name
        )
        adapter_dirs: set[str] = set()
        for name in repo_files:
            # Split on the last "/" so the file name is compared exactly;
            # a suffix test would also accept e.g. "my_adapter_config.json".
            parent, _, filename = name.rpartition("/")
            if filename == "adapter_config.json":
                adapter_dirs.add(parent or ".")
        return adapter_dirs
def register_hf_hub_resolver():
    """Register the Hf hub LoRA Resolver with vLLM.

    Nothing happens unless ``VLLM_LORA_RESOLVER_HF_REPO_LIST`` is set. If it
    is set but ``lora_hf_hub_resolver`` is not listed in ``VLLM_PLUGINS``, a
    warning is logged and nothing is registered. Otherwise the comma-separated
    value is split into repo ids, with surrounding whitespace removed and
    empty entries dropped, and a resolver for those repos is registered.
    """
    hf_repo_list = envs.VLLM_LORA_RESOLVER_HF_REPO_LIST
    is_enabled = (
        envs.VLLM_PLUGINS is not None and "lora_hf_hub_resolver" in envs.VLLM_PLUGINS
    )
    if not hf_repo_list:
        return

    if not is_enabled:
        logger.warning(
            "It appears that VLLM_LORA_RESOLVER_HF_REPO_LIST is set, but "
            "lora_hf_hub_resolver is not enabled in VLLM_PLUGINS; you must"
            " enable this resolver directly in VLLM_PLUGINS to use it "
            " because it allows remote downloads."
        )
        return

    # "a/b, c/d," must yield ["a/b", "c/d"]: untrimmed names never match a
    # LoRA name, and an empty name is not a valid repo id.
    stripped = (entry.strip() for entry in hf_repo_list.split(","))
    repos = [entry for entry in stripped if entry]
    if not repos:
        logger.warning(
            "VLLM_LORA_RESOLVER_HF_REPO_LIST does not contain any repository"
            " names; the HF Hub LoRA resolver was not registered."
        )
        return

    LoRAResolverRegistry.register_resolver("Hf Hub Resolver", HfHubResolver(repos))
vllm/reasoning/__init__.py
View file @
d76fc11e
...
@@ -54,8 +54,8 @@ _REASONING_PARSERS_TO_REGISTER = {
...
@@ -54,8 +54,8 @@ _REASONING_PARSERS_TO_REGISTER = {
"HunyuanA13BReasoningParser"
,
"HunyuanA13BReasoningParser"
,
),
),
"kimi_k2"
:
(
"kimi_k2"
:
(
"
deepseek_r1
_reasoning_parser"
,
"
kimi_k2
_reasoning_parser"
,
"
DeepSeekR1
ReasoningParser"
,
"
KimiK2
ReasoningParser"
,
),
),
"minimax_m2"
:
(
"minimax_m2"
:
(
"minimax_m2_reasoning_parser"
,
"minimax_m2_reasoning_parser"
,
...
...
vllm/reasoning/kimi_k2_reasoning_parser.py
0 → 100644
View file @
d76fc11e
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
collections.abc
import
Sequence
from
typing
import
TYPE_CHECKING
,
Any
from
transformers
import
PreTrainedTokenizerBase
from
vllm.entrypoints.openai.engine.protocol
import
DeltaMessage
from
vllm.logger
import
init_logger
from
vllm.reasoning
import
ReasoningParser
from
vllm.reasoning.deepseek_r1_reasoning_parser
import
DeepSeekR1ReasoningParser
from
.identity_reasoning_parser
import
IdentityReasoningParser
if
TYPE_CHECKING
:
from
vllm.entrypoints.openai.chat_completion.protocol
import
(
ChatCompletionRequest
,
)
else
:
ChatCompletionRequest
=
Any
logger
=
init_logger
(
__name__
)
class KimiK2ReasoningParser(ReasoningParser):
    """
    Kimi K2 parser that delegates to either DeepSeekR1ReasoningParser or
    IdentityReasoningParser based on the `thinking` entry of
    `chat_template_kwargs`.

    Unlike DeepSeekV3ReasoningParser which defaults to NOT thinking,
    KimiK2ReasoningParser defaults to thinking mode (uses DeepSeekR1ReasoningParser).
    """

    def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
        """Pick the delegate parser from ``chat_template_kwargs["thinking"]``.

        Args:
            tokenizer: Tokenizer handed to the base class and to the delegate.
            *args: Forwarded unchanged to the base class and the delegate.
            **kwargs: Forwarded to the base class as given; the delegate
                receives them without ``chat_template_kwargs``.
        """
        super().__init__(tokenizer, *args, **kwargs)

        # `kwargs` is this call's own dict, so removing the entry here only
        # keeps it from being forwarded to the delegate.
        chat_kwargs = kwargs.pop("chat_template_kwargs", {}) or {}

        # The mapping itself still belongs to the caller, so it is read with
        # `get` rather than `pop` to leave it unmodified.
        # Key difference: default to True instead of False
        thinking = bool(chat_kwargs.get("thinking", True))

        delegate_cls = (
            DeepSeekR1ReasoningParser if thinking else IdentityReasoningParser
        )
        self._parser: ReasoningParser = delegate_cls(tokenizer, *args, **kwargs)

    def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
        """Forward to the delegate's ``is_reasoning_end``."""
        return self._parser.is_reasoning_end(input_ids)

    def is_reasoning_end_streaming(
        self, input_ids: list[int], delta_ids: list[int]
    ) -> bool:
        """Forward to the delegate's ``is_reasoning_end_streaming``."""
        return self._parser.is_reasoning_end_streaming(input_ids, delta_ids)

    def extract_content_ids(self, input_ids: list[int]) -> list[int]:
        """Forward to the delegate's ``extract_content_ids``."""
        return self._parser.extract_content_ids(input_ids)

    def extract_reasoning(
        self, model_output: str, request: "ChatCompletionRequest"
    ) -> tuple[str | None, str | None]:
        """Forward to the delegate's ``extract_reasoning``."""
        return self._parser.extract_reasoning(model_output, request)

    def extract_reasoning_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> DeltaMessage | None:
        """Forward to the delegate's ``extract_reasoning_streaming``."""
        return self._parser.extract_reasoning_streaming(
            previous_text,
            current_text,
            delta_text,
            previous_token_ids,
            current_token_ids,
            delta_token_ids,
        )
Prev
1
…
11
12
13
14
15
16
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment