Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c721b814
"vscode:/vscode.git/clone" did not exist on "d5d214ac7f379c75265c973fe4f2047e22bda54d"
Commit
c721b814
authored
Feb 05, 2026
by
zhuwenwen
Browse files
sync v0.15.1
parent
d53fe7e5
Changes
328
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
37 additions
and
243 deletions
+37
-243
vllm/model_executor/models/step3_vl.py
vllm/model_executor/models/step3_vl.py
+2
-2
vllm/model_executor/models/tarsier.py
vllm/model_executor/models/tarsier.py
+2
-2
vllm/model_executor/models/transformers/base.py
vllm/model_executor/models/transformers/base.py
+3
-3
vllm/model_executor/models/transformers/moe.py
vllm/model_executor/models/transformers/moe.py
+1
-1
vllm/model_executor/models/ultravox.py
vllm/model_executor/models/ultravox.py
+2
-2
vllm/model_executor/models/utils.py
vllm/model_executor/models/utils.py
+2
-1
vllm/model_executor/models/voxtral.py
vllm/model_executor/models/voxtral.py
+2
-2
vllm/model_executor/models/voxtral_streaming.py
vllm/model_executor/models/voxtral_streaming.py
+2
-2
vllm/model_executor/models/whisper_causal.py
vllm/model_executor/models/whisper_causal.py
+1
-41
vllm/model_executor/models/zamba2.py
vllm/model_executor/models/zamba2.py
+3
-3
vllm/model_executor/warmup/deep_gemm_warmup.py
vllm/model_executor/warmup/deep_gemm_warmup.py
+4
-4
vllm/platforms/tpu.py
vllm/platforms/tpu.py
+0
-1
vllm/platforms/xpu.py
vllm/platforms/xpu.py
+2
-5
vllm/plugins/lora_resolvers/filesystem_resolver.py
vllm/plugins/lora_resolvers/filesystem_resolver.py
+4
-14
vllm/plugins/lora_resolvers/hf_hub_resolver.py
vllm/plugins/lora_resolvers/hf_hub_resolver.py
+0
-143
vllm/tool_parsers/kimi_k2_tool_parser.py
vllm/tool_parsers/kimi_k2_tool_parser.py
+2
-2
vllm/transformers_utils/config.py
vllm/transformers_utils/config.py
+1
-1
vllm/transformers_utils/model_arch_config_convertor.py
vllm/transformers_utils/model_arch_config_convertor.py
+0
-1
vllm/v1/attention/backends/mla/flashmla.py
vllm/v1/attention/backends/mla/flashmla.py
+0
-2
vllm/v1/attention/backends/rocm_attn.py
vllm/v1/attention/backends/rocm_attn.py
+4
-11
No files found.
vllm/model_executor/models/step3_vl.py
View file @
c721b814
...
...
@@ -1101,7 +1101,7 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -1124,4 +1124,4 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]]):
loader
=
AutoWeightsLoader
(
self
)
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
\ No newline at end of file
vllm/model_executor/models/tarsier.py
View file @
c721b814
...
...
@@ -585,7 +585,7 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -610,4 +610,4 @@ class TarsierForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP)
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
loader
=
AutoWeightsLoader
(
self
)
return
loader
.
load_weights
(
weights
)
return
loader
.
load_weights
(
weights
)
\ No newline at end of file
vllm/model_executor/models/transformers/base.py
View file @
c721b814
...
...
@@ -350,7 +350,7 @@ class Base(
# vLLM does not support encoder-decoder models, so if any encoder layer is
# found in a text only model, we assume the whole model is an encoder model
if
has_encoder
(
self
.
model
)
and
not
is_multimodal
(
self
.
config
):
self
.
check_version
(
"5.0.0"
,
"encoder models support"
)
self
.
check_version
(
"5.0.0
.dev0
"
,
"encoder models support"
)
attn_type
=
AttentionType
.
ENCODER_ONLY
else
:
attn_type
=
AttentionType
.
DECODER
...
...
@@ -502,7 +502,7 @@ class Base(
)
def
set_aux_hidden_state_layers
(
self
,
layers
:
tuple
[
int
,
...])
->
None
:
self
.
check_version
(
"5.0.0"
,
"Eagle3 support"
)
self
.
check_version
(
"5.0.0
.dev0
"
,
"Eagle3 support"
)
from
transformers.utils.generic
import
OutputRecorder
# The default value in PreTrainedModel is None
...
...
@@ -520,4 +520,4 @@ class Base(
def
get_eagle3_aux_hidden_state_layers
(
self
)
->
tuple
[
int
,
...]:
num_layers
=
self
.
text_config
.
num_hidden_layers
return
(
2
,
num_layers
//
2
,
num_layers
-
3
)
return
(
2
,
num_layers
//
2
,
num_layers
-
3
)
\ No newline at end of file
vllm/model_executor/models/transformers/moe.py
View file @
c721b814
...
...
@@ -118,7 +118,7 @@ direct_register_custom_op(
class
MoEMixin
(
MixtureOfExperts
):
def
__init__
(
self
,
*
,
vllm_config
:
"VllmConfig"
,
prefix
:
str
=
""
):
self
.
check_version
(
"5.0.0"
,
"MoE models support"
)
self
.
check_version
(
"5.0.0
.dev0
"
,
"MoE models support"
)
# Skip MixtureOfExperts.__init__ and call the next class in MRO
super
(
MixtureOfExperts
,
self
).
__init__
(
vllm_config
=
vllm_config
,
prefix
=
prefix
)
...
...
vllm/model_executor/models/ultravox.py
View file @
c721b814
...
...
@@ -714,7 +714,7 @@ class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -784,4 +784,4 @@ def pad_and_concat_to_dim3(
# Pad and concatenate:
# [[B1, 80, M1], [B2, 80, M2]] -> [B1+B2, 80, max(M1, M2)]
features
=
[
F
.
pad
(
f
,
(
0
,
max_len
-
f
.
shape
[
-
1
]))
for
f
in
features
]
return
torch
.
cat
(
features
)
return
torch
.
cat
(
features
)
\ No newline at end of file
vllm/model_executor/models/utils.py
View file @
c721b814
...
...
@@ -867,6 +867,7 @@ def fast_topk(
# Use topk for efficiency with larger k values
return
torch
.
topk
(
values
,
topk
,
dim
=
dim
)
# Chunk x along the num_tokens axis for sequence parallelism
# NOTE: This is wrapped in a torch custom op to work around the following issue:
# The output tensor can have a sequence length 0 at small input sequence lengths
...
...
@@ -942,4 +943,4 @@ def get_layer_index(feature_layer_index: int, num_hidden_layers: int) -> int:
"""
if
feature_layer_index
<
0
:
return
num_hidden_layers
+
feature_layer_index
+
1
return
feature_layer_index
return
feature_layer_index
\ No newline at end of file
vllm/model_executor/models/voxtral.py
View file @
c721b814
...
...
@@ -397,7 +397,7 @@ class VoxtralForConditionalGeneration(
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -899,4 +899,4 @@ class VoxtralEncoderModel(nn.Module):
weight_loader
=
getattr
(
param
,
"weight_loader"
,
default_weight_loader
)
weight_loader
(
param
,
loaded_weight
)
return
name
return
name
\ No newline at end of file
vllm/model_executor/models/voxtral_streaming.py
View file @
c721b814
...
...
@@ -173,7 +173,7 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
@@ -318,4 +318,4 @@ class VoxtralStreamingGeneration(VoxtralForConditionalGeneration):
audio
=
(
tokenized
.
audios
[
0
].
audio_array
,
stt_config
.
sample_rate
)
prompts_dict
=
{
"multi_modal_data"
:
{
"audio"
:
audio
}}
prompts_dict
[
"prompt_token_ids"
]
=
tokenized
.
tokens
return
cast
(
PromptType
,
prompts_dict
)
return
cast
(
PromptType
,
prompts_dict
)
\ No newline at end of file
vllm/model_executor/models/whisper_causal.py
View file @
c721b814
...
...
@@ -105,7 +105,6 @@ def create_whisper_attention_backend_with_block_pooling(
)
->
type
[
AttentionBackend
]:
prefix
=
"WhisperCausalAttentionWithBlockPooling_"
underlying_builder
=
underlying_attn_backend
.
get_builder_cls
()
underlying_impl
=
underlying_attn_backend
.
get_impl_cls
()
class
WhisperCausalAttentionWithBlockPoolingBuilder
(
underlying_builder
):
# type: ignore
def
__init__
(
...
...
@@ -152,43 +151,6 @@ def create_whisper_attention_backend_with_block_pooling(
common_prefix_len
,
new_common_attn_metadata
,
fast_build
)
# NOTE: We need a custom impl so we can use the transformed slot_mapping
# computed by `WhisperCausalAttentionWithBlockPoolingBuilder` instead of
# the one from `forward_context.slot_mapping` (gpu_model_runner).
# This follows the same pattern as CrossAttentionImpl.
class
WhisperCausalAttentionWithBlockPoolingImpl
(
underlying_impl
):
# type: ignore[valid-type,misc]
def
forward
(
self
,
layer
:
torch
.
nn
.
Module
,
query
:
torch
.
Tensor
,
key
:
torch
.
Tensor
,
value
:
torch
.
Tensor
,
kv_cache
:
torch
.
Tensor
,
attn_metadata
:
AttentionMetadata
,
output
:
torch
.
Tensor
|
None
=
None
,
output_scale
:
torch
.
Tensor
|
None
=
None
,
output_block_scale
:
torch
.
Tensor
|
None
=
None
,
)
->
torch
.
Tensor
:
if
(
not
underlying_attn_backend
.
forward_includes_kv_cache_update
and
attn_metadata
is
not
None
):
self
.
do_kv_cache_update
(
layer
,
key
,
value
,
kv_cache
,
attn_metadata
.
slot_mapping
)
return
super
().
forward
(
layer
,
query
,
key
,
value
,
kv_cache
,
attn_metadata
,
output
,
output_scale
,
output_block_scale
,
)
if
not
issubclass
(
underlying_attn_backend
,
FlashAttentionBackend
):
raise
NotImplementedError
(
f
"
{
underlying_attn_backend
}
is not yet supported."
...
...
@@ -201,7 +163,6 @@ def create_whisper_attention_backend_with_block_pooling(
attention_backend_cls
=
underlying_attn_backend
,
overrides
=
{
"get_builder_cls"
:
lambda
:
WhisperCausalAttentionWithBlockPoolingBuilder
,
"get_impl_cls"
:
lambda
:
WhisperCausalAttentionWithBlockPoolingImpl
,
"get_kv_cache_shape"
:
lambda
num_blocks
,
block_size
,
num_kv_heads
,
...
...
@@ -214,7 +175,6 @@ def create_whisper_attention_backend_with_block_pooling(
num_kv_heads
//
block_pool_size
,
head_size
,
),
# TODO: generalize to other backends
"forward_includes_kv_cache_update"
:
True
,
},
)
...
...
@@ -502,4 +462,4 @@ class WhisperCausalEncoder(nn.Module):
hidden_states
=
encoder_layer
(
hidden_states
,
positions
)
hidden_states
=
self
.
layer_norm
(
hidden_states
)
return
hidden_states
return
hidden_states
\ No newline at end of file
vllm/model_executor/models/zamba2.py
View file @
c721b814
...
...
@@ -771,7 +771,7 @@ class Zamba2Model(nn.Module):
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
)
->
torch
.
Tensor
|
IntermediateTensors
:
...
...
@@ -947,7 +947,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsMambaPrefixC
def
forward
(
self
,
input_ids
:
torch
.
Tensor
|
None
,
input_ids
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
**
kwargs
:
Any
,
...
...
@@ -989,4 +989,4 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsMambaPrefixC
def
load_weights
(
self
,
weights
:
Iterable
[
tuple
[
str
,
torch
.
Tensor
]])
->
set
[
str
]:
loader
=
AutoWeightsLoader
(
self
)
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
return
loader
.
load_weights
(
weights
,
mapper
=
self
.
hf_to_vllm_mapper
)
\ No newline at end of file
vllm/model_executor/warmup/deep_gemm_warmup.py
View file @
c721b814
...
...
@@ -14,6 +14,7 @@ from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank
from
vllm.model_executor.layers.fused_moe.deep_gemm_moe
import
DeepGemmExperts
from
vllm.model_executor.layers.fused_moe.deep_gemm_utils
import
compute_aligned_M
from
vllm.model_executor.layers.fused_moe.layer
import
FusedMoE
,
FusedMoEModularMethod
from
vllm.model_executor.layers.fused_moe.modular_kernel
import
FusedMoEModularKernel
from
vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe
import
(
TritonOrDeepGemmExperts
,
)
...
...
@@ -168,10 +169,9 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
# modular kernels could invoke deep_gemm_moe_fp8
return
True
mk
:
FusedMoEModularKernel
=
module
.
quant_method
.
fused_experts
# Further check if the ModularKernel implementation uses the DeepGemmExperts
return
isinstance
(
module
.
quant_method
.
moe_mk
,
(
DeepGemmExperts
,
TritonOrDeepGemmExperts
)
)
return
isinstance
(
mk
.
fused_experts
,
(
DeepGemmExperts
,
TritonOrDeepGemmExperts
))
FP8_GEMM_NT_WARMUP_CACHE
:
set
[
torch
.
Size
]
=
set
()
...
...
@@ -370,4 +370,4 @@ def deep_gemm_warmup(model: torch.nn.Module, max_tokens: int):
deepgemm_grouped_fp8_gemm_nt_contiguous_warmup
(
model
,
max_tokens
,
pbar
)
else
:
deepgemm_fp8_gemm_nt_warmup
(
model
,
max_tokens
,
None
)
deepgemm_grouped_fp8_gemm_nt_contiguous_warmup
(
model
,
max_tokens
,
None
)
deepgemm_grouped_fp8_gemm_nt_contiguous_warmup
(
model
,
max_tokens
,
None
)
\ No newline at end of file
vllm/platforms/tpu.py
View file @
c721b814
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
vllm.logger
import
init_logger
logger
=
init_logger
(
__name__
)
...
...
vllm/platforms/xpu.py
View file @
c721b814
...
...
@@ -147,10 +147,7 @@ class XPUPlatform(Platform):
model_config
=
vllm_config
.
model_config
# in V1(or with ipex chunked prefill) block_size is 64
if
cache_config
and
cache_config
.
block_size
is
None
:
if
envs
.
VLLM_USE_V1
:
cache_config
.
block_size
=
64
else
:
cache_config
.
block_size
=
16
cache_config
.
block_size
=
64
# lazy import to avoid circular import
from
vllm.config
import
CompilationMode
,
CUDAGraphMode
...
...
@@ -262,4 +259,4 @@ class XPUPlatform(Platform):
)
->
None
:
"""Copy blocks from XPU to host (CPU)."""
_src_cache
=
src_cache
[:,
src_block_indices
]
dst_cache
[:,
dst_block_indices
]
=
_src_cache
.
cpu
()
dst_cache
[:,
dst_block_indices
]
=
_src_cache
.
cpu
()
\ No newline at end of file
vllm/plugins/lora_resolvers/filesystem_resolver.py
View file @
c721b814
...
...
@@ -16,20 +16,10 @@ class FilesystemResolver(LoRAResolver):
self
,
base_model_name
:
str
,
lora_name
:
str
)
->
LoRARequest
|
None
:
lora_path
=
os
.
path
.
join
(
self
.
lora_cache_dir
,
lora_name
)
maybe_lora_request
=
await
self
.
_get_lora_req_from_path
(
lora_name
,
lora_path
,
base_model_name
)
return
maybe_lora_request
async
def
_get_lora_req_from_path
(
self
,
lora_name
:
str
,
lora_path
:
str
,
base_model_name
:
str
)
->
LoRARequest
|
None
:
"""Builds a LoraRequest pointing to the lora path if it's a valid
LoRA adapter and has a matching base_model_name.
"""
if
os
.
path
.
exists
(
lora_path
):
adapter_config_path
=
os
.
path
.
join
(
lora_path
,
"adapter_config.json"
)
adapter_config_path
=
os
.
path
.
join
(
self
.
lora_cache_dir
,
lora_name
,
"adapter_config.json"
)
if
os
.
path
.
exists
(
adapter_config_path
):
with
open
(
adapter_config_path
)
as
file
:
adapter_config
=
json
.
load
(
file
)
...
...
@@ -59,4 +49,4 @@ def register_filesystem_resolver():
fs_resolver
=
FilesystemResolver
(
lora_cache_dir
)
LoRAResolverRegistry
.
register_resolver
(
"Filesystem Resolver"
,
fs_resolver
)
return
return
\ No newline at end of file
vllm/plugins/lora_resolvers/hf_hub_resolver.py
deleted
100644 → 0
View file @
d53fe7e5
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
os
from
huggingface_hub
import
HfApi
,
snapshot_download
import
vllm.envs
as
envs
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.resolver
import
LoRAResolverRegistry
from
vllm.plugins.lora_resolvers.filesystem_resolver
import
FilesystemResolver
logger
=
init_logger
(
__name__
)
class
HfHubResolver
(
FilesystemResolver
):
def
__init__
(
self
,
repo_list
:
list
[
str
]):
logger
.
warning
(
"LoRA is allowing resolution from the following repositories on"
" HF Hub: %s please note that allowing remote downloads"
" is not secure, and that this plugin is not intended for use in"
" production environments."
,
repo_list
,
)
self
.
repo_list
:
list
[
str
]
=
repo_list
self
.
adapter_dirs
:
dict
[
str
,
set
[
str
]]
=
{}
async
def
resolve_lora
(
self
,
base_model_name
:
str
,
lora_name
:
str
)
->
LoRARequest
|
None
:
"""Resolves potential LoRA requests in a remote repo on HF Hub.
This is effectively the same behavior as the filesystem resolver, but
with a snapshot_download on dirs containing an adapter config prior
to inspecting the cached dir to build a potential LoRA
request.
"""
# If a LoRA name begins with the repository name, it's disambiguated
maybe_repo
=
await
self
.
_resolve_repo
(
lora_name
)
# If we haven't inspected this repo before, save available adapter dirs
if
maybe_repo
is
not
None
and
maybe_repo
not
in
self
.
adapter_dirs
:
self
.
adapter_dirs
[
maybe_repo
]
=
await
self
.
_get_adapter_dirs
(
maybe_repo
)
maybe_subpath
=
await
self
.
_resolve_repo_subpath
(
lora_name
,
maybe_repo
)
if
maybe_repo
is
None
or
maybe_subpath
is
None
:
return
None
repo_path
=
await
asyncio
.
to_thread
(
snapshot_download
,
repo_id
=
maybe_repo
,
allow_patterns
=
f
"
{
maybe_subpath
}
/*"
if
maybe_subpath
!=
"."
else
"*"
,
)
lora_path
=
os
.
path
.
join
(
repo_path
,
maybe_subpath
)
maybe_lora_request
=
await
self
.
_get_lora_req_from_path
(
lora_name
,
lora_path
,
base_model_name
)
return
maybe_lora_request
async
def
_resolve_repo
(
self
,
lora_name
:
str
)
->
str
|
None
:
"""Given a fully qualified path to a LoRA with respect to its HF Hub
repo, match the right repo to potentially download from if one exists.
Args:
lora_name: Path to LoRA in HF Hub, e.g., <org>/<repo>/<subpath>,
match on <org>/<repo> (if it contains an adapter directly) or
<org>/<repo>/ if it may have one in subdirs.
"""
for
potential_repo
in
self
.
repo_list
:
if
lora_name
.
startswith
(
potential_repo
)
and
(
len
(
lora_name
)
==
len
(
potential_repo
)
or
lora_name
[
len
(
potential_repo
)]
==
"/"
):
return
potential_repo
return
None
async
def
_resolve_repo_subpath
(
self
,
lora_name
:
str
,
maybe_repo
:
str
|
None
)
->
str
|
None
:
"""Given the fully qualified path of the LoRA with respect to the HF
Repo, get the subpath to download from assuming it's actually got an
adapter in it.
Args:
lora_name: Path to LoRA in HF Hub, e.g., <org>/<repo>/<subpath>
maybe_repo: Path to the repo to match against if one exists.
"""
if
maybe_repo
is
None
:
return
None
repo_len
=
len
(
maybe_repo
)
if
lora_name
==
maybe_repo
or
(
len
(
lora_name
)
==
repo_len
+
1
and
lora_name
[
-
1
]
==
"/"
):
# Resolves to the root of the directory
adapter_dir
=
"."
else
:
# It's a subpath; removing trailing slashes if there are any
adapter_dir
=
lora_name
[
repo_len
+
1
:].
rstrip
(
"/"
)
# Only download if the directory actually contains an adapter
is_adapter
=
adapter_dir
in
self
.
adapter_dirs
[
maybe_repo
]
return
adapter_dir
if
is_adapter
else
None
async
def
_get_adapter_dirs
(
self
,
repo_name
:
str
)
->
set
[
str
]:
"""Gets the subpaths within a HF repo that contain an adapter config.
Args:
repo_name: Name of the HF hub repo to inspect.
"""
repo_files
=
await
asyncio
.
to_thread
(
HfApi
().
list_repo_files
,
repo_id
=
repo_name
)
adapter_dirs
=
{
os
.
path
.
dirname
(
name
)
for
name
in
repo_files
if
name
.
endswith
(
"adapter_config.json"
)
}
if
"adapter_config.json"
in
repo_files
:
adapter_dirs
.
add
(
"."
)
return
adapter_dirs
def
register_hf_hub_resolver
():
"""Register the Hf hub LoRA Resolver with vLLM"""
hf_repo_list
=
envs
.
VLLM_LORA_RESOLVER_HF_REPO_LIST
is_enabled
=
(
envs
.
VLLM_PLUGINS
is
not
None
and
"lora_hf_hub_resolver"
in
envs
.
VLLM_PLUGINS
)
if
hf_repo_list
:
if
not
is_enabled
:
logger
.
warning
(
"It appears that VLLM_LORA_RESOLVER_HF_REPO_LIST is set, but "
"lora_hf_hub_resolver is not enabled in VLLM_PLUGINS; you must"
" enable this resolver directly in VLLM_PLUGINS to use it "
" because it allows remote downloads."
)
else
:
hf_hub_resolver
=
HfHubResolver
(
hf_repo_list
.
split
(
","
))
LoRAResolverRegistry
.
register_resolver
(
"Hf Hub Resolver"
,
hf_hub_resolver
)
return
vllm/tool_parsers/kimi_k2_tool_parser.py
View file @
c721b814
...
...
@@ -448,7 +448,7 @@ class KimiK2ToolParser(ToolParser):
if
current_tool_call_matches
:
tool_id
,
tool_args
=
current_tool_call_matches
.
groups
()
tool_name
=
tool_id
.
split
(
":"
)[
0
].
split
(
"."
)[
-
1
]
current_tool_call
[
"id"
]
=
tool_id
.
strip
()
current_tool_call
[
"id"
]
=
tool_id
current_tool_call
[
"name"
]
=
tool_name
current_tool_call
[
"arguments"
]
=
tool_args
else
:
...
...
@@ -458,7 +458,7 @@ class KimiK2ToolParser(ToolParser):
if
current_tool_call_name_matches
:
(
tool_id_str
,)
=
current_tool_call_name_matches
.
groups
()
tool_name
=
tool_id_str
.
split
(
":"
)[
0
].
split
(
"."
)[
-
1
]
current_tool_call
[
"id"
]
=
tool_id_str
.
strip
()
current_tool_call
[
"id"
]
=
tool_id_str
current_tool_call
[
"name"
]
=
tool_name
current_tool_call
[
"arguments"
]
=
""
else
:
...
...
vllm/transformers_utils/config.py
View file @
c721b814
...
...
@@ -331,7 +331,7 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
partial_rotary_factor
=
getattr_iter
(
config
,
names
,
None
,
warn
=
True
)
ompe
=
getattr
(
config
,
"original_max_position_embeddings"
,
None
)
if
Version
(
version
(
"transformers"
))
<
Version
(
"5.0.0"
):
if
Version
(
version
(
"transformers"
))
<
Version
(
"5.0.0
.dev0
"
):
# Transformers v4 installed, legacy config fields may be present
if
(
rope_scaling
:
=
getattr
(
config
,
"rope_scaling"
,
None
))
is
not
None
:
config
.
rope_parameters
=
rope_scaling
...
...
vllm/transformers_utils/model_arch_config_convertor.py
View file @
c721b814
...
...
@@ -398,7 +398,6 @@ MODEL_ARCH_CONFIG_CONVERTORS = {
"qwen3_next_mtp"
:
Qwen3NextMTPModelArchConfigConvertor
,
"mimo_mtp"
:
MimoMTPModelArchConfigConvertor
,
"glm4_moe_mtp"
:
GLM4MoeMTPModelArchConfigConvertor
,
"glm_ocr_mtp"
:
GLM4MoeMTPModelArchConfigConvertor
,
"ernie_mtp"
:
ErnieMTPModelArchConfigConvertor
,
"pangu_ultra_moe_mtp"
:
PanguUltraMoeMTPModelArchConfigConvertor
,
"longcat_flash_mtp"
:
LongCatFlashMTPModelArchConfigConvertor
,
...
...
vllm/v1/attention/backends/mla/flashmla.py
View file @
c721b814
...
...
@@ -40,7 +40,6 @@ from vllm.v1.attention.ops.flashmla import (
is_flashmla_dense_supported
,
)
from
vllm.v1.kv_cache_interface
import
AttentionSpec
from
vllm.platforms
import
current_platform
logger
=
init_logger
(
__name__
)
...
...
@@ -285,7 +284,6 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
num_splits
=
torch
.
zeros
((
B
+
1
,),
dtype
=
dtype
,
device
=
device
)
scheduler_metadata
.
tile_scheduler_metadata
=
tile_scheduler_metadata
scheduler_metadata
.
num_splits
=
num_splits
if
self
.
kv_cache_dtype
.
startswith
(
"fp8"
):
o
,
lse
=
flash_mla_with_kvcache_fp8
(
q
=
q
,
...
...
vllm/v1/attention/backends/rocm_attn.py
View file @
c721b814
...
...
@@ -330,14 +330,7 @@ class RocmAttentionImpl(AttentionImpl):
kv_cache
,
self
.
num_kv_heads
,
self
.
head_size
)
# key and value may be None in the case of cross attention. They are
# calculated once based on the output from the encoder and then cached
# in KV cache.
if
(
self
.
kv_sharing_target_layer_name
is
None
and
key
is
not
None
and
value
is
not
None
):
if
self
.
kv_sharing_target_layer_name
is
None
:
# Reshape the input keys and values and store them in the cache.
# Skip this if sharing KV cache with an earlier attention layer.
...
...
@@ -389,8 +382,8 @@ class RocmAttentionImpl(AttentionImpl):
# Compute attention and update output up to `num_actual_tokens`.
chunked_prefill_paged_decode
(
query
=
query
[:
num_actual_tokens
],
key
=
key
[:
num_actual_tokens
]
if
key
is
not
None
else
None
,
value
=
value
[:
num_actual_tokens
]
if
value
is
not
None
else
None
,
key
=
key
[:
num_actual_tokens
],
value
=
value
[:
num_actual_tokens
],
output
=
output
[:
num_actual_tokens
],
kv_cache_dtype
=
self
.
kv_cache_dtype
,
key_cache
=
key_cache
,
...
...
@@ -409,4 +402,4 @@ class RocmAttentionImpl(AttentionImpl):
sinks
=
self
.
sinks
,
)
return
output
return
output
\ No newline at end of file
Prev
1
…
12
13
14
15
16
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment