Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
eefa41c1
Commit
eefa41c1
authored
Mar 24, 2026
by
zhuwenwen
Browse files
sync v0.18.0
parent
82155c76
Changes
253
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
174 additions
and
23 deletions
+174
-23
vllm/model_executor/models/voxtral.py
vllm/model_executor/models/voxtral.py
+1
-1
vllm/model_executor/models/voxtral_realtime.py
vllm/model_executor/models/voxtral_realtime.py
+2
-2
vllm/model_executor/models/whisper_causal.py
vllm/model_executor/models/whisper_causal.py
+2
-0
vllm/model_executor/models/zamba2.py
vllm/model_executor/models/zamba2.py
+2
-2
vllm/model_executor/warmup/deep_gemm_warmup.py
vllm/model_executor/warmup/deep_gemm_warmup.py
+0
-2
vllm/plugins/lora_resolvers/filesystem_resolver.py
vllm/plugins/lora_resolvers/filesystem_resolver.py
+13
-3
vllm/plugins/lora_resolvers/hf_hub_resolver.py
vllm/plugins/lora_resolvers/hf_hub_resolver.py
+143
-0
vllm/tool_parsers/kimi_k2_tool_parser.py
vllm/tool_parsers/kimi_k2_tool_parser.py
+3
-3
vllm/transformers_utils/config.py
vllm/transformers_utils/config.py
+2
-2
vllm/transformers_utils/model_arch_config_convertor.py
vllm/transformers_utils/model_arch_config_convertor.py
+2
-1
vllm/v1/structured_output/__init__.py
vllm/v1/structured_output/__init__.py
+1
-4
vllm/v1/worker/gpu/mm/encoder_runner.py
vllm/v1/worker/gpu/mm/encoder_runner.py
+1
-1
vllm/v1/worker/gpu/model_runner.py
vllm/v1/worker/gpu/model_runner.py
+2
-2
No files found.
vllm/model_executor/models/voxtral.py
View file @
eefa41c1
...
@@ -344,7 +344,7 @@ class VoxtralForConditionalGeneration(
...
@@ -344,7 +344,7 @@ class VoxtralForConditionalGeneration(
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
...
vllm/model_executor/models/voxtral_realtime.py
View file @
eefa41c1
...
@@ -328,7 +328,7 @@ class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtim
...
@@ -328,7 +328,7 @@ class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtim
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
intermediate_tensors
:
IntermediateTensors
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
...
@@ -492,4 +492,4 @@ class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtim
...
@@ -492,4 +492,4 @@ class VoxtralRealtimeGeneration(VoxtralForConditionalGeneration, SupportsRealtim
multi_modal_data
=
{
multi_modal_data
=
{
"audio"
:
(
tokenized
.
audios
[
0
].
audio_array
,
stt_config
.
sample_rate
)
"audio"
:
(
tokenized
.
audios
[
0
].
audio_array
,
stt_config
.
sample_rate
)
},
},
)
)
\ No newline at end of file
vllm/model_executor/models/whisper_causal.py
View file @
eefa41c1
...
@@ -115,6 +115,7 @@ def create_whisper_attention_backend_with_block_pooling(
...
@@ -115,6 +115,7 @@ def create_whisper_attention_backend_with_block_pooling(
)
->
type
[
AttentionBackend
]:
)
->
type
[
AttentionBackend
]:
prefix
=
"WhisperCausalAttentionWithBlockPooling_"
prefix
=
"WhisperCausalAttentionWithBlockPooling_"
underlying_builder
=
underlying_attn_backend
.
get_builder_cls
()
underlying_builder
=
underlying_attn_backend
.
get_builder_cls
()
underlying_impl
=
underlying_attn_backend
.
get_impl_cls
()
class
WhisperCausalAttentionWithBlockPoolingBuilder
(
underlying_builder
):
# type: ignore
class
WhisperCausalAttentionWithBlockPoolingBuilder
(
underlying_builder
):
# type: ignore
def
__init__
(
def
__init__
(
...
@@ -243,6 +244,7 @@ def create_whisper_attention_backend_with_block_pooling(
...
@@ -243,6 +244,7 @@ def create_whisper_attention_backend_with_block_pooling(
attention_backend_cls
=
underlying_attn_backend
,
attention_backend_cls
=
underlying_attn_backend
,
overrides
=
{
overrides
=
{
"get_builder_cls"
:
lambda
:
WhisperCausalAttentionWithBlockPoolingBuilder
,
"get_builder_cls"
:
lambda
:
WhisperCausalAttentionWithBlockPoolingBuilder
,
"get_impl_cls"
:
lambda
:
WhisperCausalAttentionWithBlockPoolingImpl
,
"get_kv_cache_shape"
:
lambda
num_blocks
,
"get_kv_cache_shape"
:
lambda
num_blocks
,
block_size
,
block_size
,
num_kv_heads
,
num_kv_heads
,
...
...
vllm/model_executor/models/zamba2.py
View file @
eefa41c1
...
@@ -771,7 +771,7 @@ class Zamba2Model(nn.Module):
...
@@ -771,7 +771,7 @@ class Zamba2Model(nn.Module):
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
)
->
torch
.
Tensor
|
IntermediateTensors
:
)
->
torch
.
Tensor
|
IntermediateTensors
:
...
@@ -947,7 +947,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsMambaPrefixC
...
@@ -947,7 +947,7 @@ class Zamba2ForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsMambaPrefixC
def
forward
(
def
forward
(
self
,
self
,
input_ids
:
torch
.
Tensor
,
input_ids
:
torch
.
Tensor
|
None
,
positions
:
torch
.
Tensor
,
positions
:
torch
.
Tensor
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
inputs_embeds
:
torch
.
Tensor
|
None
=
None
,
**
kwargs
:
Any
,
**
kwargs
:
Any
,
...
...
vllm/model_executor/warmup/deep_gemm_warmup.py
View file @
eefa41c1
...
@@ -14,7 +14,6 @@ from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank
...
@@ -14,7 +14,6 @@ from vllm.distributed.parallel_state import get_dp_group, is_global_first_rank
from
vllm.model_executor.layers.fused_moe.deep_gemm_moe
import
DeepGemmExperts
from
vllm.model_executor.layers.fused_moe.deep_gemm_moe
import
DeepGemmExperts
from
vllm.model_executor.layers.fused_moe.deep_gemm_utils
import
compute_aligned_M
from
vllm.model_executor.layers.fused_moe.deep_gemm_utils
import
compute_aligned_M
from
vllm.model_executor.layers.fused_moe.layer
import
FusedMoE
,
FusedMoEModularMethod
from
vllm.model_executor.layers.fused_moe.layer
import
FusedMoE
,
FusedMoEModularMethod
from
vllm.model_executor.layers.fused_moe.modular_kernel
import
FusedMoEModularKernel
from
vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe
import
(
from
vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe
import
(
TritonOrDeepGemmExperts
,
TritonOrDeepGemmExperts
,
)
)
...
@@ -171,7 +170,6 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
...
@@ -171,7 +170,6 @@ def _fused_moe_grouped_gemm_may_use_deep_gemm(module: torch.nn.Module) -> bool:
# modular kernels could invoke deep_gemm_moe_fp8
# modular kernels could invoke deep_gemm_moe_fp8
return
True
return
True
mk
:
FusedMoEModularKernel
=
module
.
quant_method
.
fused_experts
# Further check if the ModularKernel implementation uses the DeepGemmExperts
# Further check if the ModularKernel implementation uses the DeepGemmExperts
return
isinstance
(
return
isinstance
(
module
.
quant_method
.
moe_kernel
,
(
DeepGemmExperts
,
TritonOrDeepGemmExperts
)
module
.
quant_method
.
moe_kernel
,
(
DeepGemmExperts
,
TritonOrDeepGemmExperts
)
...
...
vllm/plugins/lora_resolvers/filesystem_resolver.py
View file @
eefa41c1
...
@@ -16,10 +16,20 @@ class FilesystemResolver(LoRAResolver):
...
@@ -16,10 +16,20 @@ class FilesystemResolver(LoRAResolver):
self
,
base_model_name
:
str
,
lora_name
:
str
self
,
base_model_name
:
str
,
lora_name
:
str
)
->
LoRARequest
|
None
:
)
->
LoRARequest
|
None
:
lora_path
=
os
.
path
.
join
(
self
.
lora_cache_dir
,
lora_name
)
lora_path
=
os
.
path
.
join
(
self
.
lora_cache_dir
,
lora_name
)
maybe_lora_request
=
await
self
.
_get_lora_req_from_path
(
lora_name
,
lora_path
,
base_model_name
)
return
maybe_lora_request
async
def
_get_lora_req_from_path
(
self
,
lora_name
:
str
,
lora_path
:
str
,
base_model_name
:
str
)
->
LoRARequest
|
None
:
"""Builds a LoraRequest pointing to the lora path if it's a valid
LoRA adapter and has a matching base_model_name.
"""
if
os
.
path
.
exists
(
lora_path
):
if
os
.
path
.
exists
(
lora_path
):
adapter_config_path
=
os
.
path
.
join
(
adapter_config_path
=
os
.
path
.
join
(
lora_path
,
"adapter_config.json"
)
self
.
lora_cache_dir
,
lora_name
,
"adapter_config.json"
)
if
os
.
path
.
exists
(
adapter_config_path
):
if
os
.
path
.
exists
(
adapter_config_path
):
with
open
(
adapter_config_path
)
as
file
:
with
open
(
adapter_config_path
)
as
file
:
adapter_config
=
json
.
load
(
file
)
adapter_config
=
json
.
load
(
file
)
...
...
vllm/plugins/lora_resolvers/hf_hub_resolver.py
0 → 100644
View file @
eefa41c1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
os
from
huggingface_hub
import
HfApi
,
snapshot_download
import
vllm.envs
as
envs
from
vllm.logger
import
init_logger
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.resolver
import
LoRAResolverRegistry
from
vllm.plugins.lora_resolvers.filesystem_resolver
import
FilesystemResolver
logger
=
init_logger
(
__name__
)
class HfHubResolver(FilesystemResolver):
    """LoRA resolver that downloads adapters from allowlisted HF Hub repos.

    A LoRA name is interpreted as ``<repo>[/<subpath>]`` where ``<repo>`` must
    be one of the entries of ``repo_list``. The matching directory is fetched
    with ``snapshot_download`` and then handed to the inherited
    ``_get_lora_req_from_path`` helper to build the request.
    """

    def __init__(self, repo_list: list[str]):
        """Store the allowlist of repositories and warn about remote downloads.

        Args:
            repo_list: HF Hub repo ids LoRA adapters may be resolved from.
        """
        # NOTE(review): the parent initializer is not invoked here; this
        # resolver does not appear to need a local cache dir because paths
        # come from snapshot_download -- confirm against FilesystemResolver.
        logger.warning(
            "LoRA is allowing resolution from the following repositories on"
            " HF Hub: %s please note that allowing remote downloads"
            " is not secure, and that this plugin is not intended for use in"
            " production environments.",
            repo_list,
        )
        self.repo_list: list[str] = repo_list
        # Per-repo cache of directories known to hold an adapter config, so
        # each repo's file listing is only requested once.
        self.adapter_dirs: dict[str, set[str]] = {}

    async def resolve_lora(
        self, base_model_name: str, lora_name: str
    ) -> LoRARequest | None:
        """Resolve a potential LoRA request against a remote repo on HF Hub.

        Behaves like the filesystem resolver, except that the directory
        holding the adapter config is first fetched with ``snapshot_download``
        and the downloaded directory is then inspected.

        Args:
            base_model_name: Name of the base model the adapter must target.
            lora_name: ``<repo>[/<subpath>]`` identifying the adapter.

        Returns:
            The request built by ``_get_lora_req_from_path``, or ``None`` when
            the name matches no allowlisted repo or no adapter directory.
        """
        # A LoRA name that begins with an allowlisted repo is disambiguated.
        maybe_repo = await self._resolve_repo(lora_name)
        if maybe_repo is None:
            return None

        # First time this repo is seen: record which dirs contain adapters.
        if maybe_repo not in self.adapter_dirs:
            self.adapter_dirs[maybe_repo] = await self._get_adapter_dirs(maybe_repo)

        maybe_subpath = await self._resolve_repo_subpath(lora_name, maybe_repo)
        if maybe_subpath is None:
            return None

        # Only pull the adapter's own directory unless it lives at the root.
        repo_path = await asyncio.to_thread(
            snapshot_download,
            repo_id=maybe_repo,
            allow_patterns=f"{maybe_subpath}/*" if maybe_subpath != "." else "*",
        )

        lora_path = os.path.join(repo_path, maybe_subpath)
        return await self._get_lora_req_from_path(
            lora_name, lora_path, base_model_name
        )

    async def _resolve_repo(self, lora_name: str) -> str | None:
        """Find the allowlisted repo that a fully qualified LoRA name refers to.

        Args:
            lora_name: Path to the LoRA on HF Hub, e.g.
                ``<org>/<repo>/<subpath>``. A repo matches when the name is
                exactly the repo id or continues with ``/`` right after it.

        Returns:
            The first matching entry of ``repo_list``, or ``None``.
        """
        for potential_repo in self.repo_list:
            if not lora_name.startswith(potential_repo):
                continue
            # Require a path boundary so "org/repo2" does not match "org/repo".
            remainder = lora_name[len(potential_repo) :]
            if not remainder or remainder[0] == "/":
                return potential_repo
        return None

    async def _resolve_repo_subpath(
        self, lora_name: str, maybe_repo: str | None
    ) -> str | None:
        """Return the adapter directory inside the repo for a LoRA name.

        Args:
            lora_name: Path to the LoRA on HF Hub, e.g.
                ``<org>/<repo>/<subpath>``.
            maybe_repo: Repo id the name was matched against, if any. Its
                adapter dirs must already be cached in ``self.adapter_dirs``.

        Returns:
            ``"."`` for the repo root, the subpath without trailing slashes
            otherwise, or ``None`` when there is no repo or the directory is
            not known to contain an adapter config.
        """
        if maybe_repo is None:
            return None

        # Drop "<repo>/" and any trailing slashes; an empty remainder (the
        # bare repo id, "<repo>/", "<repo>//", ...) means the repo root.
        adapter_dir = lora_name[len(maybe_repo) + 1 :].rstrip("/") or "."

        # Only download if the directory actually contains an adapter.
        if adapter_dir in self.adapter_dirs[maybe_repo]:
            return adapter_dir
        return None

    async def _get_adapter_dirs(self, repo_name: str) -> set[str]:
        """List the directories of a HF repo that contain an adapter config.

        Args:
            repo_name: Name of the HF Hub repo to inspect.

        Returns:
            Directory paths relative to the repo root; the root itself is
            reported as ``"."``.
        """
        repo_files = await asyncio.to_thread(
            HfApi().list_repo_files, repo_id=repo_name
        )
        # Compare the exact file name (not a suffix) so files such as
        # "my_adapter_config.json" are ignored, and map the empty dirname of
        # a root-level config to "." instead of storing "".
        return {
            os.path.dirname(name) or "."
            for name in repo_files
            if os.path.basename(name) == "adapter_config.json"
        }
def register_hf_hub_resolver():
    """Register the HF Hub LoRA resolver with vLLM.

    Registration only happens when ``VLLM_LORA_RESOLVER_HF_REPO_LIST`` names
    at least one repo AND ``lora_hf_hub_resolver`` is listed explicitly in
    ``VLLM_PLUGINS``; the explicit opt-in is required because the resolver
    allows remote downloads.
    """
    hf_repo_list = envs.VLLM_LORA_RESOLVER_HF_REPO_LIST
    if not hf_repo_list:
        return

    is_enabled = (
        envs.VLLM_PLUGINS is not None and "lora_hf_hub_resolver" in envs.VLLM_PLUGINS
    )
    if not is_enabled:
        logger.warning(
            "It appears that VLLM_LORA_RESOLVER_HF_REPO_LIST is set, but "
            "lora_hf_hub_resolver is not enabled in VLLM_PLUGINS; you must"
            " enable this resolver directly in VLLM_PLUGINS to use it "
            " because it allows remote downloads."
        )
        return

    # Tolerate "a, b" and trailing commas: surrounding whitespace would make
    # an entry unmatchable, and an empty entry would act as a repo id for any
    # LoRA name that starts with "/".
    repos = [repo.strip() for repo in hf_repo_list.split(",") if repo.strip()]
    if not repos:
        return

    hf_hub_resolver = HfHubResolver(repos)
    LoRAResolverRegistry.register_resolver("Hf Hub Resolver", hf_hub_resolver)
\ No newline at end of file
vllm/tool_parsers/kimi_k2_tool_parser.py
View file @
eefa41c1
...
@@ -448,7 +448,7 @@ class KimiK2ToolParser(ToolParser):
...
@@ -448,7 +448,7 @@ class KimiK2ToolParser(ToolParser):
if
current_tool_call_matches
:
if
current_tool_call_matches
:
tool_id
,
tool_args
=
current_tool_call_matches
.
groups
()
tool_id
,
tool_args
=
current_tool_call_matches
.
groups
()
tool_name
=
tool_id
.
split
(
":"
)[
0
].
split
(
"."
)[
-
1
]
tool_name
=
tool_id
.
split
(
":"
)[
0
].
split
(
"."
)[
-
1
]
current_tool_call
[
"id"
]
=
tool_id
current_tool_call
[
"id"
]
=
tool_id
.
strip
()
current_tool_call
[
"name"
]
=
tool_name
current_tool_call
[
"name"
]
=
tool_name
current_tool_call
[
"arguments"
]
=
tool_args
current_tool_call
[
"arguments"
]
=
tool_args
else
:
else
:
...
@@ -458,7 +458,7 @@ class KimiK2ToolParser(ToolParser):
...
@@ -458,7 +458,7 @@ class KimiK2ToolParser(ToolParser):
if
current_tool_call_name_matches
:
if
current_tool_call_name_matches
:
(
tool_id_str
,)
=
current_tool_call_name_matches
.
groups
()
(
tool_id_str
,)
=
current_tool_call_name_matches
.
groups
()
tool_name
=
tool_id_str
.
split
(
":"
)[
0
].
split
(
"."
)[
-
1
]
tool_name
=
tool_id_str
.
split
(
":"
)[
0
].
split
(
"."
)[
-
1
]
current_tool_call
[
"id"
]
=
tool_id_str
current_tool_call
[
"id"
]
=
tool_id_str
.
strip
()
current_tool_call
[
"name"
]
=
tool_name
current_tool_call
[
"name"
]
=
tool_name
current_tool_call
[
"arguments"
]
=
""
current_tool_call
[
"arguments"
]
=
""
else
:
else
:
...
@@ -597,4 +597,4 @@ class KimiK2ToolParser(ToolParser):
...
@@ -597,4 +597,4 @@ class KimiK2ToolParser(ToolParser):
except
Exception
:
except
Exception
:
logger
.
exception
(
"Error trying to handle streaming tool call."
)
logger
.
exception
(
"Error trying to handle streaming tool call."
)
return
None
# do not stream a delta. skip this token ID.
return
None
# do not stream a delta. skip this token ID.
\ No newline at end of file
vllm/transformers_utils/config.py
View file @
eefa41c1
...
@@ -377,7 +377,7 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
...
@@ -377,7 +377,7 @@ def patch_rope_parameters(config: PretrainedConfig) -> None:
partial_rotary_factor
=
getattr_iter
(
config
,
names
,
None
,
warn
=
True
)
partial_rotary_factor
=
getattr_iter
(
config
,
names
,
None
,
warn
=
True
)
ompe
=
getattr
(
config
,
"original_max_position_embeddings"
,
None
)
ompe
=
getattr
(
config
,
"original_max_position_embeddings"
,
None
)
if
Version
(
version
(
"transformers"
))
<
Version
(
"5.0.0
.dev0
"
):
if
Version
(
version
(
"transformers"
))
<
Version
(
"5.0.0"
):
# Transformers v4 installed, legacy config fields may be present
# Transformers v4 installed, legacy config fields may be present
if
(
rope_scaling
:
=
getattr
(
config
,
"rope_scaling"
,
None
))
is
not
None
:
if
(
rope_scaling
:
=
getattr
(
config
,
"rope_scaling"
,
None
))
is
not
None
:
config
.
rope_parameters
=
rope_scaling
config
.
rope_parameters
=
rope_scaling
...
@@ -1209,4 +1209,4 @@ def _maybe_retrieve_max_pos_from_hf(model, revision, **kwargs) -> int:
...
@@ -1209,4 +1209,4 @@ def _maybe_retrieve_max_pos_from_hf(model, revision, **kwargs) -> int:
exc_info
=
e
,
exc_info
=
e
,
)
)
return
max_position_embeddings
return
max_position_embeddings
\ No newline at end of file
vllm/transformers_utils/model_arch_config_convertor.py
View file @
eefa41c1
...
@@ -441,7 +441,8 @@ MODEL_ARCH_CONFIG_CONVERTORS = {
...
@@ -441,7 +441,8 @@ MODEL_ARCH_CONFIG_CONVERTORS = {
"qwen3_5_mtp"
:
Qwen3_5MTPModelArchConfigConvertor
,
"qwen3_5_mtp"
:
Qwen3_5MTPModelArchConfigConvertor
,
"mimo_mtp"
:
MimoMTPModelArchConfigConvertor
,
"mimo_mtp"
:
MimoMTPModelArchConfigConvertor
,
"glm4_moe_mtp"
:
GLM4MoeMTPModelArchConfigConvertor
,
"glm4_moe_mtp"
:
GLM4MoeMTPModelArchConfigConvertor
,
"glm_ocr_mtp"
:
GLM4MoeMTPModelArchConfigConvertor
,
"ernie_mtp"
:
ErnieMTPModelArchConfigConvertor
,
"ernie_mtp"
:
ErnieMTPModelArchConfigConvertor
,
"pangu_ultra_moe_mtp"
:
PanguUltraMoeMTPModelArchConfigConvertor
,
"pangu_ultra_moe_mtp"
:
PanguUltraMoeMTPModelArchConfigConvertor
,
"longcat_flash_mtp"
:
LongCatFlashMTPModelArchConfigConvertor
,
"longcat_flash_mtp"
:
LongCatFlashMTPModelArchConfigConvertor
,
}
}
\ No newline at end of file
vllm/v1/structured_output/__init__.py
View file @
eefa41c1
...
@@ -74,9 +74,6 @@ class StructuredOutputManager:
...
@@ -74,9 +74,6 @@ class StructuredOutputManager:
self
.
tokenizer
=
cached_tokenizer_from_config
(
self
.
tokenizer
=
cached_tokenizer_from_config
(
model_config
=
self
.
vllm_config
.
model_config
model_config
=
self
.
vllm_config
.
model_config
)
)
reasoning_parser
=
(
self
.
vllm_config
.
structured_outputs_config
.
reasoning_parser
)
reasoning_parser_plugin
=
(
reasoning_parser_plugin
=
(
self
.
vllm_config
.
structured_outputs_config
.
reasoning_parser_plugin
self
.
vllm_config
.
structured_outputs_config
.
reasoning_parser_plugin
)
)
...
@@ -341,4 +338,4 @@ class StructuredOutputManager:
...
@@ -341,4 +338,4 @@ class StructuredOutputManager:
def
clear_backend
(
self
)
->
None
:
def
clear_backend
(
self
)
->
None
:
if
self
.
backend
is
not
None
:
if
self
.
backend
is
not
None
:
self
.
backend
.
destroy
()
self
.
backend
.
destroy
()
\ No newline at end of file
vllm/v1/worker/gpu/mm/encoder_runner.py
View file @
eefa41c1
...
@@ -132,7 +132,7 @@ class EncoderRunner:
...
@@ -132,7 +132,7 @@ class EncoderRunner:
mm_embeds
.
append
(
mm_embeds_item
)
mm_embeds
.
append
(
mm_embeds_item
)
# Copy the is_mm_embed tensor to the GPU.
# Copy the is_mm_embed tensor to the GPU.
is_mm_embed
=
self
.
tmp_
is_mm_embed
.
copy_to_gpu
(
is_mm_embed
)
is_mm_embed
=
is_mm_embed
.
to
(
device
=
self
.
device
,
non_blocking
=
True
)
return
mm_embeds
,
is_mm_embed
return
mm_embeds
,
is_mm_embed
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
...
...
vllm/v1/worker/gpu/model_runner.py
View file @
eefa41c1
...
@@ -672,7 +672,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -672,7 +672,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
cu_num_logits_np
=
np
.
empty
(
num_reqs
+
1
,
dtype
=
np
.
int32
)
cu_num_logits_np
=
np
.
empty
(
num_reqs
+
1
,
dtype
=
np
.
int32
)
cu_num_logits_np
[
0
]
=
0
cu_num_logits_np
[
0
]
=
0
np
.
cumsum
(
num_logits
,
out
=
cu_num_logits_np
[
1
:])
np
.
cumsum
(
num_logits
,
out
=
cu_num_logits_np
[
1
:])
cu_num_logits
=
self
.
tmp_cu_num_logits
.
copy_to_gpu
(
cu_num_logits_np
)
cu_num_logits
=
async_
copy_to_gpu
(
cu_num_logits_np
,
device
=
self
.
device
)
max_expand_len
=
self
.
num_speculative_steps
+
1
max_expand_len
=
self
.
num_speculative_steps
+
1
expanded_idx_mapping
,
expanded_local_pos
=
expand_idx_mapping
(
expanded_idx_mapping
,
expanded_local_pos
=
expand_idx_mapping
(
...
@@ -1225,4 +1225,4 @@ class ExecuteModelState(NamedTuple):
...
@@ -1225,4 +1225,4 @@ class ExecuteModelState(NamedTuple):
hidden_states
:
torch
.
Tensor
|
IntermediateTensors
hidden_states
:
torch
.
Tensor
|
IntermediateTensors
aux_hidden_states
:
list
[
torch
.
Tensor
]
|
None
aux_hidden_states
:
list
[
torch
.
Tensor
]
|
None
kv_connector_output
:
KVConnectorOutput
|
None
kv_connector_output
:
KVConnectorOutput
|
None
num_tokens_across_dp
:
torch
.
Tensor
|
None
num_tokens_across_dp
:
torch
.
Tensor
|
None
\ No newline at end of file
Prev
1
…
9
10
11
12
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment