Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c721b814
Commit
c721b814
authored
Feb 05, 2026
by
zhuwenwen
Browse files
sync v0.15.1
parent
d53fe7e5
Changes
328
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
27 additions
and
35 deletions
+27
-35
vllm/v1/attention/ops/chunked_prefill_paged_decode.py
vllm/v1/attention/ops/chunked_prefill_paged_decode.py
+2
-2
vllm/v1/attention/ops/flashmla.py
vllm/v1/attention/ops/flashmla.py
+1
-0
vllm/v1/engine/core_client.py
vllm/v1/engine/core_client.py
+0
-1
vllm/v1/spec_decode/eagle.py
vllm/v1/spec_decode/eagle.py
+1
-2
vllm/v1/structured_output/__init__.py
vllm/v1/structured_output/__init__.py
+3
-0
vllm/v1/worker/gpu/buffer_utils.py
vllm/v1/worker/gpu/buffer_utils.py
+1
-21
vllm/v1/worker/gpu/mm/encoder_runner.py
vllm/v1/worker/gpu/mm/encoder_runner.py
+6
-3
vllm/v1/worker/gpu/model_runner.py
vllm/v1/worker/gpu/model_runner.py
+13
-6
No files found.
vllm/v1/attention/ops/chunked_prefill_paged_decode.py
View file @
c721b814
...
@@ -303,8 +303,8 @@ def chunked_prefill_paged_decode(
...
@@ -303,8 +303,8 @@ def chunked_prefill_paged_decode(
num_seqs
=
len
(
seq_lens
)
num_seqs
=
len
(
seq_lens
)
num_query_heads
=
query
.
shape
[
1
]
num_query_heads
=
query
.
shape
[
1
]
# key may be None in cross-attention decode (already cached from encoder)
# key may be None in cross-attention decode (already cached from encoder)
num_kv_heads
=
key
.
shape
[
1
]
if
key
is
not
None
else
key_cache
.
shape
[
1
]
num_kv_heads
=
key
.
shape
[
1
]
num_queries_per_kv
=
num_
query
_heads
//
num_kv_heads
num_queries_per_kv
=
query
.
shape
[
1
]
//
key
.
shape
[
1
]
head_size
=
query
.
shape
[
2
]
head_size
=
query
.
shape
[
2
]
# Conversion of FP8 Tensor from uint8 storage to
# Conversion of FP8 Tensor from uint8 storage to
...
...
vllm/v1/attention/ops/flashmla.py
View file @
c721b814
...
@@ -22,6 +22,7 @@ else:
...
@@ -22,6 +22,7 @@ else:
if
current_platform
.
is_cuda
():
if
current_platform
.
is_cuda
():
try
:
try
:
import
vllm._flashmla_extension_C
# noqa: F401
import
vllm._flashmla_extension_C
# noqa: F401
_flashmla_extension_C_AVAILABLE
=
True
_flashmla_extension_C_AVAILABLE
=
True
except
ImportError
:
except
ImportError
:
_flashmla_extension_C_AVAILABLE
=
False
_flashmla_extension_C_AVAILABLE
=
False
...
...
vllm/v1/engine/core_client.py
View file @
c721b814
...
@@ -8,7 +8,6 @@ import sys
...
@@ -8,7 +8,6 @@ import sys
import
uuid
import
uuid
import
weakref
import
weakref
from
abc
import
ABC
,
abstractmethod
from
abc
import
ABC
,
abstractmethod
from
collections
import
defaultdict
,
deque
from
collections
import
defaultdict
,
deque
from
collections.abc
import
Awaitable
,
Callable
,
Sequence
from
collections.abc
import
Awaitable
,
Callable
,
Sequence
from
concurrent.futures
import
Future
from
concurrent.futures
import
Future
...
...
vllm/v1/spec_decode/eagle.py
View file @
c721b814
...
@@ -403,7 +403,7 @@ class SpecDecodeBaseProposer:
...
@@ -403,7 +403,7 @@ class SpecDecodeBaseProposer:
return
draft_token_ids
.
view
(
-
1
,
1
)
return
draft_token_ids
.
view
(
-
1
,
1
)
if
self
.
uses_mrope
:
if
self
.
uses_mrope
:
positions
=
self
.
mrope_
positions
[:,
last_token_indices
]
positions
=
self
.
positions
[:,
last_token_indices
]
else
:
else
:
positions
=
self
.
positions
[
last_token_indices
]
positions
=
self
.
positions
[
last_token_indices
]
if
self
.
method
in
(
if
self
.
method
in
(
...
@@ -1126,7 +1126,6 @@ class SpecDecodeBaseProposer:
...
@@ -1126,7 +1126,6 @@ class SpecDecodeBaseProposer:
"Qwen2_5_VLForConditionalGeneration"
,
"Qwen2_5_VLForConditionalGeneration"
,
"Qwen3VLForConditionalGeneration"
,
"Qwen3VLForConditionalGeneration"
,
"Qwen3VLMoeForConditionalGeneration"
,
"Qwen3VLMoeForConditionalGeneration"
,
"GlmOcrForConditionalGeneration"
,
]:
]:
self
.
model
.
config
.
image_token_index
=
target_model
.
config
.
image_token_id
self
.
model
.
config
.
image_token_index
=
target_model
.
config
.
image_token_id
elif
self
.
get_model_name
(
target_model
)
==
"PixtralForConditionalGeneration"
:
elif
self
.
get_model_name
(
target_model
)
==
"PixtralForConditionalGeneration"
:
...
...
vllm/v1/structured_output/__init__.py
View file @
c721b814
...
@@ -74,6 +74,9 @@ class StructuredOutputManager:
...
@@ -74,6 +74,9 @@ class StructuredOutputManager:
self
.
tokenizer
=
cached_tokenizer_from_config
(
self
.
tokenizer
=
cached_tokenizer_from_config
(
model_config
=
self
.
vllm_config
.
model_config
model_config
=
self
.
vllm_config
.
model_config
)
)
reasoning_parser
=
(
self
.
vllm_config
.
structured_outputs_config
.
reasoning_parser
)
reasoning_parser_plugin
=
(
reasoning_parser_plugin
=
(
self
.
vllm_config
.
structured_outputs_config
.
reasoning_parser_plugin
self
.
vllm_config
.
structured_outputs_config
.
reasoning_parser_plugin
)
)
...
...
vllm/v1/worker/gpu/buffer_utils.py
View file @
c721b814
...
@@ -11,26 +11,6 @@ from vllm.utils.platform_utils import is_uva_available
...
@@ -11,26 +11,6 @@ from vllm.utils.platform_utils import is_uva_available
from
vllm.utils.torch_utils
import
get_cuda_view_from_cpu_tensor
from
vllm.utils.torch_utils
import
get_cuda_view_from_cpu_tensor
def
async_copy_to_gpu
(
x
:
torch
.
Tensor
|
np
.
ndarray
,
out
:
torch
.
Tensor
|
None
=
None
,
device
:
torch
.
device
|
None
=
None
,
)
->
torch
.
Tensor
:
if
isinstance
(
x
,
np
.
ndarray
):
x
=
torch
.
from_numpy
(
x
)
assert
x
.
is_cpu
assert
not
x
.
is_pinned
()
if
out
is
None
:
assert
device
is
not
None
out
=
torch
.
empty_like
(
x
,
device
=
device
)
# CPU-to-CPU copy
tmp
=
x
.
pin_memory
()
# CPU-to-GPU copy
return
out
.
copy_
(
tmp
,
non_blocking
=
True
)
class
UvaBuffer
:
class
UvaBuffer
:
def
__init__
(
self
,
size
:
int
|
Sequence
[
int
],
dtype
:
torch
.
dtype
):
def
__init__
(
self
,
size
:
int
|
Sequence
[
int
],
dtype
:
torch
.
dtype
):
if
not
is_uva_available
():
if
not
is_uva_available
():
...
@@ -241,4 +221,4 @@ def _apply_write_kernel(
...
@@ -241,4 +221,4 @@ def _apply_write_kernel(
content
=
tl
.
load
(
write_contents_ptr
+
cu_start
+
block
,
mask
=
mask
)
content
=
tl
.
load
(
write_contents_ptr
+
cu_start
+
block
,
mask
=
mask
)
tl
.
store
(
tl
.
store
(
output_ptr
+
row_idx
*
output_stride
+
start_idx
+
block
,
content
,
mask
=
mask
output_ptr
+
row_idx
*
output_stride
+
start_idx
+
block
,
content
,
mask
=
mask
)
)
\ No newline at end of file
vllm/v1/worker/gpu/mm/encoder_runner.py
View file @
c721b814
...
@@ -6,6 +6,7 @@ import torch
...
@@ -6,6 +6,7 @@ import torch
from
vllm.model_executor.models.interfaces
import
SupportsMultiModal
from
vllm.model_executor.models.interfaces
import
SupportsMultiModal
from
vllm.multimodal.inputs
import
MultiModalFeatureSpec
,
MultiModalKwargsItem
from
vllm.multimodal.inputs
import
MultiModalFeatureSpec
,
MultiModalKwargsItem
from
vllm.multimodal.utils
import
group_mm_kwargs_by_modality
from
vllm.multimodal.utils
import
group_mm_kwargs_by_modality
from
vllm.v1.worker.gpu.buffer_utils
import
UvaBufferPool
from
vllm.v1.worker.utils
import
sanity_check_mm_encoder_outputs
from
vllm.v1.worker.utils
import
sanity_check_mm_encoder_outputs
...
@@ -31,6 +32,8 @@ class EncoderRunner:
...
@@ -31,6 +32,8 @@ class EncoderRunner:
self
.
req_id_to_mm_features
:
dict
[
str
,
list
[
MultiModalFeatureSpec
]]
=
{}
self
.
req_id_to_mm_features
:
dict
[
str
,
list
[
MultiModalFeatureSpec
]]
=
{}
self
.
encoder_cache
:
dict
[
str
,
torch
.
Tensor
]
=
{}
self
.
encoder_cache
:
dict
[
str
,
torch
.
Tensor
]
=
{}
self
.
tmp_is_mm_embed
=
UvaBufferPool
(
max_num_tokens
,
torch
.
bool
)
def
add_request
(
self
,
req_id
:
str
,
mm_features
:
list
[
MultiModalFeatureSpec
]):
def
add_request
(
self
,
req_id
:
str
,
mm_features
:
list
[
MultiModalFeatureSpec
]):
self
.
req_id_to_mm_features
[
req_id
]
=
mm_features
self
.
req_id_to_mm_features
[
req_id
]
=
mm_features
...
@@ -111,7 +114,7 @@ class EncoderRunner:
...
@@ -111,7 +114,7 @@ class EncoderRunner:
total_num_scheduled_tokens
,
total_num_scheduled_tokens
,
dtype
=
torch
.
bool
,
dtype
=
torch
.
bool
,
device
=
"cpu"
,
device
=
"cpu"
,
pin_memory
=
Tru
e
,
pin_memory
=
Fals
e
,
)
)
for
i
,
req_id
in
enumerate
(
req_ids
):
for
i
,
req_id
in
enumerate
(
req_ids
):
if
not
is_prefilling
[
i
]:
if
not
is_prefilling
[
i
]:
...
@@ -160,7 +163,7 @@ class EncoderRunner:
...
@@ -160,7 +163,7 @@ class EncoderRunner:
mm_embeds
.
append
(
mm_embeds_item
)
mm_embeds
.
append
(
mm_embeds_item
)
# Copy the is_mm_embed tensor to the GPU.
# Copy the is_mm_embed tensor to the GPU.
is_mm_embed
=
is_mm_embed
.
to
(
device
=
self
.
device
,
non_blocking
=
True
)
is_mm_embed
=
self
.
tmp_
is_mm_embed
.
copy_to_gpu
(
is_mm_embed
)
return
mm_embeds
,
is_mm_embed
return
mm_embeds
,
is_mm_embed
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
...
@@ -178,4 +181,4 @@ class EncoderRunner:
...
@@ -178,4 +181,4 @@ class EncoderRunner:
)
)
# Copy to the pre-allocated buffer for CUDA graphs.
# Copy to the pre-allocated buffer for CUDA graphs.
self
.
inputs_embeds
[:
x
.
shape
[
0
]]
=
x
self
.
inputs_embeds
[:
x
.
shape
[
0
]]
=
x
return
self
.
inputs_embeds
return
self
.
inputs_embeds
\ No newline at end of file
vllm/v1/worker/gpu/model_runner.py
View file @
c721b814
...
@@ -30,7 +30,7 @@ from vllm.v1.worker.gpu.attn_utils import (
...
@@ -30,7 +30,7 @@ from vllm.v1.worker.gpu.attn_utils import (
init_kv_cache
,
init_kv_cache
,
)
)
from
vllm.v1.worker.gpu.block_table
import
BlockTables
from
vllm.v1.worker.gpu.block_table
import
BlockTables
from
vllm.v1.worker.gpu.buffer_utils
import
async_copy_to_gpu
from
vllm.v1.worker.gpu.buffer_utils
import
UvaBufferPool
from
vllm.v1.worker.gpu.cudagraph_utils
import
CudaGraphManager
from
vllm.v1.worker.gpu.cudagraph_utils
import
CudaGraphManager
from
vllm.v1.worker.gpu.dp_utils
import
(
from
vllm.v1.worker.gpu.dp_utils
import
(
get_cudagraph_and_dp_padding
,
get_cudagraph_and_dp_padding
,
...
@@ -172,6 +172,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -172,6 +172,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# LoRA-related workers.
# LoRA-related workers.
self
.
lora_state
=
LoraState
(
max_num_reqs
=
self
.
max_num_reqs
)
self
.
lora_state
=
LoraState
(
max_num_reqs
=
self
.
max_num_reqs
)
# Buffers for CPU-to-GPU copies.
self
.
tmp_idx_mapping
=
UvaBufferPool
(
self
.
max_num_reqs
,
torch
.
int32
)
self
.
tmp_cu_num_logits
=
UvaBufferPool
(
self
.
max_num_reqs
+
1
,
torch
.
int32
)
self
.
tmp_query_start_loc
=
UvaBufferPool
(
self
.
max_num_reqs
+
1
,
torch
.
int32
)
self
.
kv_connector
:
KVConnector
=
NO_OP_KV_CONNECTOR
self
.
kv_connector
:
KVConnector
=
NO_OP_KV_CONNECTOR
def
update_max_model_len
(
self
,
max_model_len
:
int
)
->
None
:
def
update_max_model_len
(
self
,
max_model_len
:
int
)
->
None
:
...
@@ -513,7 +518,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -513,7 +518,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self
.
req_states
.
req_id_to_index
[
req_id
]
for
req_id
in
req_ids
self
.
req_states
.
req_id_to_index
[
req_id
]
for
req_id
in
req_ids
]
]
idx_mapping_np
=
np
.
array
(
idx_mapping_list
,
dtype
=
np
.
int32
)
idx_mapping_np
=
np
.
array
(
idx_mapping_list
,
dtype
=
np
.
int32
)
idx_mapping
=
async_
copy_to_gpu
(
idx_mapping_np
,
device
=
self
.
device
)
idx_mapping
=
self
.
tmp_idx_mapping
.
copy_to_gpu
(
idx_mapping_np
)
# Get the number of draft tokens for each request.
# Get the number of draft tokens for each request.
if
not
scheduler_output
.
scheduled_spec_decode_tokens
:
if
not
scheduler_output
.
scheduled_spec_decode_tokens
:
...
@@ -541,7 +546,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -541,7 +546,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
cu_num_logits_np
=
np
.
empty
(
num_reqs
+
1
,
dtype
=
np
.
int32
)
cu_num_logits_np
=
np
.
empty
(
num_reqs
+
1
,
dtype
=
np
.
int32
)
cu_num_logits_np
[
0
]
=
0
cu_num_logits_np
[
0
]
=
0
np
.
cumsum
(
num_logits
,
out
=
cu_num_logits_np
[
1
:])
np
.
cumsum
(
num_logits
,
out
=
cu_num_logits_np
[
1
:])
cu_num_logits
=
async_
copy_to_gpu
(
cu_num_logits_np
,
device
=
self
.
device
)
cu_num_logits
=
self
.
tmp_cu_num_logits
.
copy_to_gpu
(
cu_num_logits_np
)
expanded_idx_mapping
=
expand_idx_mapping
(
expanded_idx_mapping
=
expand_idx_mapping
(
idx_mapping
,
idx_mapping
,
...
@@ -560,8 +565,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -560,8 +565,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# Pad for full CUDA graph mode.
# Pad for full CUDA graph mode.
# Some attention backends like FA3 require query_start_loc to be non-decreasing.
# Some attention backends like FA3 require query_start_loc to be non-decreasing.
query_start_loc_np
[
num_reqs
+
1
:]
=
num_tokens
query_start_loc_np
[
num_reqs
+
1
:]
=
num_tokens
async_copy_to_gpu
(
query_start_loc_np
,
out
=
self
.
input_buffers
.
query_start_loc
)
self
.
tmp_query_start_loc
.
copy_to_gpu
(
query_start_loc_np
,
out
=
self
.
input_buffers
.
query_start_loc
,
)
query_start_loc_np
=
query_start_loc_np
[:
num_reqs
+
1
]
query_start_loc_np
=
query_start_loc_np
[:
num_reqs
+
1
]
query_start_loc_cpu
=
torch
.
from_numpy
(
query_start_loc_np
)
query_start_loc_cpu
=
torch
.
from_numpy
(
query_start_loc_np
)
query_start_loc
=
self
.
input_buffers
.
query_start_loc
[:
num_reqs
+
1
]
query_start_loc
=
self
.
input_buffers
.
query_start_loc
[:
num_reqs
+
1
]
...
@@ -969,4 +976,4 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -969,4 +976,4 @@ class GPUModelRunner(LoRAModelRunnerMixin):
if
self
.
use_async_scheduling
:
if
self
.
use_async_scheduling
:
return
async_output
return
async_output
return
async_output
.
get_output
()
return
async_output
.
get_output
()
\ No newline at end of file
Prev
1
…
13
14
15
16
17
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment