Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a1239b53
Commit
a1239b53
authored
Aug 07, 2025
by
王敏
Browse files
[feat]支持mtp模型full_cuda_graph
parent
7d4f5027
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
8 additions
and
10 deletions
+8
-10
vllm/v1/attention/backends/mla/common.py
vllm/v1/attention/backends/mla/common.py
+4
-4
vllm/v1/spec_decode/eagle.py
vllm/v1/spec_decode/eagle.py
+4
-4
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+0
-2
No files found.
vllm/v1/attention/backends/mla/common.py
View file @
a1239b53
...
@@ -647,13 +647,13 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
...
@@ -647,13 +647,13 @@ class MLACommonMetadataBuilder(AttentionMetadataBuilder[M]):
rarange
=
np
.
repeat
(
query_lens
,
query_lens
)
-
arange
-
1
rarange
=
np
.
repeat
(
query_lens
,
query_lens
)
-
arange
-
1
repeats
=
torch
.
from_numpy
(
query_lens
).
pin_memory
().
to
(
repeats
=
torch
.
from_numpy
(
query_lens
).
pin_memory
().
to
(
block_table_tensor
.
device
,
non_blocking
=
True
)
block_table_tensor
.
device
,
non_blocking
=
True
)
.
contiguous
()
decode_block_table_tensor
=
torch
.
repeat_interleave
(
decode_block_table_tensor
=
torch
.
repeat_interleave
(
block_table_tensor
[:
self
.
_num_decodes
,
...],
block_table_tensor
[:
self
.
_num_decodes
,
...],
repeats
,
dim
=
0
)
repeats
,
dim
=
0
)
.
contiguous
()
decode_seq_lens
=
torch
.
repeat_interleave
(
seq_lens
[:
self
.
_num_decodes
],
repeats
,
dim
=
0
)
decode_seq_lens
=
torch
.
repeat_interleave
(
seq_lens
[:
self
.
_num_decodes
],
repeats
,
dim
=
0
)
.
contiguous
()
seq_lens_minus
=
torch
.
from_numpy
(
rarange
).
to
(
torch
.
int32
).
pin_memory
().
to
(
seq_lens_minus
=
torch
.
from_numpy
(
rarange
).
to
(
torch
.
int32
).
pin_memory
().
to
(
seq_lens
.
device
,
non_blocking
=
True
)
seq_lens
.
device
,
non_blocking
=
True
)
.
contiguous
()
decode_seq_lens
=
decode_seq_lens
-
seq_lens_minus
decode_seq_lens
=
decode_seq_lens
-
seq_lens_minus
if
self
.
spec_decode_block_table_tensor
is
not
None
:
if
self
.
spec_decode_block_table_tensor
is
not
None
:
...
...
vllm/v1/spec_decode/eagle.py
View file @
a1239b53
...
@@ -269,7 +269,7 @@ class EagleProposer:
...
@@ -269,7 +269,7 @@ class EagleProposer:
block_table
=
self
.
runner
.
attn_metadata_builders
[
0
].
block_table
.
get_device_tensor
()[:
batch_size
,
...]
block_table
=
self
.
runner
.
attn_metadata_builders
[
0
].
block_table
.
get_device_tensor
()[:
batch_size
,
...]
attn_metadata
.
decode
=
self
.
runner
.
attn_metadata_builders
[
0
].
_build_decode
(
attn_metadata
.
decode
=
self
.
runner
.
attn_metadata_builders
[
0
].
_build_decode
(
block_table_tensor
=
block_table
,
block_table_tensor
=
block_table
,
seq_lens
=
(
seq_lens
+
1
)
,
seq_lens
=
seq_lens
,
)
)
for
i
in
range
(
self
.
num_speculative_tokens
-
1
):
for
i
in
range
(
self
.
num_speculative_tokens
-
1
):
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
a1239b53
...
@@ -1548,8 +1548,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
...
@@ -1548,8 +1548,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
attn_metadata
,
attn_metadata
,
)
)
spec_token_ids
=
spec_token_ids
.
tolist
()
# Clear KVConnector state after all KVs are generated.
# Clear KVConnector state after all KVs are generated.
if
has_kv_transfer_group
():
if
has_kv_transfer_group
():
get_kv_transfer_group
().
clear_connector_metadata
()
get_kv_transfer_group
().
clear_connector_metadata
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment