Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
b16f183c
Commit
b16f183c
authored
Jul 12, 2025
by
王敏
Browse files
[fix]1.优化mtp代码,解决prefix-caching不兼容问题; 2.修复v1 engine从tokenizer config中读取max_model_len导致长文本输入报错
parent
ac182341
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
39 additions
and
68 deletions
+39
-68
vllm/config.py
vllm/config.py
+9
-9
vllm/v1/attention/backends/mla/common.py
vllm/v1/attention/backends/mla/common.py
+24
-53
vllm/v1/attention/backends/utils.py
vllm/v1/attention/backends/utils.py
+1
-1
vllm/v1/spec_decode/eagle.py
vllm/v1/spec_decode/eagle.py
+3
-2
vllm/v1/worker/gpu_model_runner.py
vllm/v1/worker/gpu_model_runner.py
+2
-3
No files found.
vllm/config.py
View file @
b16f183c
...
...
@@ -1440,17 +1440,17 @@ class ModelConfig:
spec_target_max_model_len
=
self
.
spec_target_max_model_len
,
encoder_config
=
self
.
encoder_config
)
tokenizer_config
=
try_get_tokenizer_config
(
self
.
tokenizer
,
trust_remote_code
=
self
.
trust_remote_code
,
revision
=
self
.
tokenizer_revision
)
#
tokenizer_config = try_get_tokenizer_config(
#
self.tokenizer,
#
trust_remote_code=self.trust_remote_code,
#
revision=self.tokenizer_revision)
if
tokenizer_config
is
None
:
return
max_model_len
#
if tokenizer_config is None:
#
return max_model_len
model_max_length
=
tokenizer_config
.
get
(
"model_max_length"
,
max_model_len
)
max_model_len
=
min
(
max_model_len
,
model_max_length
)
#
model_max_length = tokenizer_config.get("model_max_length",
#
max_model_len)
#
max_model_len = min(max_model_len, model_max_length)
return
max_model_len
...
...
vllm/v1/attention/backends/mla/common.py
View file @
b16f183c
...
...
@@ -388,30 +388,8 @@ class MLACommonMetadataBuilder(Generic[M]):
)
self
.
block_table
=
block_table
self
.
_use_spec_decode
=
False
self
.
pin_memory
=
is_pin_memory_available
()
self
.
_num_scheduled_tokens
=
torch
.
zeros
(
scheduler_config
.
max_num_seqs
,
dtype
=
torch
.
int32
,
device
=
runner
.
device
)
self
.
_num_scheduled_tokens_cpu_tensor
=
torch
.
zeros
(
(
scheduler_config
.
max_num_seqs
,
),
device
=
"cpu"
,
dtype
=
torch
.
int32
,
pin_memory
=
self
.
pin_memory
,
)
self
.
_num_scheduled_tokens_np
=
self
.
_num_scheduled_tokens_cpu_tensor
.
numpy
()
self
.
_seq_lens_minus
=
torch
.
zeros
(
scheduler_config
.
max_num_seqs
*
5
,
dtype
=
torch
.
int32
,
device
=
runner
.
device
)
self
.
_seq_lens_minus_cpu_tensor
=
torch
.
zeros
(
(
scheduler_config
.
max_num_seqs
*
5
,
),
device
=
"cpu"
,
dtype
=
torch
.
int32
,
pin_memory
=
self
.
pin_memory
,
)
self
.
_seq_lens_minus_np
=
self
.
_seq_lens_minus_cpu_tensor
.
numpy
()
self
.
use_spec_decode
=
False
self
.
num_scheduled_tokens_np
=
np
.
zeros
(
scheduler_config
.
max_num_seqs
,
dtype
=
np
.
int32
)
def
reorder_batch
(
self
,
input_batch
:
"InputBatch"
,
...
...
@@ -445,7 +423,7 @@ class MLACommonMetadataBuilder(Generic[M]):
req_idx
=
input_batch
.
req_id_to_index
[
req_id
]
num_computed_tokens
=
input_batch
.
num_computed_tokens_cpu
[
req_idx
]
num_prompt_tokens
=
input_batch
.
num_prompt_tokens
[
req_idx
]
self
.
_
num_scheduled_tokens_np
[
i
]
=
num_tokens
self
.
num_scheduled_tokens_np
[
i
]
=
num_tokens
if
num_computed_tokens
<
num_prompt_tokens
:
prefills
.
append
(
i
)
num_prefill_tokens
+=
num_tokens
...
...
@@ -478,9 +456,10 @@ class MLACommonMetadataBuilder(Generic[M]):
modified_batch
=
True
# num_scheduled_tokens also need to be swapped
tmp
=
self
.
_num_scheduled_tokens_np
[
decode_idx
]
self
.
_num_scheduled_tokens_np
[
decode_idx
]
=
self
.
_num_scheduled_tokens_np
[
prefills
[
i
-
1
]]
self
.
_num_scheduled_tokens_np
[
prefills
[
i
-
1
]]
=
tmp
tmp
=
self
.
num_scheduled_tokens_np
[
decode_idx
]
self
.
num_scheduled_tokens_np
[
decode_idx
]
=
self
.
num_scheduled_tokens_np
[
prefills
[
i
-
1
]]
self
.
num_scheduled_tokens_np
[
prefills
[
i
-
1
]]
=
tmp
# Save for next `build` call
# TODO(lucas): this is a bit of a hack, we should probably have a
...
...
@@ -490,11 +469,7 @@ class MLACommonMetadataBuilder(Generic[M]):
self
.
_num_decode_tokens
=
num_decode_tokens
self
.
_num_prefill_tokens
=
num_prefill_tokens
self
.
_use_spec_decode
=
use_spec_decode
if
use_spec_decode
:
self
.
_num_scheduled_tokens
[:
len
(
input_batch
.
req_ids
)].
copy_
(
self
.
_num_scheduled_tokens_cpu_tensor
[:
len
(
input_batch
.
req_ids
)],
non_blocking
=
True
)
self
.
use_spec_decode
=
use_spec_decode
return
modified_batch
...
...
@@ -601,31 +576,27 @@ class MLACommonMetadataBuilder(Generic[M]):
decode_metadata
=
None
if
self
.
_num_decodes
>
0
:
if
self
.
_use_spec_decode
:
# generate block_table/seq_lens of mla in spec decoding scenarios
if
common_attn_metadata
.
num_rejected_tokens_tuple
is
None
:
repeats
=
self
.
_num_scheduled_tokens
[:
self
.
_num_decodes
]
repeats_cpu
=
self
.
_num_scheduled_tokens_np
[:
self
.
_num_decodes
]
else
:
repeats
=
self
.
_num_scheduled_tokens
[:
self
.
_num_decodes
]
-
\
common_attn_metadata
.
num_rejected_tokens_tuple
[
1
][:
self
.
_num_decodes
]
num_rejected_tokens
=
common_attn_metadata
.
num_rejected_tokens_tuple
[
0
][:
self
.
_num_decodes
]
repeats_cpu
=
self
.
_num_scheduled_tokens_np
[:
self
.
_num_decodes
]
-
\
np
.
array
(
num_rejected_tokens
)
if
self
.
use_spec_decode
:
query_lens
=
self
.
num_scheduled_tokens_np
[:
self
.
_num_decodes
]
if
common_attn_metadata
.
num_rejected_tokens
is
not
None
:
num_rejected_tokens
=
common_attn_metadata
.
num_rejected_tokens
[:
self
.
_num_decodes
]
query_lens
=
query_lens
-
np
.
array
(
num_rejected_tokens
,
dtype
=
np
.
int32
)
self
.
_num_decode_tokens
-=
sum
(
num_rejected_tokens
)
cu_num_blocks
=
np
.
cumsum
(
query_lens
)
virtual_batches
=
cu_num_blocks
[
-
1
]
block_offsets
=
np
.
repeat
(
cu_num_blocks
-
query_lens
,
query_lens
)
arange
=
np
.
arange
(
virtual_batches
,
dtype
=
np
.
int32
)
-
block_offsets
rarange
=
np
.
repeat
(
query_lens
,
query_lens
)
-
arange
-
1
repeats
=
torch
.
from_numpy
(
query_lens
).
pin_memory
().
to
(
block_table_tensor
.
device
,
non_blocking
=
True
)
decode_block_table_tensor
=
torch
.
repeat_interleave
(
block_table_tensor
[:
self
.
_num_decodes
,
...],
repeats
,
dim
=
0
)
total_decode_tokens
=
np
.
sum
(
repeats_cpu
)
decode_seq_lens
=
torch
.
repeat_interleave
(
seq_lens
[:
self
.
_num_decodes
],
repeats
,
dim
=
0
)
self
.
_seq_lens_minus_np
[:
total_decode_tokens
]
=
np
.
fromiter
(
chain
.
from_iterable
(
np
.
flip
(
np
.
arange
(
x
))
for
x
in
repeats_cpu
),
dtype
=
int
)
self
.
_seq_lens_minus
[:
total_decode_tokens
].
copy_
(
self
.
_seq_lens_minus_cpu_tensor
[:
total_decode_tokens
],
non_blocking
=
True
)
decode_seq_lens
=
decode_seq_lens
-
self
.
_seq_lens_minus
[:
total_decode_tokens
]
seq_lens_minus
=
torch
.
from_numpy
(
rarange
).
to
(
torch
.
int32
).
pin_memory
().
to
(
seq_lens
.
device
,
non_blocking
=
True
)
decode_seq_lens
=
decode_seq_lens
-
seq_lens_minus
decode_metadata
=
self
.
_build_decode
(
block_table_tensor
=
decode_block_table_tensor
,
...
...
vllm/v1/attention/backends/utils.py
View file @
b16f183c
...
...
@@ -17,7 +17,7 @@ class CommonAttentionMetadata:
seq_lens
:
torch
.
Tensor
"""(batch_size,), the length of each request including both computed tokens
and newly scheduled tokens"""
num_rejected_tokens
_tuple
:
tuple
[
list
[
int
],
torch
.
Tensor
]
=
None
num_rejected_tokens
:
list
[
int
]
=
None
"""(batch_size,), record the rejected tokens number in cpu and gpu"""
def
validate_kv_sharing_target
(
current_layer_name
,
target_layer_name
,
...
...
vllm/v1/spec_decode/eagle.py
View file @
b16f183c
...
...
@@ -93,7 +93,8 @@ class EagleProposer:
# [batch_size, max_num_blocks_per_req]
block_table
:
torch
.
Tensor
,
# [batch_size]
num_rejected_tokens_tuple
:
tuple
[
list
[
int
],
torch
.
Tensor
],
num_rejected_tokens
:
list
[
int
],
# [batch_size]
sampling_metadata
:
SamplingMetadata
)
->
torch
.
Tensor
:
num_tokens
=
target_token_ids
.
shape
[
0
]
...
...
@@ -143,7 +144,7 @@ class EagleProposer:
common_attn_metadata
=
CommonAttentionMetadata
(
query_start_loc
=
cu_num_tokens
,
seq_lens
=
seq_lens
,
num_rejected_tokens
_tuple
=
num_rejected_tokens
_tuple
)
num_rejected_tokens
=
num_rejected_tokens
)
assert
self
.
runner
is
not
None
...
...
vllm/v1/worker/gpu_model_runner.py
View file @
b16f183c
...
...
@@ -1441,7 +1441,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
else
:
block_table
=
None
num_rejected_tokens
_tuple
=
None
num_rejected_tokens
=
None
if
spec_decode_metadata
is
None
:
# input_ids can be None for multimodal models.
target_token_ids
=
self
.
input_ids
[:
num_scheduled_tokens
]
...
...
@@ -1481,7 +1481,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
target_hidden_states
=
hidden_states
[
token_indices
]
target_slot_mapping
=
eagle_attn_metadata
.
slot_mapping
[
token_indices
]
num_rejected_tokens_tuple
=
(
num_rejected_tokens
,
num_rejected_tokens_tensor
)
draft_token_ids
=
self
.
drafter
.
propose
(
target_token_ids
=
target_token_ids
,
target_positions
=
target_positions
,
...
...
@@ -1491,7 +1490,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
cu_num_tokens
=
cu_num_tokens
,
block_table
=
block_table
,
sampling_metadata
=
sampling_metadata
,
num_rejected_tokens
_tuple
=
num_rejected_tokens
_tuple
num_rejected_tokens
=
num_rejected_tokens
)
spec_token_ids
=
draft_token_ids
.
tolist
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment