Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ea53ca5e
Unverified
Commit
ea53ca5e
authored
Jan 01, 2026
by
Benjamin Chislett
Committed by
GitHub
Jan 01, 2026
Browse files
[Bugfix] Fix block size used in EAGLE slot mapping (#31540)
Signed-off-by:
Benjamin Chislett
<
bchislett@nvidia.com
>
parent
27864a85
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
8 additions
and
9 deletions
+8
-9
vllm/v1/spec_decode/eagle.py
vllm/v1/spec_decode/eagle.py
+8
-9
No files found.
vllm/v1/spec_decode/eagle.py
View file @
ea53ca5e
...
@@ -71,7 +71,6 @@ class EagleProposer:
...
@@ -71,7 +71,6 @@ class EagleProposer:
self
.
device
=
device
self
.
device
=
device
self
.
dtype
=
vllm_config
.
model_config
.
dtype
self
.
dtype
=
vllm_config
.
model_config
.
dtype
self
.
max_model_len
=
vllm_config
.
model_config
.
max_model_len
self
.
max_model_len
=
vllm_config
.
model_config
.
max_model_len
self
.
block_size
=
vllm_config
.
cache_config
.
block_size
self
.
dp_rank
=
vllm_config
.
parallel_config
.
data_parallel_rank
self
.
dp_rank
=
vllm_config
.
parallel_config
.
data_parallel_rank
self
.
num_speculative_tokens
=
self
.
speculative_config
.
num_speculative_tokens
self
.
num_speculative_tokens
=
self
.
speculative_config
.
num_speculative_tokens
self
.
max_num_tokens
=
vllm_config
.
scheduler_config
.
max_num_batched_tokens
self
.
max_num_tokens
=
vllm_config
.
scheduler_config
.
max_num_batched_tokens
...
@@ -470,22 +469,23 @@ class EagleProposer:
...
@@ -470,22 +469,23 @@ class EagleProposer:
common_attn_metadata
.
_num_computed_tokens_cpu
+=
1
common_attn_metadata
.
_num_computed_tokens_cpu
+=
1
# Compute the slot mapping.
# Compute the slot mapping.
block_size
=
attn_metadata_builder
.
kv_cache_spec
.
block_size
if
self
.
uses_mrope
:
if
self
.
uses_mrope
:
# all dimensions of positions are the same
# all dimensions of positions are the same
block_numbers
=
clamped_positions
[
0
]
//
self
.
block_size
block_numbers
=
clamped_positions
[
0
]
//
block_size
else
:
else
:
block_numbers
=
clamped_positions
//
self
.
block_size
block_numbers
=
clamped_positions
//
block_size
block_ids
=
common_attn_metadata
.
block_table_tensor
.
gather
(
block_ids
=
common_attn_metadata
.
block_table_tensor
.
gather
(
dim
=
1
,
index
=
block_numbers
.
view
(
-
1
,
1
)
dim
=
1
,
index
=
block_numbers
.
view
(
-
1
,
1
)
)
)
block_ids
=
block_ids
.
view
(
-
1
)
block_ids
=
block_ids
.
view
(
-
1
)
if
self
.
uses_mrope
:
if
self
.
uses_mrope
:
common_attn_metadata
.
slot_mapping
=
(
common_attn_metadata
.
slot_mapping
=
(
block_ids
*
self
.
block_size
+
clamped_positions
[
0
]
%
self
.
block_size
block_ids
*
block_size
+
clamped_positions
[
0
]
%
block_size
)
)
else
:
else
:
common_attn_metadata
.
slot_mapping
=
(
common_attn_metadata
.
slot_mapping
=
(
block_ids
*
self
.
block_size
+
clamped_positions
%
self
.
block_size
block_ids
*
block_size
+
clamped_positions
%
block_size
)
)
# Mask out the slot mappings that exceed the max model length.
# Mask out the slot mappings that exceed the max model length.
# Otherwise, the KV cache will be inadvertently updated with the
# Otherwise, the KV cache will be inadvertently updated with the
...
@@ -800,12 +800,11 @@ class EagleProposer:
...
@@ -800,12 +800,11 @@ class EagleProposer:
attn_metadata
.
seq_lens
.
masked_fill_
(
exceeds_max_model_len
,
1
)
attn_metadata
.
seq_lens
.
masked_fill_
(
exceeds_max_model_len
,
1
)
# Compute the slot mapping.
# Compute the slot mapping.
block_size
=
tree_attn_metadata_builder
.
kv_cache_spec
.
block_size
query_positions
=
flattened_draft_positions
[:,
level
:
level
+
query_len
]
query_positions
=
flattened_draft_positions
[:,
level
:
level
+
query_len
]
block_numbers
=
query_positions
//
self
.
block_size
block_numbers
=
query_positions
//
block_size
block_ids
=
attn_metadata
.
block_table
.
gather
(
dim
=
1
,
index
=
block_numbers
)
block_ids
=
attn_metadata
.
block_table
.
gather
(
dim
=
1
,
index
=
block_numbers
)
slot_mapping
=
(
slot_mapping
=
block_ids
*
block_size
+
query_positions
%
block_size
block_ids
*
self
.
block_size
+
query_positions
%
self
.
block_size
)
# Mask out the slot mappings that exceed the max model length.
# Mask out the slot mappings that exceed the max model length.
# Otherwise, the KV cache will be inadvertently updated with the
# Otherwise, the KV cache will be inadvertently updated with the
# padding tokens.
# padding tokens.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment