Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
c44fcded
Commit
c44fcded
authored
Feb 24, 2026
by
laibao
Browse files
feat(kvpress): 调度输出透传 num_kv_tokens
parent
cafabeeb
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
11 additions
and
0 deletions
+11
-0
vllm/v1/core/sched/output.py
vllm/v1/core/sched/output.py
+7
-0
vllm/v1/request.py
vllm/v1/request.py
+4
-0
No files found.
vllm/v1/core/sched/output.py
View file @
c44fcded
...
@@ -39,6 +39,7 @@ class NewRequestData:
...
@@ -39,6 +39,7 @@ class NewRequestData:
pooling_params
:
PoolingParams
|
None
pooling_params
:
PoolingParams
|
None
block_ids
:
tuple
[
list
[
int
],
...]
block_ids
:
tuple
[
list
[
int
],
...]
num_computed_tokens
:
int
num_computed_tokens
:
int
num_kv_tokens
:
int
lora_request
:
LoRARequest
|
None
lora_request
:
LoRARequest
|
None
prompt_embeds
:
"torch.Tensor | None"
=
None
prompt_embeds
:
"torch.Tensor | None"
=
None
...
@@ -60,6 +61,7 @@ class NewRequestData:
...
@@ -60,6 +61,7 @@ class NewRequestData:
pooling_params
=
request
.
pooling_params
,
pooling_params
=
request
.
pooling_params
,
block_ids
=
block_ids
,
block_ids
=
block_ids
,
num_computed_tokens
=
request
.
num_computed_tokens
,
num_computed_tokens
=
request
.
num_computed_tokens
,
num_kv_tokens
=
request
.
num_kv_tokens
,
lora_request
=
request
.
lora_request
,
lora_request
=
request
.
lora_request
,
prompt_embeds
=
request
.
prompt_embeds
,
prompt_embeds
=
request
.
prompt_embeds
,
prefill_token_ids
=
prefill_token_ids
,
prefill_token_ids
=
prefill_token_ids
,
...
@@ -78,6 +80,7 @@ class NewRequestData:
...
@@ -78,6 +80,7 @@ class NewRequestData:
f
"sampling_params=
{
self
.
sampling_params
}
,"
f
"sampling_params=
{
self
.
sampling_params
}
,"
f
"block_ids=
{
self
.
block_ids
}
,"
f
"block_ids=
{
self
.
block_ids
}
,"
f
"num_computed_tokens=
{
self
.
num_computed_tokens
}
,"
f
"num_computed_tokens=
{
self
.
num_computed_tokens
}
,"
f
"num_kv_tokens=
{
self
.
num_kv_tokens
}
,"
f
"lora_request=
{
self
.
lora_request
}
,"
f
"lora_request=
{
self
.
lora_request
}
,"
f
"prompt_embeds_shape=
{
prompt_embeds_shape
}
"
f
"prompt_embeds_shape=
{
prompt_embeds_shape
}
"
")"
")"
...
@@ -103,6 +106,7 @@ class NewRequestData:
...
@@ -103,6 +106,7 @@ class NewRequestData:
f
"sampling_params=
{
self
.
sampling_params
}
,"
f
"sampling_params=
{
self
.
sampling_params
}
,"
f
"block_ids=
{
self
.
block_ids
}
,"
f
"block_ids=
{
self
.
block_ids
}
,"
f
"num_computed_tokens=
{
self
.
num_computed_tokens
}
,"
f
"num_computed_tokens=
{
self
.
num_computed_tokens
}
,"
f
"num_kv_tokens=
{
self
.
num_kv_tokens
}
,"
f
"lora_request=
{
self
.
lora_request
}
,"
f
"lora_request=
{
self
.
lora_request
}
,"
f
"prompt_embeds_shape=
{
prompt_embeds_shape
}
"
f
"prompt_embeds_shape=
{
prompt_embeds_shape
}
"
")"
")"
...
@@ -125,6 +129,7 @@ class CachedRequestData:
...
@@ -125,6 +129,7 @@ class CachedRequestData:
all_token_ids
:
dict
[
str
,
list
[
int
]]
all_token_ids
:
dict
[
str
,
list
[
int
]]
new_block_ids
:
list
[
tuple
[
list
[
int
],
...]
|
None
]
new_block_ids
:
list
[
tuple
[
list
[
int
],
...]
|
None
]
num_computed_tokens
:
list
[
int
]
num_computed_tokens
:
list
[
int
]
num_kv_tokens
:
list
[
int
]
num_output_tokens
:
list
[
int
]
num_output_tokens
:
list
[
int
]
# Version of dataclass repr with token IDs obfuscated.
# Version of dataclass repr with token IDs obfuscated.
...
@@ -141,6 +146,7 @@ class CachedRequestData:
...
@@ -141,6 +146,7 @@ class CachedRequestData:
f
"all_token_ids_lens=
{
all_token_ids_lens
}
,"
f
"all_token_ids_lens=
{
all_token_ids_lens
}
,"
f
"new_block_ids=
{
self
.
new_block_ids
}
,"
f
"new_block_ids=
{
self
.
new_block_ids
}
,"
f
"num_computed_tokens=
{
self
.
num_computed_tokens
}
,"
f
"num_computed_tokens=
{
self
.
num_computed_tokens
}
,"
f
"num_kv_tokens=
{
self
.
num_kv_tokens
}
,"
f
"num_output_tokens=
{
self
.
num_output_tokens
}
"
f
"num_output_tokens=
{
self
.
num_output_tokens
}
"
f
")"
f
")"
)
)
...
@@ -175,6 +181,7 @@ class CachedRequestData:
...
@@ -175,6 +181,7 @@ class CachedRequestData:
all_token_ids
=
{},
all_token_ids
=
{},
new_block_ids
=
[],
new_block_ids
=
[],
num_computed_tokens
=
[],
num_computed_tokens
=
[],
num_kv_tokens
=
[],
num_output_tokens
=
[],
num_output_tokens
=
[],
)
)
...
...
vllm/v1/request.py
View file @
c44fcded
...
@@ -131,6 +131,10 @@ class Request:
...
@@ -131,6 +131,10 @@ class Request:
self
.
spec_token_ids
:
list
[
int
]
=
[]
self
.
spec_token_ids
:
list
[
int
]
=
[]
self
.
num_computed_tokens
=
0
self
.
num_computed_tokens
=
0
# Number of tokens currently stored in the KV cache for this request.
# This may differ from `num_computed_tokens` when KV compression is
# enabled (e.g., token-shared prefill compression).
self
.
num_kv_tokens
=
0
self
.
cache_salt
:
str
|
None
=
cache_salt
self
.
cache_salt
:
str
|
None
=
cache_salt
# Multi-modal related
# Multi-modal related
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment