Commit c44fcded authored by laibao
Browse files

feat(kvpress): 调度输出透传 num_kv_tokens

parent cafabeeb
...@@ -39,6 +39,7 @@ class NewRequestData: ...@@ -39,6 +39,7 @@ class NewRequestData:
pooling_params: PoolingParams | None pooling_params: PoolingParams | None
block_ids: tuple[list[int], ...] block_ids: tuple[list[int], ...]
num_computed_tokens: int num_computed_tokens: int
num_kv_tokens: int
lora_request: LoRARequest | None lora_request: LoRARequest | None
prompt_embeds: "torch.Tensor | None" = None prompt_embeds: "torch.Tensor | None" = None
...@@ -60,6 +61,7 @@ class NewRequestData: ...@@ -60,6 +61,7 @@ class NewRequestData:
pooling_params=request.pooling_params, pooling_params=request.pooling_params,
block_ids=block_ids, block_ids=block_ids,
num_computed_tokens=request.num_computed_tokens, num_computed_tokens=request.num_computed_tokens,
num_kv_tokens=request.num_kv_tokens,
lora_request=request.lora_request, lora_request=request.lora_request,
prompt_embeds=request.prompt_embeds, prompt_embeds=request.prompt_embeds,
prefill_token_ids=prefill_token_ids, prefill_token_ids=prefill_token_ids,
...@@ -78,6 +80,7 @@ class NewRequestData: ...@@ -78,6 +80,7 @@ class NewRequestData:
f"sampling_params={self.sampling_params}," f"sampling_params={self.sampling_params},"
f"block_ids={self.block_ids}," f"block_ids={self.block_ids},"
f"num_computed_tokens={self.num_computed_tokens}," f"num_computed_tokens={self.num_computed_tokens},"
f"num_kv_tokens={self.num_kv_tokens},"
f"lora_request={self.lora_request}," f"lora_request={self.lora_request},"
f"prompt_embeds_shape={prompt_embeds_shape}" f"prompt_embeds_shape={prompt_embeds_shape}"
")" ")"
...@@ -103,6 +106,7 @@ class NewRequestData: ...@@ -103,6 +106,7 @@ class NewRequestData:
f"sampling_params={self.sampling_params}," f"sampling_params={self.sampling_params},"
f"block_ids={self.block_ids}," f"block_ids={self.block_ids},"
f"num_computed_tokens={self.num_computed_tokens}," f"num_computed_tokens={self.num_computed_tokens},"
f"num_kv_tokens={self.num_kv_tokens},"
f"lora_request={self.lora_request}," f"lora_request={self.lora_request},"
f"prompt_embeds_shape={prompt_embeds_shape}" f"prompt_embeds_shape={prompt_embeds_shape}"
")" ")"
...@@ -125,6 +129,7 @@ class CachedRequestData: ...@@ -125,6 +129,7 @@ class CachedRequestData:
all_token_ids: dict[str, list[int]] all_token_ids: dict[str, list[int]]
new_block_ids: list[tuple[list[int], ...] | None] new_block_ids: list[tuple[list[int], ...] | None]
num_computed_tokens: list[int] num_computed_tokens: list[int]
num_kv_tokens: list[int]
num_output_tokens: list[int] num_output_tokens: list[int]
# Version of dataclass repr with token IDs obfuscated. # Version of dataclass repr with token IDs obfuscated.
...@@ -141,6 +146,7 @@ class CachedRequestData: ...@@ -141,6 +146,7 @@ class CachedRequestData:
f"all_token_ids_lens={all_token_ids_lens}," f"all_token_ids_lens={all_token_ids_lens},"
f"new_block_ids={self.new_block_ids}," f"new_block_ids={self.new_block_ids},"
f"num_computed_tokens={self.num_computed_tokens}," f"num_computed_tokens={self.num_computed_tokens},"
f"num_kv_tokens={self.num_kv_tokens},"
f"num_output_tokens={self.num_output_tokens}" f"num_output_tokens={self.num_output_tokens}"
f")" f")"
) )
...@@ -175,6 +181,7 @@ class CachedRequestData: ...@@ -175,6 +181,7 @@ class CachedRequestData:
all_token_ids={}, all_token_ids={},
new_block_ids=[], new_block_ids=[],
num_computed_tokens=[], num_computed_tokens=[],
num_kv_tokens=[],
num_output_tokens=[], num_output_tokens=[],
) )
......
...@@ -131,6 +131,10 @@ class Request: ...@@ -131,6 +131,10 @@ class Request:
self.spec_token_ids: list[int] = [] self.spec_token_ids: list[int] = []
self.num_computed_tokens = 0 self.num_computed_tokens = 0
# Number of tokens currently stored in the KV cache for this request.
# This may differ from `num_computed_tokens` when KV compression is
# enabled (e.g., token-shared prefill compression).
self.num_kv_tokens = 0
self.cache_salt: str | None = cache_salt self.cache_salt: str | None = cache_salt
# Multi-modal related # Multi-modal related
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment