"docs/vscode:/vscode.git/clone" did not exist on "93702e445622e9710b0cb154ca747f21bdfc52de"
Commit c44fcded authored by laibao
Browse files

feat(kvpress): 调度输出透传 num_kv_tokens

parent cafabeeb
......@@ -39,6 +39,7 @@ class NewRequestData:
pooling_params: PoolingParams | None
block_ids: tuple[list[int], ...]
num_computed_tokens: int
num_kv_tokens: int
lora_request: LoRARequest | None
prompt_embeds: "torch.Tensor | None" = None
......@@ -60,6 +61,7 @@ class NewRequestData:
pooling_params=request.pooling_params,
block_ids=block_ids,
num_computed_tokens=request.num_computed_tokens,
num_kv_tokens=request.num_kv_tokens,
lora_request=request.lora_request,
prompt_embeds=request.prompt_embeds,
prefill_token_ids=prefill_token_ids,
......@@ -78,6 +80,7 @@ class NewRequestData:
f"sampling_params={self.sampling_params},"
f"block_ids={self.block_ids},"
f"num_computed_tokens={self.num_computed_tokens},"
f"num_kv_tokens={self.num_kv_tokens},"
f"lora_request={self.lora_request},"
f"prompt_embeds_shape={prompt_embeds_shape}"
")"
......@@ -103,6 +106,7 @@ class NewRequestData:
f"sampling_params={self.sampling_params},"
f"block_ids={self.block_ids},"
f"num_computed_tokens={self.num_computed_tokens},"
f"num_kv_tokens={self.num_kv_tokens},"
f"lora_request={self.lora_request},"
f"prompt_embeds_shape={prompt_embeds_shape}"
")"
......@@ -125,6 +129,7 @@ class CachedRequestData:
all_token_ids: dict[str, list[int]]
new_block_ids: list[tuple[list[int], ...] | None]
num_computed_tokens: list[int]
num_kv_tokens: list[int]
num_output_tokens: list[int]
# Version of dataclass repr with token IDs obfuscated.
......@@ -141,6 +146,7 @@ class CachedRequestData:
f"all_token_ids_lens={all_token_ids_lens},"
f"new_block_ids={self.new_block_ids},"
f"num_computed_tokens={self.num_computed_tokens},"
f"num_kv_tokens={self.num_kv_tokens},"
f"num_output_tokens={self.num_output_tokens}"
f")"
)
......@@ -175,6 +181,7 @@ class CachedRequestData:
all_token_ids={},
new_block_ids=[],
num_computed_tokens=[],
num_kv_tokens=[],
num_output_tokens=[],
)
......
......@@ -131,6 +131,10 @@ class Request:
self.spec_token_ids: list[int] = []
self.num_computed_tokens = 0
# Number of tokens currently stored in the KV cache for this request.
# This may differ from `num_computed_tokens` when KV compression is
# enabled (e.g., token-shared prefill compression).
self.num_kv_tokens = 0
self.cache_salt: str | None = cache_salt
# Multi-modal related
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment