Unverified commit e7ebecf8, authored by Liangsheng Yin, committed by GitHub
Browse files

Fix cache hit rate when chunked prefill (#2555)

parent 9a23c484
...@@ -248,7 +248,7 @@ class PrefillAdder: ...@@ -248,7 +248,7 @@ class PrefillAdder:
self.can_run_list.append(req) self.can_run_list.append(req)
self._prefill_one_req( self._prefill_one_req(
len(req.prefix_indices), 0,
req.extend_input_len, req.extend_input_len,
( (
min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION) min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION)
......
...@@ -629,16 +629,13 @@ class Scheduler: ...@@ -629,16 +629,13 @@ class Scheduler:
self.waiting_queue.append(req) self.waiting_queue.append(req)
def log_prefill_stats(self, adder, can_run_list, running_bs, has_being_chunked): def log_prefill_stats(self, adder, can_run_list, running_bs, has_being_chunked):
if isinstance(self.tree_cache, RadixCache): self.tree_cache_metrics["total"] += (
self.tree_cache_metrics["total"] += ( adder.log_input_tokens + adder.log_hit_tokens
adder.log_input_tokens + adder.log_hit_tokens ) / 10**9
) / 10**9 self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9
self.tree_cache_metrics["hit"] += (adder.log_hit_tokens) / 10**9 tree_cache_hit_rate = (
tree_cache_hit_rate = ( self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"] )
)
else:
tree_cache_hit_rate = 0.0
num_used = self.max_total_num_tokens - ( num_used = self.max_total_num_tokens - (
self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size() self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment