Unverified commit e7ebecf8, authored by Liangsheng Yin, committed by GitHub
Browse files

Fix cache hit rate when chunked prefill (#2555)

parent 9a23c484
......@@ -248,7 +248,7 @@ class PrefillAdder:
self.can_run_list.append(req)
self._prefill_one_req(
len(req.prefix_indices),
0,
req.extend_input_len,
(
min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION)
......
......@@ -629,7 +629,6 @@ class Scheduler:
self.waiting_queue.append(req)
def log_prefill_stats(self, adder, can_run_list, running_bs, has_being_chunked):
if isinstance(self.tree_cache, RadixCache):
self.tree_cache_metrics["total"] += (
adder.log_input_tokens + adder.log_hit_tokens
) / 10**9
......@@ -637,8 +636,6 @@ class Scheduler:
tree_cache_hit_rate = (
self.tree_cache_metrics["hit"] / self.tree_cache_metrics["total"]
)
else:
tree_cache_hit_rate = 0.0
num_used = self.max_total_num_tokens - (
self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment