[Bugfix] Add warmup for prefix caching example (#5235)

bd0e7802 · Zhuohan Li · GitHub · 06b2550c · bd0e7802
Unverified commit bd0e7802, authored Jun 03, 2024 by Zhuohan Li; committed by GitHub on Jun 03, 2024.
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 2 deletions

examples/offline_inference_with_prefix.py examples/offline_inference_with_prefix.py +4 -2

No files found.
--- a/examples/offline_inference_with_prefix.py
+++ b/examples/offline_inference_with_prefix.py
@@ -51,8 +51,10 @@ for output in outputs:

 print("-" * 80)

-# The llm.generate call will batch all prompts and send the batch at once
-# if resources allow.
+# Warmup so that the shared prompt's KV cache is computed.
+prefix_cached_llm.generate(generating_prompts[0], sampling_params)
+
+# Generate with prefix caching.
 start_time_cached = time()
 outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
 duration_cached = time() - start_time_cached