Unverified Commit 5d80a917 authored by Jason Zhu, committed by GitHub

Minor fix in prefill cache example (#2494)

parent 8a25d3a7
@@ -40,8 +40,16 @@ print("-" * 80)
 # -1 since the last token can change when concatenating prompts.
 prefix_pos = len(llm.llm_engine.tokenizer.encode(prefix)) - 1
 
-# Generate with prefix
-outputs = llm.generate(generating_prompts, sampling_params,
-                       prefix_pos=[prefix_pos] * len(generating_prompts))
+# The llm.generate call will batch all prompts and send the batch at once if resources allow.
+# The prefix will only be cached after the first batch is processed, so we need to call generate once
+# to calculate the prefix and cache it.
+outputs = llm.generate(generating_prompts[0],
+                       sampling_params,
+                       prefix_pos=[prefix_pos])
+
+# Subsequent batches can leverage the cached prefix
+outputs = llm.generate(generating_prompts,
+                       sampling_params,
+                       prefix_pos=[prefix_pos] * len(generating_prompts))
 
 # Print the outputs. You should see the same outputs as before
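For context, below is a minimal standalone sketch of how the fixed example would be run end to end, assuming a vLLM release that still exposes the prefix_pos argument of LLM.generate (later versions dropped it in favor of automatic prefix caching). The model name, prefix text, and prompts are illustrative and are not taken from the commit.

from vllm import LLM, SamplingParams

# Shared prefix that every prompt starts with (illustrative text).
prefix = ("You are an expert school principal. Answer the question below "
          "in one short paragraph.\n\nQuestion: ")
questions = [
    "What qualities make a good teacher?",
    "How should homework be graded?",
]
generating_prompts = [prefix + q for q in questions]

sampling_params = SamplingParams(temperature=0.0)
llm = LLM(model="facebook/opt-125m")  # illustrative model choice

# -1 since the last token can change when concatenating prompts.
prefix_pos = len(llm.llm_engine.tokenizer.encode(prefix)) - 1

# Warm-up call: process a single prompt so the prefix KV blocks are computed and cached.
llm.generate(generating_prompts[0], sampling_params, prefix_pos=[prefix_pos])

# The full batch can now reuse the cached prefix for every prompt.
outputs = llm.generate(generating_prompts,
                       sampling_params,
                       prefix_pos=[prefix_pos] * len(generating_prompts))
for output in outputs:
    print(output.outputs[0].text)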