[Bugfix] Fix offline_inference_with_prefix.py (#9505)

ae8b633b · Tyler Michael Smith · GitHub · 1bbbcc0b · ae8b633b
Unverified Commit ae8b633b authored Oct 18, 2024 by Tyler Michael Smith Committed by GitHub Oct 18, 2024
Show whitespace changes
Inline Side-by-side

Showing with 4 additions and 2 deletions

examples/offline_inference_with_prefix.py examples/offline_inference_with_prefix.py +4 -2

No files found.
--- a/examples/offline_inference_with_prefix.py
+++ b/examples/offline_inference_with_prefix.py
@@ -29,11 +29,13 @@ generating_prompts = [prefix + prompt for prompt in prompts]
 sampling_params = SamplingParams(temperature=0.0)

 # Create an LLM.
-regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
+regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.3)

+# The second LLM needs to request a higher gpu_memory_utilization because
+# the first LLM has already allocated a full 30% of the GPU memory.
 prefix_cached_llm = LLM(model="facebook/opt-125m",
                        enable_prefix_caching=True,
-                        gpu_memory_utilization=0.4)
+                        gpu_memory_utilization=0.6)
 print("Results without `enable_prefix_caching`")

 # Generate texts from the prompts. The output is a list of RequestOutput objects