[Misc] refactor examples series - lmcache (#16758)

Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>

[Misc] refactor examples series - lmcache (#16758)
Signed-off-by: reidliu41 <reid201711@gmail.com> Co-authored-by: reidliu41 <reid201711@gmail.com>
99ed5261 · Reid · GitHub · 207da281 · 99ed5261
Unverified Commit 99ed5261 authored Apr 17, 2025 by Reid Committed by GitHub Apr 17, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 85 additions and 50 deletions

examples/offline_inference/cpu_offload_lmcache.py examples/offline_inference/cpu_offload_lmcache.py +85 -50

No files found.
--- a/examples/offline_inference/cpu_offload_lmcache.py
+++ b/examples/offline_inference/cpu_offload_lmcache.py
@@ -3,9 +3,12 @@
 This file demonstrates the example usage of cpu offloading
 with LMCache.
-Note that `pip install lmcache` is needed to run this example.
+Note that `lmcache` is needed to run this example.
-Learn more about LMCache in https://github.com/LMCache/LMCache.
+Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1
+To learn more about LMCache environment setup, please refer to:
+https://docs.lmcache.ai/getting_started/installation.html
 """
+import contextlib
 import os
 import time
@@ -15,51 +18,83 @@ from lmcache.integration.vllm.utils import ENGINE_NAME
 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig
-# LMCache-related environment variables
-# Use experimental features in LMCache
+def setup_environment_variables():
-os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
+    # LMCache-related environment variables
-# LMCache is set to use 256 tokens per chunk
+    # Use experimental features in LMCache
-os.environ["LMCACHE_CHUNK_SIZE"] = "256"
+    os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
-# Enable local CPU backend in LMCache
+    # LMCache is set to use 256 tokens per chunk
-os.environ["LMCACHE_LOCAL_CPU"] = "True"
+    os.environ["LMCACHE_CHUNK_SIZE"] = "256"
-# Set local CPU memory limit to 5.0 GB
+    # Enable local CPU backend in LMCache
-os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
+    os.environ["LMCACHE_LOCAL_CPU"] = "True"
+    # Set local CPU memory limit to 5.0 GB
-# This example script runs two requests with a shared prefix.
+    os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
-shared_prompt = "Hello, how are you?" * 1000
-first_prompt = [
-    shared_prompt + "Hello, my name is",
+@contextlib.contextmanager
-]
+def build_llm_with_lmcache():
-second_prompt = [
+    ktc = KVTransferConfig.from_cli(
-    shared_prompt + "Tell me a very long story",
+        '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
-]
+    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
+    # memory. Reduce the value if your GPU has less memory.
-sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
+    # Note that LMCache is not compatible with chunked prefill for now.
+    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
-ktc = KVTransferConfig.from_cli(
+              kv_transfer_config=ktc,
-    '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
+              max_model_len=8000,
-# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
+              enable_chunked_prefill=False,
-# memory. Reduce the value if your GPU has less memory.
+              gpu_memory_utilization=0.8)
-# Note that LMCache is not compatible with chunked prefill for now.
-llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
+    try:
-          kv_transfer_config=ktc,
+        yield llm
-          max_model_len=8000,
+    finally:
-          enable_chunked_prefill=False,
+        # Clean up lmcache backend
-          gpu_memory_utilization=0.8)
+        LMCacheEngineBuilder.destroy(ENGINE_NAME)
-outputs = llm.generate(first_prompt, sampling_params)
-for output in outputs:
+def print_output(
-    generated_text = output.outputs[0].text
+    llm: LLM,
-    print(f"Generated text: {generated_text!r}")
+    prompt: list[str],
-print("First request done.")
+    sampling_params: SamplingParams,
+    req_str: str,
-time.sleep(1)
+):
+    start = time.time()
-outputs = llm.generate(second_prompt, sampling_params)
+    outputs = llm.generate(prompt, sampling_params)
-for output in outputs:
+    print("-" * 50)
-    generated_text = output.outputs[0].text
+    for output in outputs:
-    print(f"Generated text: {generated_text!r}")
+        generated_text = output.outputs[0].text
-print("Second request done.")
+        print(f"Generated text: {generated_text!r}")
+    print(f"Generation took {time.time() - start:.2f} seconds, "
-# Clean up lmcache backend
+          f"{req_str} request done.")
-LMCacheEngineBuilder.destroy(ENGINE_NAME)
+    print("-" * 50)
+def main():
+    setup_environment_variables()
+    with build_llm_with_lmcache() as llm:
+        # This example script runs two requests with a shared prefix.
+        # Define the shared prompt and specific prompts
+        shared_prompt = "Hello, how are you?" * 1000
+        first_prompt = [
+            shared_prompt + "Hello, my name is",
+        ]
+        second_prompt = [
+            shared_prompt + "Tell me a very long story",
+        ]
+        sampling_params = SamplingParams(temperature=0,
+                                         top_p=0.95,
+                                         max_tokens=10)
+        # Print the first output
+        print_output(llm, first_prompt, sampling_params, "first")
+        time.sleep(1)
+        # Print the second output
+        print_output(llm, second_prompt, sampling_params, "second")
+if __name__ == "__main__":
+    main()