Unverified Commit 99ed5261 authored by Reid's avatar Reid Committed by GitHub
Browse files

[Misc] refactor examples series - lmcache (#16758)


Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
parent 207da281
...@@ -3,9 +3,12 @@ ...@@ -3,9 +3,12 @@
This file demonstrates the example usage of cpu offloading This file demonstrates the example usage of cpu offloading
with LMCache. with LMCache.
Note that `pip install lmcache` is needed to run this example. Note that `lmcache` is needed to run this example.
Learn more about LMCache in https://github.com/LMCache/LMCache. Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1
To learn more about LMCache environment setup, please refer to:
https://docs.lmcache.ai/getting_started/installation.html
""" """
import contextlib
import os import os
import time import time
...@@ -15,51 +18,83 @@ from lmcache.integration.vllm.utils import ENGINE_NAME ...@@ -15,51 +18,83 @@ from lmcache.integration.vllm.utils import ENGINE_NAME
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig from vllm.config import KVTransferConfig
# LMCache-related environment variables
# Use experimental features in LMCache def setup_environment_variables():
os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True" # LMCache-related environment variables
# LMCache is set to use 256 tokens per chunk # Use experimental features in LMCache
os.environ["LMCACHE_CHUNK_SIZE"] = "256" os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
# Enable local CPU backend in LMCache # LMCache is set to use 256 tokens per chunk
os.environ["LMCACHE_LOCAL_CPU"] = "True" os.environ["LMCACHE_CHUNK_SIZE"] = "256"
# Set local CPU memory limit to 5.0 GB # Enable local CPU backend in LMCache
os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0" os.environ["LMCACHE_LOCAL_CPU"] = "True"
# Set local CPU memory limit to 5.0 GB
# This example script runs two requests with a shared prefix. os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
shared_prompt = "Hello, how are you?" * 1000
first_prompt = [
shared_prompt + "Hello, my name is", @contextlib.contextmanager
] def build_llm_with_lmcache():
second_prompt = [ ktc = KVTransferConfig.from_cli(
shared_prompt + "Tell me a very long story", '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
] # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
# memory. Reduce the value if your GPU has less memory.
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) # Note that LMCache is not compatible with chunked prefill for now.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
ktc = KVTransferConfig.from_cli( kv_transfer_config=ktc,
'{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}') max_model_len=8000,
# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB enable_chunked_prefill=False,
# memory. Reduce the value if your GPU has less memory. gpu_memory_utilization=0.8)
# Note that LMCache is not compatible with chunked prefill for now.
llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2", try:
kv_transfer_config=ktc, yield llm
max_model_len=8000, finally:
enable_chunked_prefill=False, # Clean up lmcache backend
gpu_memory_utilization=0.8) LMCacheEngineBuilder.destroy(ENGINE_NAME)
outputs = llm.generate(first_prompt, sampling_params)
for output in outputs: def print_output(
generated_text = output.outputs[0].text llm: LLM,
print(f"Generated text: {generated_text!r}") prompt: list[str],
print("First request done.") sampling_params: SamplingParams,
req_str: str,
time.sleep(1) ):
start = time.time()
outputs = llm.generate(second_prompt, sampling_params) outputs = llm.generate(prompt, sampling_params)
for output in outputs: print("-" * 50)
generated_text = output.outputs[0].text for output in outputs:
print(f"Generated text: {generated_text!r}") generated_text = output.outputs[0].text
print("Second request done.") print(f"Generated text: {generated_text!r}")
print(f"Generation took {time.time() - start:.2f} seconds, "
# Clean up lmcache backend f"{req_str} request done.")
LMCacheEngineBuilder.destroy(ENGINE_NAME) print("-" * 50)
def main() -> None:
    """Run two generation requests that share a long common prefix.

    Sets the LMCache environment variables, builds the LLM inside the
    ``build_llm_with_lmcache`` context manager, and then issues two
    requests through ``print_output``: both prompts start with the same
    repeated prefix and differ only in their final sentence.
    """
    setup_environment_variables()

    with build_llm_with_lmcache() as llm:
        # This example script runs two requests with a shared prefix.
        # Define the shared prompt and specific prompts
        shared_prompt = "Hello, how are you?" * 1000
        first_prompt = [
            shared_prompt + "Hello, my name is",
        ]
        second_prompt = [
            shared_prompt + "Tell me a very long story",
        ]

        sampling_params = SamplingParams(temperature=0,
                                         top_p=0.95,
                                         max_tokens=10)

        # Print the first output
        print_output(llm, first_prompt, sampling_params, "first")

        # Pause between the two requests.
        # NOTE(review): presumably lets the first request's cache writes
        # complete before the prefix is requested again -- confirm.
        time.sleep(1)

        # Print the second output
        print_output(llm, second_prompt, sampling_params, "second")
# Run the example only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment