"vscode:/vscode.git/clone" did not exist on "81b8af8d36c7967933b576054bed94e7c18c5fec"
Unverified commit b7c335a4, authored by Ziqi Fan, committed by GitHub
Browse files

chore: rm --enable-kvbm in docker build since KVBM is in by default (#3775)


Signed-off-by: Ziqi Fan <ziqif@nvidia.com>
parent 0c87b00f
...@@ -34,13 +34,13 @@ To learn what KVBM is, please check [here](kvbm_architecture.md) ...@@ -34,13 +34,13 @@ To learn what KVBM is, please check [here](kvbm_architecture.md)
To use KVBM in TensorRT-LLM, you can follow the steps below: To use KVBM in TensorRT-LLM, you can follow the steps below:
```bash ```bash
# start up etcd for KVBM leader/worker registration and discovery # Start up etcd for KVBM leader/worker registration and discovery
docker compose -f deploy/docker-compose.yml up -d docker compose -f deploy/docker-compose.yml up -d
# Build a container that includes TensorRT-LLM and KVBM. # Build a dynamo TRTLLM container (KVBM is built in by default)
./container/build.sh --framework trtllm --enable-kvbm ./container/build.sh --framework trtllm
# launch the container # Launch the container
./container/run.sh --framework trtllm -it --mount-workspace --use-nixl-gds ./container/run.sh --framework trtllm -it --mount-workspace --use-nixl-gds
# Configure KVBM cache tiers (choose one of the following options): # Configure KVBM cache tiers (choose one of the following options):
...@@ -67,8 +67,8 @@ export DYN_KVBM_DISK_CACHE_GB=8 ...@@ -67,8 +67,8 @@ export DYN_KVBM_DISK_CACHE_GB=8
# 1200 means 1200 seconds timeout # 1200 means 1200 seconds timeout
export DYN_KVBM_LEADER_WORKER_INIT_TIMEOUT_SECS=1200 export DYN_KVBM_LEADER_WORKER_INIT_TIMEOUT_SECS=1200
# enable disk zerofill fallback for KVBM # Enable disk zerofill fallback for KVBM
# set to true to enable fallback behavior when disk operations fail # Set to true to enable fallback behavior when disk operations fail
export DYN_KVBM_DISK_ZEROFILL_FALLBACK=true export DYN_KVBM_DISK_ZEROFILL_FALLBACK=true
``` ```
...@@ -101,7 +101,7 @@ python3 -m dynamo.trtllm \ ...@@ -101,7 +101,7 @@ python3 -m dynamo.trtllm \
--served-model-name Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \
--extra-engine-args /tmp/kvbm_llm_api_config.yaml & --extra-engine-args /tmp/kvbm_llm_api_config.yaml &
# make a call to LLM # Make a call to LLM
curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "Qwen/Qwen3-0.6B", "model": "Qwen/Qwen3-0.6B",
"messages": [ "messages": [
...@@ -128,7 +128,7 @@ Follow below steps to enable metrics collection and view via Grafana dashboard: ...@@ -128,7 +128,7 @@ Follow below steps to enable metrics collection and view via Grafana dashboard:
# Start the basic services (etcd & natsd), along with Prometheus and Grafana # Start the basic services (etcd & natsd), along with Prometheus and Grafana
docker compose -f deploy/docker-compose.yml --profile metrics up -d docker compose -f deploy/docker-compose.yml --profile metrics up -d
# set env var DYN_KVBM_METRICS to true, when launch via dynamo # Set env var DYN_KVBM_METRICS to true, when launch via dynamo
# Optionally set DYN_KVBM_METRICS_PORT to choose the /metrics port (default: 6880). # Optionally set DYN_KVBM_METRICS_PORT to choose the /metrics port (default: 6880).
DYN_KVBM_METRICS=true \ DYN_KVBM_METRICS=true \
python3 -m dynamo.trtllm \ python3 -m dynamo.trtllm \
...@@ -136,7 +136,7 @@ python3 -m dynamo.trtllm \ ...@@ -136,7 +136,7 @@ python3 -m dynamo.trtllm \
--served-model-name Qwen/Qwen3-0.6B \ --served-model-name Qwen/Qwen3-0.6B \
--extra-engine-args /tmp/kvbm_llm_api_config.yaml & --extra-engine-args /tmp/kvbm_llm_api_config.yaml &
# optional if firewall blocks KVBM metrics ports to send prometheus metrics # Optional if firewall blocks KVBM metrics ports to send prometheus metrics
sudo ufw allow 6880/tcp sudo ufw allow 6880/tcp
``` ```
...@@ -148,8 +148,8 @@ Once the model is loaded ready, follow below steps to use LMBenchmark to benchma ...@@ -148,8 +148,8 @@ Once the model is loaded ready, follow below steps to use LMBenchmark to benchma
```bash ```bash
git clone https://github.com/LMCache/LMBenchmark.git git clone https://github.com/LMCache/LMBenchmark.git
# show case of running the synthetic multi-turn chat dataset. # Show case of running the synthetic multi-turn chat dataset.
# we are passing model, endpoint, output file prefix and qps to the sh script. # We are passing model, endpoint, output file prefix and qps to the sh script.
cd LMBenchmark/synthetic-multi-round-qa cd LMBenchmark/synthetic-multi-round-qa
./long_input_short_output_run.sh \ ./long_input_short_output_run.sh \
"Qwen/Qwen3-0.6B" \ "Qwen/Qwen3-0.6B" \
...@@ -173,6 +173,6 @@ kv_cache_config: ...@@ -173,6 +173,6 @@ kv_cache_config:
free_gpu_memory_fraction: 0.80 free_gpu_memory_fraction: 0.80
EOF EOF
# run trtllm-serve for the baseline for comparison # Run trtllm-serve for the baseline for comparison
trtllm-serve Qwen/Qwen3-0.6B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/llm_api_config.yaml & trtllm-serve Qwen/Qwen3-0.6B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/llm_api_config.yaml &
``` ```
...@@ -27,13 +27,13 @@ To use KVBM in vLLM, you can follow the steps below: ...@@ -27,13 +27,13 @@ To use KVBM in vLLM, you can follow the steps below:
### Docker Setup ### Docker Setup
```bash ```bash
# start up etcd for KVBM leader/worker registration and discovery # Start up etcd for KVBM leader/worker registration and discovery
docker compose -f deploy/docker-compose.yml up -d docker compose -f deploy/docker-compose.yml up -d
# build a container containing vllm and kvbm # Build a dynamo vLLM container (KVBM is built in by default)
./container/build.sh --framework vllm --enable-kvbm ./container/build.sh --framework vllm
# launch the container # Launch the container
./container/run.sh --framework vllm -it --mount-workspace --use-nixl-gds ./container/run.sh --framework vllm -it --mount-workspace --use-nixl-gds
``` ```
...@@ -55,11 +55,9 @@ cd $DYNAMO_HOME/components/backends/vllm ...@@ -55,11 +55,9 @@ cd $DYNAMO_HOME/components/backends/vllm
cd $DYNAMO_HOME/components/backends/vllm cd $DYNAMO_HOME/components/backends/vllm
./launch/disagg_kvbm_2p2d.sh ./launch/disagg_kvbm_2p2d.sh
``` ```
> [!NOTE]
> To tune the size of CPU or disk cache, set `DYN_KVBM_CPU_CACHE_GB` and `DYN_KVBM_DISK_CACHE_GB` accordingly. We only set `DYN_KVBM_CPU_CACHE_GB=20` in both scripts above.
> [!NOTE] > [!NOTE]
> Configure KVBM cache tiers (choose one of the following options): > Configure or tune KVBM cache tiers (choose one of the following options):
> ```bash > ```bash
> # Option 1: CPU cache only (GPU -> CPU offloading) > # Option 1: CPU cache only (GPU -> CPU offloading)
> # 4 means 4GB of pinned CPU memory would be used > # 4 means 4GB of pinned CPU memory would be used
...@@ -86,7 +84,7 @@ cd $DYNAMO_HOME/components/backends/vllm ...@@ -86,7 +84,7 @@ cd $DYNAMO_HOME/components/backends/vllm
### Sample Request ### Sample Request
```bash ```bash
# make a request to verify vLLM with KVBM is started up correctly # Make a request to verify vLLM with KVBM is started up correctly
# NOTE: change the model name if served with a different one # NOTE: change the model name if served with a different one
curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ curl localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "Qwen/Qwen3-0.6B", "model": "Qwen/Qwen3-0.6B",
...@@ -113,7 +111,7 @@ Follow below steps to enable metrics collection and view via Grafana dashboard: ...@@ -113,7 +111,7 @@ Follow below steps to enable metrics collection and view via Grafana dashboard:
# Start the basic services (etcd & natsd), along with Prometheus and Grafana # Start the basic services (etcd & natsd), along with Prometheus and Grafana
docker compose -f deploy/docker-compose.yml --profile metrics up -d docker compose -f deploy/docker-compose.yml --profile metrics up -d
# set env var DYN_KVBM_METRICS to true, when launch via dynamo # Set env var DYN_KVBM_METRICS to true, when launch via dynamo
# Optionally set DYN_KVBM_METRICS_PORT to choose the /metrics port (default: 6880). # Optionally set DYN_KVBM_METRICS_PORT to choose the /metrics port (default: 6880).
# NOTE: update launch/disagg_kvbm.sh or launch/disagg_kvbm_2p2d.sh as needed # NOTE: update launch/disagg_kvbm.sh or launch/disagg_kvbm_2p2d.sh as needed
DYN_KVBM_METRICS=true \ DYN_KVBM_METRICS=true \
...@@ -122,7 +120,7 @@ python -m dynamo.vllm \ ...@@ -122,7 +120,7 @@ python -m dynamo.vllm \
--enforce-eager \ --enforce-eager \
--connector kvbm --connector kvbm
# optional if firewall blocks KVBM metrics ports to send prometheus metrics # Optional, if firewall blocks KVBM metrics ports to send prometheus metrics
sudo ufw allow 6880/tcp sudo ufw allow 6880/tcp
``` ```
...@@ -134,8 +132,8 @@ Once the model is loaded ready, follow below steps to use LMBenchmark to benchma ...@@ -134,8 +132,8 @@ Once the model is loaded ready, follow below steps to use LMBenchmark to benchma
```bash ```bash
git clone https://github.com/LMCache/LMBenchmark.git git clone https://github.com/LMCache/LMBenchmark.git
# show case of running the synthetic multi-turn chat dataset. # Show case of running the synthetic multi-turn chat dataset.
# we are passing model, endpoint, output file prefix and qps to the sh script. # We are passing model, endpoint, output file prefix and qps to the sh script.
cd LMBenchmark/synthetic-multi-round-qa cd LMBenchmark/synthetic-multi-round-qa
./long_input_short_output_run.sh \ ./long_input_short_output_run.sh \
"Qwen/Qwen3-0.6B" \ "Qwen/Qwen3-0.6B" \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment