chore: rm --enable-kvbm in docker build since KVBM is in by default (#3775)

Signed-off-by: Ziqi Fan <ziqif@nvidia.com>

chore: rm --enable-kvbm in docker build since KVBM is in by default (#3775)
Signed-off-by: Ziqi Fan <ziqif@nvidia.com>
b7c335a4 · Ziqi Fan · GitHub · 0c87b00f · b7c335a4 · b7c335a4
Unverified Commit b7c335a4 authored Oct 21, 2025 by Ziqi Fan Committed by GitHub Oct 21, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 22 additions and 24 deletions

docs/kvbm/trtllm-setup.md docs/kvbm/trtllm-setup.md +12 -12

docs/kvbm/vllm-setup.md docs/kvbm/vllm-setup.md +10 -12

No files found.
--- a/docs/kvbm/trtllm-setup.md
+++ b/docs/kvbm/trtllm-setup.md
@@ -34,13 +34,13 @@ To learn what KVBM is, please check [here](kvbm_architecture.md)
 To use KVBM in TensorRT-LLM, you can follow the steps below:

 ```bash
-# start up etcd for KVBM leader/worker registration and discovery
+# Start up etcd for KVBM leader/worker registration and discovery
 docker compose -f deploy/docker-compose.yml up -d

-# Build a container that includes TensorRT-LLM and KVBM.
-./container/build.sh --framework trtllm --enable-kvbm
+# Build a dynamo TRTLLM container (KVBM is built in by default)
+./container/build.sh --framework trtllm

-# launch the container
+# Launch the container
 ./container/run.sh --framework trtllm -it --mount-workspace --use-nixl-gds

 # Configure KVBM cache tiers (choose one of the following options):
@@ -67,8 +67,8 @@ export DYN_KVBM_DISK_CACHE_GB=8
 # 1200 means 1200 seconds timeout
 export DYN_KVBM_LEADER_WORKER_INIT_TIMEOUT_SECS=1200

-# enable disk zerofill fallback for KVBM
-# set to true to enable fallback behavior when disk operations fail
+# Enable disk zerofill fallback for KVBM
+# Set to true to enable fallback behavior when disk operations fail
 export DYN_KVBM_DISK_ZEROFILL_FALLBACK=true
 ```

@@ -101,7 +101,7 @@ python3 -m dynamo.trtllm \
  --served-model-name Qwen/Qwen3-0.6B \
  --extra-engine-args /tmp/kvbm_llm_api_config.yaml &

-# make a call to LLM
+# Make a call to LLM
 curl localhost:8000/v1/chat/completions   -H "Content-Type: application/json"   -d '{
    "model": "Qwen/Qwen3-0.6B",
    "messages": [
@@ -128,7 +128,7 @@ Follow below steps to enable metrics collection and view via Grafana dashboard:
 # Start the basic services (etcd & natsd), along with Prometheus and Grafana
 docker compose -f deploy/docker-compose.yml --profile metrics up -d

-# set env var DYN_KVBM_METRICS to true, when launch via dynamo
+# Set env var DYN_KVBM_METRICS to true when launching via dynamo
 # Optionally set DYN_KVBM_METRICS_PORT to choose the /metrics port (default: 6880).
 DYN_KVBM_METRICS=true \
 python3 -m dynamo.trtllm \
@@ -136,7 +136,7 @@ python3 -m dynamo.trtllm \
  --served-model-name Qwen/Qwen3-0.6B \
  --extra-engine-args /tmp/kvbm_llm_api_config.yaml &

-# optional if firewall blocks KVBM metrics ports to send prometheus metrics
+# Optional: open the KVBM metrics port if the firewall blocks Prometheus metrics traffic
 sudo ufw allow 6880/tcp
 ```

@@ -148,8 +148,8 @@ Once the model is loaded ready, follow below steps to use LMBenchmark to benchma
 ```bash
 git clone https://github.com/LMCache/LMBenchmark.git

-# show case of running the synthetic multi-turn chat dataset.
-# we are passing model, endpoint, output file prefix and qps to the sh script.
+# Showcase of running the synthetic multi-turn chat dataset.
+# We pass the model, endpoint, output file prefix, and QPS to the shell script.
 cd LMBenchmark/synthetic-multi-round-qa
 ./long_input_short_output_run.sh \
    "Qwen/Qwen3-0.6B" \
@@ -173,6 +173,6 @@ kv_cache_config:
  free_gpu_memory_fraction: 0.80
 EOF

-# run trtllm-serve for the baseline for comparison
+# Run trtllm-serve as the baseline for comparison
 trtllm-serve Qwen/Qwen3-0.6B --host localhost --port 8000 --backend pytorch --extra_llm_api_options /tmp/llm_api_config.yaml &
 ```
--- a/docs/kvbm/vllm-setup.md
+++ b/docs/kvbm/vllm-setup.md
@@ -27,13 +27,13 @@ To use KVBM in vLLM, you can follow the steps below:

 ### Docker Setup
 ```bash
-# start up etcd for KVBM leader/worker registration and discovery
+# Start up etcd for KVBM leader/worker registration and discovery
 docker compose -f deploy/docker-compose.yml up -d

-# build a container containing vllm and kvbm
-./container/build.sh --framework vllm --enable-kvbm
+# Build a dynamo vLLM container (KVBM is built in by default)
+./container/build.sh --framework vllm

-# launch the container
+# Launch the container
 ./container/run.sh --framework vllm -it --mount-workspace --use-nixl-gds
 ```

@@ -55,11 +55,9 @@ cd $DYNAMO_HOME/components/backends/vllm
 cd $DYNAMO_HOME/components/backends/vllm
 ./launch/disagg_kvbm_2p2d.sh
 ```
-> [!NOTE]
-> To tune the size of CPU or disk cache, set `DYN_KVBM_CPU_CACHE_GB` and `DYN_KVBM_DISK_CACHE_GB` accordingly. We only set `DYN_KVBM_CPU_CACHE_GB=20` in both scripts above.

 > [!NOTE]
-> Configure KVBM cache tiers (choose one of the following options):
+> Configure or tune KVBM cache tiers (choose one of the following options):
 > ```bash
 > # Option 1: CPU cache only (GPU -> CPU offloading)
 > # 4 means 4GB of pinned CPU memory would be used
@@ -86,7 +84,7 @@ cd $DYNAMO_HOME/components/backends/vllm

 ### Sample Request
 ```bash
-# make a request to verify vLLM with KVBM is started up correctly
+# Make a request to verify vLLM with KVBM is started up correctly
 # NOTE: change the model name if served with a different one
 curl localhost:8000/v1/chat/completions   -H "Content-Type: application/json"   -d '{
    "model": "Qwen/Qwen3-0.6B",
@@ -113,7 +111,7 @@ Follow below steps to enable metrics collection and view via Grafana dashboard:
 # Start the basic services (etcd & natsd), along with Prometheus and Grafana
 docker compose -f deploy/docker-compose.yml --profile metrics up -d

-# set env var DYN_KVBM_METRICS to true, when launch via dynamo
+# Set env var DYN_KVBM_METRICS to true when launching via dynamo
 # Optionally set DYN_KVBM_METRICS_PORT to choose the /metrics port (default: 6880).
 # NOTE: update launch/disagg_kvbm.sh or launch/disagg_kvbm_2p2d.sh as needed
 DYN_KVBM_METRICS=true \
@@ -122,7 +120,7 @@ python -m dynamo.vllm \
    --enforce-eager \
    --connector kvbm

-# optional if firewall blocks KVBM metrics ports to send prometheus metrics
+# Optional: open the KVBM metrics port if the firewall blocks Prometheus metrics traffic
 sudo ufw allow 6880/tcp
 ```

@@ -134,8 +132,8 @@ Once the model is loaded ready, follow below steps to use LMBenchmark to benchma
 ```bash
 git clone https://github.com/LMCache/LMBenchmark.git

-# show case of running the synthetic multi-turn chat dataset.
-# we are passing model, endpoint, output file prefix and qps to the sh script.
+# Showcase of running the synthetic multi-turn chat dataset.
+# We pass the model, endpoint, output file prefix, and QPS to the shell script.
 cd LMBenchmark/synthetic-multi-round-qa
 ./long_input_short_output_run.sh \
    "Qwen/Qwen3-0.6B" \