chore: add info about cache flush and bump version (#1839)

7dea77c3 · ishandhanani · GitHub · a604c7f0 · 7dea77c3 · 7dea77c3
Unverified Commit 7dea77c3 authored Jul 09, 2025 by ishandhanani Committed by GitHub Jul 09, 2025
Show whitespace changes
Inline Side-by-side

Showing with 9 additions and 3 deletions

container/Dockerfile.sglang-deepep container/Dockerfile.sglang-deepep +3 -3

examples/sglang/dsr1-wideep.md examples/sglang/dsr1-wideep.md +6 -0

No files found.
--- a/container/Dockerfile.sglang-deepep
+++ b/container/Dockerfile.sglang-deepep
@@ -89,9 +89,9 @@ RUN pip install --break-system-packages "sglang==0.4.8.post1"
 ENV SGL_FORCE_SHUTDOWN=1

 WORKDIR /sgl-workspace
-# support batch completions for SGL benchmarking
-# https://github.com/ai-dynamo/dynamo/pull/1721
-ARG DYNAMO_COMMIT="9cbf803172f8ed4b3342c9d5237b49cb07d4d95c"
+# include flush cache endpoint and server support
+# https://github.com/ai-dynamo/dynamo/pull/1769
+ARG DYNAMO_COMMIT="bd91dca6141e05bcfbe9bd4dea54cc58b9e37d75"
 RUN git clone https://github.com/ai-dynamo/dynamo.git && cd dynamo && git checkout ${DYNAMO_COMMIT}

 # install dynamo in editable mode

--- a/examples/sglang/dsr1-wideep.md
+++ b/examples/sglang/dsr1-wideep.md
@@ -72,6 +72,8 @@ In each container, you should be in the `/sgl-workspace/dynamo/examples/sglang`
 ```bash
 # run ingress
 dynamo run in=http out=dyn &
+# optionally run the HTTP server that lets you flush the KV cache on all workers (see the benchmarking section below)
+python3 utils/sgl_http_server.py --ns dynamo &
 # run prefill worker
 python3 components/worker.py \
  --model-path /model/ \
@@ -170,6 +172,8 @@ Example usage:
 ```bash
 # warmup
 ./utils/bench.sh HEAD_PREFILL_NODE_IP --type warmup
+# if you ran the HTTP server on the head prefill node, you can optionally flush the KV cache for all workers (similar to SGLang's benchmarking script)
+curl -X POST http://${HEAD_PREFILL_NODE_IP}:9001/flush_cache
 # run benchmark
 ./utils/bench.sh HEAD_PREFILL_NODE_IP --type e2e
 ```
@@ -181,6 +185,8 @@ Example usage:
 ```bash
 # generate data
 python3 utils/generate_bench_data.py --output data.jsonl --num-prompts 8192 --input-len 4096 --output-len 5 --model deepseek-ai/DeepSeek-R1
+# if you ran the HTTP server on the head prefill node, you can optionally flush the KV cache for all workers (similar to SGLang's benchmarking script)
+curl -X POST http://${HEAD_PREFILL_NODE_IP}:9001/flush_cache
 # run benchmark
 ./utils/bench.sh HEAD_PREFILL_NODE_IP --type custom_completions
 ```