Unverified Commit b441e26a authored by jthomson04's avatar jthomson04 Committed by GitHub
Browse files

feat: Interface for shared kv cache handling in kv routing (#7536)


Signed-off-by: default avatarjthomson04 <jwillthomson19@gmail.com>
Co-authored-by: default avatarIshan Dhanani <ishandhanani@gmail.com>
Co-authored-by: default avatarishandhanani <82981111+ishandhanani@users.noreply.github.com>
parent 1b0334c8
......@@ -2458,6 +2458,7 @@ dependencies = [
"serde",
"serde_json",
"serial_test",
"sha2",
"strum",
"temp-env",
"tempfile",
......
......@@ -38,6 +38,8 @@ _KV_ROUTER_FIELDS: tuple[str, ...] = (
"router_queue_policy",
"use_remote_indexer",
"serve_indexer",
"shared_cache_multiplier",
"shared_cache_type",
)
......@@ -64,6 +66,8 @@ class KvRouterConfigBase(ConfigBase):
router_queue_policy: str
use_remote_indexer: bool = False
serve_indexer: bool = False
shared_cache_multiplier: float = 0.0
shared_cache_type: str = "none"
def kv_router_kwargs(self) -> dict:
"""Return a dict suitable for ``KvRouterConfig(**kwargs)``."""
......@@ -299,3 +303,30 @@ class KvRouterArgGroup(ArgGroup):
),
dest="use_remote_indexer",
)
add_argument(
g,
flag_name="--shared-cache-multiplier",
env_var="DYN_SHARED_CACHE_MULTIPLIER",
default=0.5,
help=(
"[EXPERIMENTAL] KV Router: Multiplier for shared cache hits (0.0-1.0). "
"Blocks in the shared cache are less valuable than device-local blocks. "
"E.g. 0.5 means each shared hit counts as half a device-local hit. "
"Default 0.5."
),
arg_type=float,
)
add_argument(
g,
flag_name="--shared-cache-type",
env_var="DYN_SHARED_CACHE_TYPE",
default="none",
help=(
"[EXPERIMENTAL] KV Router: Type of external shared KV cache to query. "
"'none' (default): disabled. "
"'hicache': query Mooncake master directly for SGLang L3 (HiCache) state "
"using SGLang-compatible Mooncake key derivation."
),
arg_type=str,
choices=["none", "hicache"],
)
......@@ -9,8 +9,11 @@ registration, request routing, metrics, and disaggregated serving.
SGLang is pre-1.0 and regularly moves/renames internal APIs between releases. We
support the current version plus 1 version back (N and N-1). The pattern:
1. **All SGLang imports that have broken (or may break) across versions go through
`_compat.py`**, never directly from `sglang.*` in component code.
1. **Only SGLang imports that have actually broken across a version upgrade go through
`_compat.py`.** Do not preemptively route every `sglang.*` import through the shim --
import directly until a real breakage is observed. When an upgrade breaks an import,
move that specific symbol into `_compat.py` and replace direct imports in component
code with the shim.
2. `_compat.py` uses try/except ImportError: new path first, old path fallback.
3. When SGLang introduces a new class/function that doesn't exist in older versions
(e.g., `NetworkAddress`), add a minimal polyfill in the except branch -- just
......
......@@ -2,11 +2,14 @@
# SPDX-License-Identifier: Apache-2.0
import asyncio
import json
import logging
from typing import Any, List, Optional
import sglang as sgl
from sglang.srt.environ import envs
from sglang.srt.server_args import ServerArgs
from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
from dynamo._core import Endpoint
from dynamo.common.utils.output_modalities import get_output_modalities
......@@ -14,6 +17,8 @@ from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_model
from dynamo.sglang._compat import NetworkAddress, get_local_ip_auto, get_scheduler_info
from dynamo.sglang.args import DynamoConfig
SGLANG_HICACHE_MOONCAKE_RUNTIME_KEY = "sglang_hicache_mooncake"
async def _register_model_with_runtime_config(
engine: sgl.Engine,
......@@ -114,6 +119,135 @@ def _get_bootstrap_info_for_config(
return None, None
def _parse_hicache_storage_extra_config(
raw_extra_config: Optional[Any],
) -> dict[str, Any]:
if raw_extra_config is None:
return {}
if isinstance(raw_extra_config, dict):
return dict(raw_extra_config)
if isinstance(raw_extra_config, str):
raw_extra_config = raw_extra_config.strip()
if not raw_extra_config:
return {}
try:
parsed = json.loads(raw_extra_config)
except json.JSONDecodeError as e:
logging.warning(
f"Failed to parse hicache_storage_backend_extra_config JSON: {e}"
)
return {}
if isinstance(parsed, dict):
return parsed
logging.warning(
"hicache_storage_backend_extra_config JSON was not an object; ignoring it."
)
return {}
logging.warning(
"Unsupported hicache_storage_backend_extra_config type %s; ignoring it.",
type(raw_extra_config).__name__,
)
return {}
def _get_mooncake_runtime_data(server_args: ServerArgs) -> Optional[dict[str, Any]]:
    """Build the Mooncake HiCache metadata dict published with worker registration.

    Args:
        server_args: SGLang server arguments for this worker. Attributes are
            read with ``getattr`` defaults so missing fields do not raise.

    Returns:
        A JSON-serializable dict describing the Mooncake backend (page size,
        TP/PP sizes, MLA/EAGLE flags, split-head layout, master address and
        metrics port), or ``None`` when the storage backend is not
        ``"mooncake"``, ``MooncakeStoreConfig`` cannot be imported, or the
        Mooncake config cannot be resolved.
    """
    if getattr(server_args, "hicache_storage_backend", None) != "mooncake":
        return None

    extra_config = _parse_hicache_storage_extra_config(
        getattr(server_args, "hicache_storage_backend_extra_config", None)
    )

    try:
        from sglang.srt.mem_cache.storage.mooncake_store.mooncake_store import (
            MooncakeStoreConfig,
        )
    except ImportError as e:
        logging.warning(f"MooncakeStoreConfig import unavailable: {e}")
        return None

    # Graceful degradation: Mooncake runtime metadata is optional. If config
    # resolution fails for any reason (file not found, malformed env vars,
    # upstream API change), skip publishing the metadata rather than crashing
    # the worker -- the worker still serves requests, just without HiCache
    # router hints. Broad catch is intentional per python-guidelines.md.
    try:
        if extra_config and (
            extra_config.get("master_server_address") is not None
            or extra_config.get("client_server_address") is not None
        ):
            mooncake_config = MooncakeStoreConfig.load_from_extra_config(extra_config)
        elif envs.SGLANG_HICACHE_MOONCAKE_CONFIG_PATH.is_set():
            mooncake_config = MooncakeStoreConfig.from_file()
        else:
            mooncake_config = MooncakeStoreConfig.load_from_env()
    except Exception as e:
        logging.warning(f"Failed to resolve Mooncake config for runtime metadata: {e}")
        return None

    # ``or 1`` maps both a missing attribute and an explicit None/0 to 1.
    tp_size = int(getattr(server_args, "tp_size", 1) or 1)
    pp_size = int(getattr(server_args, "pp_size", 1) or 1)

    try:
        is_mla_model = bool(server_args.use_mla_backend())
    except Exception as e:
        logging.warning(f"Failed to determine whether model uses MLA backend: {e}")
        is_mla_model = False

    try:
        spec_algorithm = SpeculativeAlgorithm.from_string(
            getattr(server_args, "speculative_algorithm", None)
        )
        is_eagle = bool(spec_algorithm.is_eagle())
    except Exception as e:
        logging.warning(f"Failed to determine speculative algorithm: {e}")
        is_eagle = False

    tp_lcm_size = extra_config.get("tp_lcm_size")
    try:
        tp_lcm_size = int(tp_lcm_size) if tp_lcm_size is not None else None
    except (TypeError, ValueError):
        logging.warning("Ignoring non-integer Mooncake tp_lcm_size=%r", tp_lcm_size)
        tp_lcm_size = None

    # Heads are split only for non-MLA models using the "page_head" layout,
    # and only when tp_lcm_size is a strict multiple of tp_size.
    should_split_heads = (
        not is_mla_model
        and getattr(server_args, "hicache_mem_layout", None) == "page_head"
        and tp_lcm_size is not None
        and tp_lcm_size > tp_size
        and tp_lcm_size % tp_size == 0
    )

    extra_backend_tag = extra_config.get("extra_backend_tag")
    if not isinstance(extra_backend_tag, str) or not extra_backend_tag:
        extra_backend_tag = None

    master_server_address = getattr(mooncake_config, "master_server_address", None)
    if not isinstance(master_server_address, str) or not master_server_address:
        master_server_address = None

    # The attribute may exist but hold None or a non-numeric value. Fall back
    # to the default instead of raising, so optional metadata can never take
    # the worker down during registration.
    default_metrics_port = 9003
    raw_metrics_port = getattr(
        mooncake_config, "master_metrics_port", default_metrics_port
    )
    try:
        master_metrics_port = int(raw_metrics_port)
    except (TypeError, ValueError):
        logging.warning(
            "Ignoring non-integer Mooncake master_metrics_port=%r; using %d",
            raw_metrics_port,
            default_metrics_port,
        )
        master_metrics_port = default_metrics_port

    return {
        "backend": "mooncake",
        "page_size": int(getattr(server_args, "page_size", 1) or 1),
        "tp_size": tp_size,
        "pp_size": pp_size,
        "is_mla_model": is_mla_model,
        "is_eagle": is_eagle,
        "tp_lcm_size": tp_lcm_size,
        "should_split_heads": should_split_heads,
        "extra_backend_tag": extra_backend_tag,
        "master_server_address": master_server_address,
        "master_metrics_port": master_metrics_port,
    }
async def _get_runtime_config(
engine: sgl.Engine, server_args: ServerArgs, dynamo_args: DynamoConfig
) -> Optional[ModelRuntimeConfig]:
......@@ -169,6 +303,19 @@ async def _get_runtime_config(
if server_args.speculative_algorithm in ("EAGLE", "NEXTN"):
runtime_config.enable_eagle = True
mooncake_runtime_data = _get_mooncake_runtime_data(server_args)
if mooncake_runtime_data is not None:
try:
runtime_config.set_engine_specific(
SGLANG_HICACHE_MOONCAKE_RUNTIME_KEY,
json.dumps(mooncake_runtime_data),
)
logging.info("Published Mooncake HiCache runtime metadata for router use.")
except Exception as e:
logging.warning(
f"Failed to attach Mooncake HiCache runtime metadata to registration: {e}"
)
try:
scheduler_info = get_scheduler_info(engine)
max_total_tokens = scheduler_info.get("max_total_num_tokens")
......
......@@ -335,4 +335,4 @@ Model and endpoint are configured in `.opencode/opencode.jsonc`:
- **[NVIDIA Request Extensions (nvext)](../../components/frontend/nvext.md)**: Full `nvext` field reference including agent hints
- **[Configuration and Tuning](../../components/router/router-configuration.md)**: Router configuration and CLI arguments
- **[SGLang HiCache](../../integrations/sglang-hicache.md)**: Enabling hierarchical KV cache
- **[SGLang HiCache](sglang-hicache.md)**: Enabling hierarchical KV cache
......@@ -188,6 +188,6 @@ Ensure both prefill and decode workers can reach each other over TCP. The bootst
- **[SGLang README](README.md)**: Quick start and feature overview
- **[Reference Guide](sglang-reference-guide.md)**: Architecture, configuration, and operational details
- **[SGLang Multimodal](../../features/multimodal/multimodal-sglang.md)**: Vision model deployment patterns
- **[SGLang HiCache](../../integrations/sglang-hicache.md)**: Hierarchical cache integration
- **[SGLang HiCache](sglang-hicache.md)**: Hierarchical cache integration
- **[Benchmarking](../../benchmarks/benchmarking.md)**: Performance benchmarking tools
- **[Tuning Disaggregated Performance](../../performance/tuning.md)**: P/D tuning guide
---
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
title: HiCache
subtitle: Hierarchical KV caching with tier-aware router integration
---
This guide covers running SGLang's Hierarchical Cache (HiCache) with Dynamo, and how the Dynamo KV router integrates with HiCache for tier-aware worker selection when workers share an external pool such as Mooncake.
## Overview
SGLang HiCache extends RadixAttention with a multi-tier KV cache that transparently moves pages between GPU HBM, host memory, and an optional external storage backend (e.g. Mooncake). For a full description of HiCache itself — flag reference, storage backends, memory layouts, prefetch policies — see SGLang's own documentation:
- [SGLang HiCache Design](https://docs.sglang.ai/advanced_features/hicache_design.html)
- [SGLang HiCache Best Practices](https://docs.sglang.ai/advanced_features/hicache_best_practices.html)
What Dynamo adds on top of HiCache:
- **Tier-aware routing.** The KV router tracks which cache tier each block lives on (GPU / Host / External) and uses that when scoring candidate workers — not just device overlap.
- **Shared-pool awareness.** When an external backend such as Mooncake is configured, the router queries the shared pool in parallel with its own indexer so it can discount prefill cost for blocks any worker can fetch, not just blocks the candidate holds locally.
If you are running a single worker with HiCache and no shared pool, no Dynamo-side configuration is required — the worker reports KV events to the router as usual.
## Running SGLang with HiCache
Launch a worker with HiCache enabled:
```bash
python -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--page-size 64 \
--enable-hierarchical-cache \
--hicache-ratio 2 \
--hicache-write-policy write_through \
--hicache-storage-backend nixl \
--skip-tokenizer-init
```
Then start the frontend:
```bash
python -m dynamo.frontend --http-port 8000
```
<Note>
The HiCache flags (`--enable-hierarchical-cache`, `--hicache-ratio`, `--hicache-write-policy`, `--hicache-storage-backend`, `--hicache-mem-layout`, etc.) are SGLang-native — Dynamo passes them through unchanged. See [SGLang's best-practices doc](https://docs.sglang.ai/advanced_features/hicache_best_practices.html) for the complete flag reference and tuning guidance.
</Note>
## Tier-Aware Shared KV Cache Routing
When you scale out to multiple SGLang workers that share an external pool such as [Mooncake](https://github.com/kvcache-ai/Mooncake), the Dynamo router can be made tier-aware. It tracks per-tier residency from worker events and consults the shared pool directly so that blocks cached anywhere in the cluster — not just on the candidate worker's GPU — contribute to worker scoring.
### Why
By default the router's radix tree only reflects blocks resident in **GPU HBM** on each worker. HiCache silently demotes blocks to host memory and further to Mooncake as the device pool fills, but the router never sees those transitions. A worker that has the full request prefix on host + Mooncake looks identical to a cold worker. The router ends up treating "fetchable from Mooncake in milliseconds" the same as "must be recomputed from scratch."
### Event model
SGLang's `HiRadixCache` emits `BlockStored` / `BlockRemoved` events carrying a `medium` field on every tier transition:
| Transition | Event emitted |
| ------------------------------------------------- | ---------------- |
| Fresh prefill writes blocks to GPU | `store(GPU)` |
| GPU → Host copy (after async DMA completes) | `store(CPU)` |
| GPU evicted, block still resident on Host | `remove(GPU)` |
| Host evicted (block gone from all worker tiers) | `remove(CPU)` |
| Host → GPU promotion (`load_back`) | `store(GPU)` |
| External → Host prefetch (L2 materialization) | `store(CPU)` |
A few properties the router relies on:
- **Ordering.** `store(new_tier)` is emitted before `remove(old_tier)` so the block is never invisible to the router during a transition.
- **DMA safety.** `store(CPU)` for a GPU→Host copy is deferred until `finish_event.synchronize()` confirms the DMA landed — events never fire before bytes are resident.
- **Per-tier tracking.** A block can be on GPU and Host simultaneously. The router records both and picks the highest-priority tier when scoring overlap.
### How it works
```mermaid
flowchart LR
Worker["SGLang Worker<br/>(HiRadixCache)"]
Mooncake["Mooncake<br/>shared pool"]
Router["Dynamo KV Router<br/>per-tier radix tree"]
Worker -- "KV events (store/remove + medium)" --> Router
Worker -- "writes pages" --> Mooncake
Router -- "batch_query on each request" --> Mooncake
```
On every request the router runs two lookups in parallel:
- Its own radix tree, built from worker KV events (per-tier).
- A batch query to the Mooncake master for blocks reachable from the shared pool.
If the shared-pool query fails, the router falls back to indexer-only scoring and logs a warning. The request still succeeds.
### Scoring
For each candidate worker, the router computes a **logit** (lower wins):
```text
# Without shared cache
logit = overlap_weight * (prefill_tokens / block_size) + decode_blocks
# With shared cache
shared_beyond = shared_cache_hits.hits_beyond(worker_device_overlap)
reduction = shared_cache_multiplier * shared_beyond * block_size
adjusted_prefill = max(0, prefill_tokens - reduction)
logit = overlap_weight * (adjusted_prefill / block_size) + decode_blocks
```
`hits_beyond(n)` counts shared-cache pages at positions `>= n` — "pages past my device prefix that I can still fetch from Mooncake instead of recomputing."
**Worked example.** Request is 4 blocks, `shared_cache_multiplier = 0.5`, `block_size = 1`, `overlap_weight = 1.0`. Shared pool contains blocks 0–3.
| Worker | Device overlap | `hits_beyond` | Reduction | Adjusted prefill | Logit |
| ------ | -------------- | ------------- | --------- | ---------------- | -------------- |
| W0 | 2 (A, B) | 2 (C, D) | 1.0 | 3.0 | 3.0 |
| W1 | 0 | 4 (A, B, C, D)| 2.0 | 2.0 | **2.0 — wins** |
W1 wins despite zero local overlap, because the shared pool covers its whole prefix. The multiplier encodes the cost ratio of a Mooncake fetch relative to a fresh GPU compute — `0.5` means "fetching from shared is half as expensive as recomputing."
## Requirements
> [!IMPORTANT]
> Tier-aware shared cache routing requires SGLang changes from [sgl-project/sglang#22894](https://github.com/sgl-project/sglang/pull/22894) ("fix(hicache): emit KV events for L2 host cache insertions"). This PR is **not yet merged** to SGLang main. Until it lands and a SGLang release includes it, the feature is not accessible from a stock `pip install sglang` — you must build SGLang from the PR branch (`gh pr checkout 22894 && pip install -e python/` from the SGLang repo). This section will be updated with the minimum required version once #22894 ships in a release.
Without PR #22894, worker events carry only `medium=GPU` and the router is blind to Host-tier residency — regardless of Mooncake configuration.
You also need:
- Dynamo router started with `--shared-cache-type hicache` (see [Configuration](#configuration)).
- A Mooncake master reachable from the Dynamo frontend host. Worker-side Mooncake config (master address, page size, TP/PP layout, split-head layout) is published automatically via each worker's registration metadata when the worker is started with `--hicache-storage-backend mooncake`.
## Setup
**SGLang worker** — HiCache with Mooncake storage:
```bash
python -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--page-size 64 \
--enable-hierarchical-cache \
--hicache-ratio 2 \
--hicache-write-policy write_through \
--hicache-storage-backend mooncake \
--hicache-storage-backend-extra-config '{"master_server_address": "mooncake-master.internal:50051"}' \
--skip-tokenizer-init
```
Launch additional workers on other GPUs / hosts with the same Mooncake config so they all connect to the same Mooncake cluster.
**Dynamo frontend** — enable tier-aware routing:
```bash
python -m dynamo.frontend \
--http-port 8000 \
--router-mode kv \
--shared-cache-type hicache \
--shared-cache-multiplier 0.5
```
## Configuration
| Flag | Env var | Default | Description |
| --------------------------- | ----------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `--shared-cache-type` | `DYN_SHARED_CACHE_TYPE` | `none` | `none` disables shared-pool lookups; `hicache` enables Mooncake queries. |
| `--shared-cache-multiplier` | `DYN_SHARED_CACHE_MULTIPLIER` | `0.5`   | Discount factor for shared-pool hits. `0.0` queries but ignores them; `0.5` treats a shared hit as half a device hit; `1.0` treats shared and device hits equally. |
Per-request overrides are available via `RouterConfigOverride.shared_cache_multiplier` for A/B experimentation without restarting the router.
No extra flags are required on the worker. When `--hicache-storage-backend mooncake` is set, Dynamo publishes the required metadata (page size, TP/PP layout, master address) via the worker's `ModelRuntimeConfig.engine_specific` blob under the key `sglang_hicache_mooncake`.
## Verification
**Events carry a medium.** Run the worker with `--log-level debug` and grep the log:
```bash
python -m dynamo.sglang ... --log-level debug 2>&1 | grep -E 'BlockStored|BlockRemoved'
# BlockStored(block_hashes=[...], medium=CPU_PINNED)
# BlockRemoved(block_hashes=[...], medium=GPU)
```
If `medium` is missing or always reads `GPU`, the worker is running an SGLang build without PR #22894.
**Router sees the shared pool.** Two new histograms are exposed on the frontend's Prometheus endpoint:
| Metric | Meaning |
| ----------------------------------- | ----------------------------------------------------------------------- |
| `router_shared_cache_hit_rate` | Fraction of request blocks found in the shared pool (0.0–1.0). |
| `router_shared_cache_beyond_blocks` | Blocks in the shared pool *beyond* the selected worker's device overlap. |
```bash
curl -s localhost:8000/metrics | grep shared_cache
```
## Troubleshooting
| Symptom | Likely cause | Fix |
| -------------------------------------------------------- | --------------------------------------------------------------------- | --------------------------------------------------------------------------------------------- |
| `router_shared_cache_hit_rate` is always 0               | Mooncake master unreachable from the router host                      | Check network path; the router logs `Shared cache query failed` when it can't reach Mooncake. |
| Events only ever carry `medium=GPU` | SGLang missing [PR #22894](https://github.com/sgl-project/sglang/pull/22894) | Rebuild SGLang from the PR branch. |
| Workers registered but router never queries shared cache | `--shared-cache-type` left at default `none` | Set `--shared-cache-type hicache` on the frontend. |
| Queries issued but winning worker rarely changes         | `--shared-cache-multiplier 0.0`                                       | Raise the multiplier — typical starting range is `0.3`–`0.7`.                                 |
| Page-size mismatch warnings | Router `--page-size` doesn't match worker `--page-size` | They must agree; the router hashes pages using the worker's page size. |
| Router logs "no workers have HiCache enabled" | No worker published `sglang_hicache_mooncake` metadata | Confirm workers started with `--hicache-storage-backend mooncake`. |
## Further Reading
- [SGLang HiCache Design](https://docs.sglang.ai/advanced_features/hicache_design.html) and [Best Practices](https://docs.sglang.ai/advanced_features/hicache_best_practices.html)
- [Mooncake](https://github.com/kvcache-ai/Mooncake) — the shared KV store used as the external tier
- [SGLang PR #22894](https://github.com/sgl-project/sglang/pull/22894) — the tier-annotated events prerequisite
- [KVBM Guide](../../components/kvbm/kvbm-guide.md) — Dynamo's own block manager, an alternative to HiCache
- [KV Events for Custom Engines](../../integrations/kv-events-custom-engines.md) — the event protocol contract for backends other than SGLang
......@@ -59,5 +59,5 @@ KVBM has three primary logical layers:
- **[KVBM Design](../../design-docs/kvbm-design.md)** — Architecture deep dive, components, and data flows
- **[LMCache Integration](../../integrations/lmcache-integration.md)** — Use LMCache with Dynamo vLLM backend
- **[FlexKV Integration](../../integrations/flexkv-integration.md)** — Use FlexKV for KV cache management
- **[SGLang HiCache](../../integrations/sglang-hicache.md)** — Enable SGLang's hierarchical cache with NIXL
- **[SGLang HiCache](../../backends/sglang/sglang-hicache.md)** — Enable SGLang's hierarchical cache with NIXL
- **[NIXL Documentation](https://github.com/ai-dynamo/nixl/blob/main/docs/nixl.md)** — NIXL communication library details
\ No newline at end of file
......@@ -195,7 +195,7 @@ curl localhost:8000/v1/chat/completions \
}'
```
> **Learn more:** See the [SGLang HiCache Integration Guide](../../integrations/sglang-hicache.md) for detailed configuration, deployment examples, and troubleshooting.
> **Learn more:** See the [SGLang HiCache Integration Guide](../../backends/sglang/sglang-hicache.md) for detailed configuration, deployment examples, and troubleshooting.
## Disaggregated Serving with KVBM
......@@ -476,4 +476,4 @@ python -m dynamo.vllm --model Qwen/Qwen3-0.6B --kv-transfer-config '{"kv_connect
- [KVBM Design](../../design-docs/kvbm-design.md) for a deep dive into KVBM architecture
- [LMCache Integration](../../integrations/lmcache-integration.md)
- [FlexKV Integration](../../integrations/flexkv-integration.md)
- [SGLang HiCache](../../integrations/sglang-hicache.md)
- [SGLang HiCache](../../backends/sglang/sglang-hicache.md)
......@@ -78,7 +78,7 @@ The full list of supported ecosystem components:
| :--- | :--- |
| Inference engines | SGLang, TensorRT-LLM, vLLM |
| Kubernetes | Inference gateway |
| Memory management | Dynamo KV Block Manager, [LMCache](../integrations/lmcache-integration.md), [SGLang HiCache](../integrations/sglang-hicache.md), [FlexKV](../integrations/flexkv-integration.md) |
| Memory management | Dynamo KV Block Manager, [LMCache](../integrations/lmcache-integration.md), [SGLang HiCache](../backends/sglang/sglang-hicache.md), [FlexKV](../integrations/flexkv-integration.md) |
| Networking and storage | Mooncake, DOCA NetIO, GDS, POSIX, S3, 3FS ([supported via NIXL](../design-docs/kvbm-design.md)) |
| Multi-HW | Intel XPU, AMD |
......
......@@ -198,6 +198,8 @@ navigation:
path: backends/sglang/sglang-observability.md
- page: Agentic Workloads
path: backends/sglang/agents.md
- page: HiCache
path: backends/sglang/sglang-hicache.md
- section: TensorRT-LLM
path: backends/trtllm/README.md
contents:
......@@ -268,8 +270,6 @@ navigation:
contents:
- page: LMCache
path: integrations/lmcache-integration.md
- page: SGLang HiCache
path: integrations/sglang-hicache.md
- page: FlexKV
path: integrations/flexkv-integration.md
- page: KV Events for Custom Engines
......
---
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
title: SGLang HiCache
---
This guide shows how to enable SGLang's Hierarchical Cache (HiCache) inside Dynamo.
## 1) Start the SGLang worker with HiCache enabled
```bash
python -m dynamo.sglang \
--model-path Qwen/Qwen3-0.6B \
--host 0.0.0.0 --port 8000 \
--page-size 64 \
--enable-hierarchical-cache \
--hicache-ratio 2 \
--hicache-write-policy write_through \
--hicache-storage-backend nixl \
--log-level debug \
--skip-tokenizer-init
```
- **--enable-hierarchical-cache**: Enables hierarchical KV cache/offload
- **--hicache-ratio**: The ratio of the size of host KV cache memory pool to the size of device pool. Lower this number if your machine has less CPU memory.
- **--hicache-write-policy**: Write policy (e.g., `write_through` for synchronous host writes)
- **--hicache-storage-backend**: Host storage backend for HiCache (e.g., `nixl`). NIXL selects the concrete store automatically; see [PR #8488](https://github.com/sgl-project/sglang/pull/8488)
Then, start the frontend:
```bash
python -m dynamo.frontend --http-port 8000
```
## 2) Send a single request
```bash
curl localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "user",
"content": "Explain why Roger Federer is considered one of the greatest tennis players of all time"
}
],
"stream": false,
"max_tokens": 30
}'
```
## 3) (Optional) Benchmarking
Run the perf script:
```bash
bash -x $DYNAMO_ROOT/benchmarks/llm/perf.sh \
--model Qwen/Qwen3-0.6B \
--tensor-parallelism 1 \
--data-parallelism 1 \
--concurrency "2,4,8" \
--input-sequence-length 2048 \
--output-sequence-length 256
```
......@@ -101,6 +101,11 @@ redirects:
destination: "/dynamo/dev/user-guides/agents"
- source: "/dynamo/latest/planner/planner_intro.html"
destination: "/dynamo/components/planner"
# SGLang HiCache moved from Integrations section to Backends > SGLang section
- source: "/dynamo/dev/integrations/sglang-hicache"
destination: "/dynamo/dev/backends/sglang/hicache"
- source: "/dynamo/integrations/sglang-hicache"
destination: "/dynamo/dev/backends/sglang/hicache"
# GitHub repository link in navbar
navbar-links:
......
......@@ -1585,6 +1585,7 @@ dependencies = [
"rustls",
"serde",
"serde_json",
"sha2",
"strum",
"tempfile",
"thiserror 2.0.18",
......
......@@ -1599,6 +1599,7 @@ dependencies = [
"rustls",
"serde",
"serde_json",
"sha2",
"strum",
"tempfile",
"thiserror 2.0.18",
......
......@@ -126,7 +126,7 @@ impl AicPerfConfig {
#[pymethods]
impl KvRouterConfig {
#[new]
#[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true, durable_kv_events=false, router_replica_sync=false, router_track_active_blocks=true, router_track_output_blocks=false, router_assume_kv_reuse=true, router_track_prefill_tokens=true, router_prefill_load_model="none", router_snapshot_threshold=1000000, router_reset_states=false, router_ttl_secs=120.0, router_max_tree_size=1048576, router_prune_target_ratio=0.8, router_queue_threshold=Some(4.0), router_event_threads=4, router_queue_policy="fcfs", use_remote_indexer=false, serve_indexer=false))]
#[pyo3(signature = (overlap_score_weight=1.0, router_temperature=0.0, use_kv_events=true, durable_kv_events=false, router_replica_sync=false, router_track_active_blocks=true, router_track_output_blocks=false, router_assume_kv_reuse=true, router_track_prefill_tokens=true, router_prefill_load_model="none", router_snapshot_threshold=1000000, router_reset_states=false, router_ttl_secs=120.0, router_max_tree_size=1048576, router_prune_target_ratio=0.8, router_queue_threshold=Some(4.0), router_event_threads=4, router_queue_policy="fcfs", use_remote_indexer=false, serve_indexer=false, shared_cache_multiplier=0.0, shared_cache_type="none"))]
#[allow(clippy::too_many_arguments)]
fn new(
overlap_score_weight: f64,
......@@ -149,6 +149,8 @@ impl KvRouterConfig {
router_queue_policy: &str,
use_remote_indexer: bool,
serve_indexer: bool,
shared_cache_multiplier: f64,
shared_cache_type: &str,
) -> Self {
KvRouterConfig {
inner: RsKvRouterConfig {
......@@ -179,6 +181,10 @@ impl KvRouterConfig {
}),
use_remote_indexer,
serve_indexer,
shared_cache_multiplier,
shared_cache_type: shared_cache_type
.parse()
.unwrap_or_else(|_| panic!("invalid shared_cache_type: {shared_cache_type:?}")),
},
}
}
......
......@@ -8,6 +8,21 @@ use std::sync::Arc;
use super::{KvIndexerMetrics, KvRouterError, WorkerTask};
use crate::protocols::*;
/// Trait for querying an external shared KV cache pool.
///
/// Implementations check which blocks/pages from a request's token sequence
/// exist in the shared cache. The returned `SharedCacheHits` describes which
/// block positions are available externally (and thus cheaper to prefill).
///
/// The `Send + Sync` supertraits let an implementation be shared across threads.
#[async_trait]
pub trait SharedKvCache: Send + Sync {
/// Query which blocks exist in the shared cache for the given token sequence.
///
/// `tokens` is the request's token sequence. `block_size` is presumably the
/// number of tokens per KV block -- TODO confirm against implementations.
///
/// # Errors
///
/// Returns a `KvRouterError` when the lookup fails; the exact failure
/// conditions are defined by each implementation.
async fn check_blocks(
&self,
tokens: &[u32],
block_size: u32,
) -> Result<SharedCacheHits, KvRouterError>;
}
/// Per-shard size snapshot returned by [`KvIndexerInterface::shard_sizes`].
///
/// `worker_count` and `block_count` are always populated.
......
......@@ -32,6 +32,9 @@ pub use sequences::single as sequence;
#[cfg(feature = "standalone-indexer")]
pub mod standalone_indexer;
#[cfg(feature = "standalone-indexer")]
pub mod standalone_shared_cache;
#[cfg(any(test, feature = "bench"))]
pub mod test_utils;
......@@ -43,12 +46,17 @@ pub use self::multi_worker_sequence::{
pub use self::sequence::{ActiveSequences, RequestId};
pub use concurrent_radix_tree::ConcurrentRadixTree;
pub use concurrent_radix_tree_compressed::ConcurrentRadixTreeCompressed;
pub use config::{KvRouterConfig, RouterConfigOverride, RouterPrefillLoadModel, RouterQueuePolicy};
pub use indexer::{BranchShardedIndexer, MaybeError, SyncIndexer, ThreadPoolIndexer};
pub use config::{
KvRouterConfig, RouterConfigOverride, RouterPrefillLoadModel, RouterQueuePolicy,
SharedCacheType,
};
pub use indexer::{
BranchShardedIndexer, MaybeError, SharedKvCache, SyncIndexer, ThreadPoolIndexer,
};
pub use nested_map::PositionalIndexer;
pub use protocols::{
KvCacheEventError, LocalBlockHash, OverlapScores, RouterEvent, RouterEventSink,
WorkerConfigLike, WorkerId, compute_block_hash_for_seq,
SharedCacheHits, WorkerConfigLike, WorkerId, compute_block_hash_for_seq,
};
pub use queue::SchedulerQueue;
pub use radix_tree::RadixTree;
......
......@@ -2,6 +2,7 @@
// SPDX-License-Identifier: Apache-2.0
use std::future::Future;
use std::ops::Range;
use std::time::Duration;
use dynamo_tokens::{SequenceHash, Token};
......@@ -733,6 +734,65 @@ impl RouterEvent {
}
}
/// Shared cache hit information, represented as sorted non-overlapping half-open ranges.
///
/// Ranges encode which block positions exist in the external shared KV cache pool.
/// Using ranges instead of `Vec<bool>` avoids iterating over potentially thousands
/// of blocks per worker. Typical shared cache patterns produce few contiguous regions,
/// making `hits_beyond` O(num_ranges) ~ O(1-5).
#[derive(Debug, Clone, Default)]
pub struct SharedCacheHits {
    /// Ranges of block positions that exist in the shared cache.
    /// Half-open ranges [start, end), sorted and non-overlapping.
    pub ranges: Vec<Range<u32>>,
    /// Total number of hits (sum of range lengths).
    pub total_hits: u32,
}

impl SharedCacheHits {
    /// Create from sorted, non-overlapping ranges.
    ///
    /// The ranges are stored as given. An inverted range (`start > end`)
    /// contributes zero hits instead of underflowing, so malformed backend
    /// output cannot panic or inflate `total_hits`.
    pub fn from_ranges(ranges: Vec<Range<u32>>) -> Self {
        let total_hits = ranges
            .iter()
            .map(|r| r.end.saturating_sub(r.start))
            .sum();
        Self { ranges, total_hits }
    }

    /// Create from a boolean hit vector (convenience for tests and simple backends).
    /// Coalesces consecutive `true` entries into ranges.
    pub fn from_hits(hits: &[bool]) -> Self {
        let mut ranges: Vec<Range<u32>> = Vec::new();
        // Start position of the run of `true` values currently being scanned.
        let mut run_start: Option<u32> = None;

        for (pos, &hit) in hits.iter().enumerate() {
            // Block positions are modelled as u32 throughout this type.
            let pos = pos as u32;
            match (hit, run_start) {
                (true, None) => run_start = Some(pos),
                (false, Some(start)) => {
                    ranges.push(start..pos);
                    run_start = None;
                }
                _ => {}
            }
        }

        // Close a run that extends to the end of the slice.
        if let Some(start) = run_start {
            ranges.push(start..hits.len() as u32);
        }

        Self::from_ranges(ranges)
    }

    /// Count hits at positions >= `from_position`.
    /// O(num_ranges), not O(num_blocks).
    ///
    /// Ranges that are empty or inverted count as zero hits.
    pub fn hits_beyond(&self, from_position: u32) -> u32 {
        self.ranges
            .iter()
            // Clamp the range start to `from_position`; saturating_sub then
            // yields 0 for ranges lying entirely before it and for inverted
            // ranges.
            .map(|r| r.end.saturating_sub(r.start.max(from_position)))
            .sum()
    }
}
/// Scores representing the overlap of workers (with their dp_rank).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OverlapScores {
......@@ -1188,6 +1248,76 @@ mod tests {
assert_eq!(deserialized.block_hashes[1].0, 5);
}
#[test]
fn test_router_request_mark_free_backwards_compatible_deserialization() {
    // A payload without `request_id` must still deserialize, yielding
    // `MarkFree` with `request_id` set to `None`.
    let payload = r#"{"method":"mark_free"}"#;
    let parsed: RouterRequest = serde_json::from_str(payload).unwrap();

    let is_mark_free_without_id = matches!(parsed, RouterRequest::MarkFree { request_id: None });
    assert!(is_mark_free_without_id);
}
#[test]
fn test_shared_cache_hits_from_hits() {
    // One contiguous run of hits collapses into a single range.
    let contiguous = SharedCacheHits::from_hits(&[true; 4]);
    assert_eq!(contiguous.total_hits, 4);
    assert_eq!(contiguous.ranges, vec![0..4]);

    // Runs separated by misses become separate ranges.
    let sparse = SharedCacheHits::from_hits(&[true, false, true, true, false, true]);
    assert_eq!(sparse.total_hits, 4);
    assert_eq!(sparse.ranges, vec![0..1, 2..4, 5..6]);

    // A slice with no hits yields no ranges.
    let all_misses = SharedCacheHits::from_hits(&[false; 3]);
    assert_eq!(all_misses.total_hits, 0);
    assert!(all_misses.ranges.is_empty());

    // An empty slice yields no ranges either.
    let nothing = SharedCacheHits::from_hits(&[]);
    assert_eq!(nothing.total_hits, 0);
    assert!(nothing.ranges.is_empty());
}
#[test]
fn test_shared_cache_hits_beyond() {
    // The shared cache holds positions 0..4 (blocks A, B, C, D).
    #[allow(clippy::single_range_in_vec_init)]
    let shared = SharedCacheHits::from_ranges(vec![0..4]);

    // (device overlap, expected shared hits at or beyond that position)
    let cases = [
        (2, 2),  // positions 0,1 are on device => only 2,3 count
        (0, 4),  // nothing on device => all four shared hits count
        (4, 0),  // device covers the whole range => nothing beyond
        (10, 0), // device overlap exceeds the range
    ];

    for (device_overlap, expected) in cases {
        assert_eq!(
            shared.hits_beyond(device_overlap),
            expected,
            "hits_beyond({device_overlap})"
        );
    }
}
#[test]
fn test_shared_cache_hits_beyond_sparse() {
    // Two disjoint ranges covering positions 1,2 and 5,6,7.
    let shared = SharedCacheHits::from_ranges(vec![1..3, 5..8]);
    assert_eq!(shared.total_hits, 5);

    // (from_position, expected hits at or beyond it)
    let cases = [
        (0, 5), // everything
        (2, 4), // position 2 from the first range + 5,6,7 from the second
        (3, 3), // only the second range
        (6, 2), // positions 6,7 of the second range
        (8, 0), // past the end of every range
    ];

    for (from_position, expected) in cases {
        assert_eq!(
            shared.hits_beyond(from_position),
            expected,
            "hits_beyond({from_position})"
        );
    }
}
#[test]
fn test_router_request_mark_free_serialization_with_request_id() {
let request = RouterRequest::MarkFree {
......
......@@ -35,6 +35,40 @@ pub fn min_initial_workers_from_env() -> anyhow::Result<usize> {
}
}
/// Type of external shared KV cache to query during routing.
///
/// With `rename_all = "lowercase"`, serde reads and writes the variants as the
/// strings `"none"` and `"hicache"`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum SharedCacheType {
/// No shared cache (default).
#[default]
None,
/// HiCache L3 shared cache — queries sglang workers via the request plane.
// NOTE(review): the `--shared-cache-type` CLI help describes `hicache` as querying
// the Mooncake master directly rather than sglang workers; confirm which
// description is current and align the two.
Hicache,
}
impl fmt::Display for SharedCacheType {
    /// Writes the lowercase variant name: `none` or `hicache`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let name = match self {
            Self::None => "none",
            Self::Hicache => "hicache",
        };
        f.write_str(name)
    }
}
impl FromStr for SharedCacheType {
    /// Human-readable message naming the rejected input.
    type Err = String;

    /// Parses the exact, case-sensitive names `none` and `hicache`.
    ///
    /// # Errors
    ///
    /// Returns a message quoting the input when it matches neither name.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let parsed = match s {
            "none" => Self::None,
            "hicache" => Self::Hicache,
            _ => {
                return Err(format!(
                    "unknown shared_cache_type: {s:?}, expected 'none' or 'hicache'"
                ));
            }
        };
        Ok(parsed)
    }
}
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum RouterQueuePolicy {
......@@ -121,6 +155,11 @@ pub struct RouterConfigOverride {
#[builder(default)]
pub track_prefill_tokens: Option<bool>,
/// Per-request override of `shared_cache_multiplier`.
#[builder(default)]
#[validate(range(min = 0.0, max = 1.0))]
pub shared_cache_multiplier: Option<f64>,
}
/// KV Router configuration parameters
......@@ -210,10 +249,20 @@ pub struct KvRouterConfig {
pub use_remote_indexer: bool,
/// Whether this router should serve its local indexer from the worker component.
/// This enables other routers/frontends in the same namespace to query
/// overlap scores remotely over the request plane by component + endpoint.
#[serde(default)]
pub serve_indexer: bool,
/// Multiplier for shared cache hits when scoring workers (0.0 to 1.0).
/// Blocks available in the shared cache are less valuable than device-local blocks
/// because they need to be fetched. A value of 0.5 means each shared cache hit
/// counts as half a device-local hit. Default: 0.0 (shared cache scoring disabled);
/// the CLI sets this to 0.5 when shared cache is enabled.
#[validate(range(min = 0.0, max = 1.0))]
pub shared_cache_multiplier: f64,
/// Type of external shared KV cache to query during routing.
/// "none" (default): disabled. "hicache": query sglang workers for L3 cache state.
pub shared_cache_type: SharedCacheType,
}
impl Default for KvRouterConfig {
......@@ -240,6 +289,8 @@ impl Default for KvRouterConfig {
router_queue_policy: RouterQueuePolicy::default(),
use_remote_indexer: false,
serve_indexer: false,
shared_cache_multiplier: 0.0,
shared_cache_type: SharedCacheType::default(),
}
}
}
......@@ -417,4 +468,45 @@ mod tests {
assert_eq!(seq_hashes, Some(compute_seq_hash_for_block(&precomputed)));
}
#[test]
fn test_kv_router_config_rejects_out_of_range_shared_cache_multiplier() {
    // One value just below and one just above the accepted 0.0..=1.0 range.
    for out_of_range in [-0.1, 1.1] {
        let config = KvRouterConfig {
            shared_cache_multiplier: out_of_range,
            ..Default::default()
        };
        assert!(
            config.validate().is_err(),
            "shared_cache_multiplier {out_of_range} should fail validation"
        );
    }
}
#[test]
fn test_router_config_override_rejects_out_of_range_shared_cache_multiplier() {
    // Builds an override that sets only `shared_cache_multiplier`.
    let override_with = |multiplier: f64| RouterConfigOverride {
        overlap_score_weight: None,
        router_temperature: None,
        assume_kv_reuse: None,
        track_prefill_tokens: None,
        shared_cache_multiplier: Some(multiplier),
    };

    // One value just below and one just above the accepted 0.0..=1.0 range.
    for out_of_range in [-0.1, 1.1] {
        assert!(
            override_with(out_of_range).validate().is_err(),
            "shared_cache_multiplier {out_of_range} should fail validation"
        );
    }
}
#[test]
fn test_kv_router_config_default_shared_cache_multiplier_is_disabled() {
    // The default configuration uses a multiplier of exactly 0.0.
    let defaults = KvRouterConfig::default();
    assert_eq!(defaults.shared_cache_multiplier, 0.0);
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment