"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "d5c5154fcf4c5d65551c98e458cbb027e5f4b672"
Unverified Commit ebd06a1f authored by Yan Ru Pei, committed by GitHub
Browse files

feat(llm): unify user-facing priority hints (#7492)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent 4d302ab6
...@@ -293,9 +293,9 @@ python real_data_benchmark.py --input-dataset trace.jsonl --prefix-root-multipli ...@@ -293,9 +293,9 @@ python real_data_benchmark.py --input-dataset trace.jsonl --prefix-root-multipli
1. The trace is synthesized (same parameters as `real_data_benchmark.py`) and split into low / medium / high tiers according to `--priority-distribution`. 1. The trace is synthesized (same parameters as `real_data_benchmark.py`) and split into low / medium / high tiers according to `--priority-distribution`.
2. Each tier is sent to aiperf as a concurrent stream. In the priority-tagged run, every request carries an OpenAI-compatible extension field in the request body: 2. Each tier is sent to aiperf as a concurrent stream. In the priority-tagged run, every request carries an OpenAI-compatible extension field in the request body:
```json ```json
{"nvext": {"agent_hints": {"latency_sensitivity": <value>}}} {"nvext": {"agent_hints": {"priority": <value>}}}
``` ```
The `latency_sensitivity` value acts as a **priority jump** (in seconds) inside the router's scheduler queue -- a higher value shifts the request's effective arrival time earlier, giving it priority over lower-valued requests. The `priority` value raises the request's router queue priority -- a higher value shifts the request's effective arrival time earlier, giving it priority over lower-valued requests.
3. Two separate aiperf seeds are used for baseline vs. priority runs to ensure different generated prompt content and prevent mocker KV cache cross-contamination. 3. Two separate aiperf seeds are used for baseline vs. priority runs to ensure different generated prompt content and prevent mocker KV cache cross-contamination.
#### Prerequisites: enable the priority queue #### Prerequisites: enable the priority queue
...@@ -332,16 +332,15 @@ python real_data_priority_benchmark.py \ ...@@ -332,16 +332,15 @@ python real_data_priority_benchmark.py \
| Parameter | Default | Description | | Parameter | Default | Description |
|-----------|---------|-------------| |-----------|---------|-------------|
| `--priority-distribution` | `0.5,0.3,0.2` | Fraction of requests assigned to low/medium/high tiers (must sum to 1.0) | | `--priority-distribution` | `0.5,0.3,0.2` | Fraction of requests assigned to low/medium/high tiers (must sum to 1.0) |
| `--priority-values` | `0,1,2` | `latency_sensitivity` values for low/medium/high tiers (seconds of priority jump) | | `--priority-values` | `0,1,2` | `priority` values for low/medium/high tiers |
Examples: Examples:
```bash ```bash
# Equal tier sizes with aggressive priority differentiation. # Equal tier sizes with aggressive priority differentiation.
# --priority-values sets the latency_sensitivity per tier (low, medium, high). # --priority-values sets the request priority per tier (low, medium, high).
# Each value is a priority jump in seconds: the router subtracts it from the # Higher values move the request further ahead in the router queue.
# request's arrival time, so higher values move the request further ahead # Here low gets no boost, medium gets priority 2, and high gets priority 5.
# in the queue. Here low gets no boost, medium jumps 2s ahead, high jumps 5s.
python real_data_priority_benchmark.py \ python real_data_priority_benchmark.py \
--input-dataset mooncake_trace.jsonl \ --input-dataset mooncake_trace.jsonl \
--num-requests 5000 \ --num-requests 5000 \
......
...@@ -34,6 +34,11 @@ def parse_float_list(s): ...@@ -34,6 +34,11 @@ def parse_float_list(s):
return [float(x.strip()) for x in s.split(",")] return [float(x.strip()) for x in s.split(",")]
def parse_int_list(s):
    """Parse a comma-separated string into a list of ints.

    Args:
        s: Comma-separated integers, e.g. ``"0,1,2"``. Whitespace around
            each item is ignored.

    Returns:
        list[int]: The parsed values, in input order.

    Raises:
        ValueError: If any item is not a valid integer literal (for example
            ``"1.5"``, or the empty item produced by a trailing comma).
    """
    return [int(x.strip()) for x in s.split(",")]
def split_trace(requests, distribution, seed): def split_trace(requests, distribution, seed):
"""Split requests into priority tiers by distribution. Deterministic given seed.""" """Split requests into priority tiers by distribution. Deterministic given seed."""
rng = np.random.RandomState(seed) rng = np.random.RandomState(seed)
...@@ -81,11 +86,11 @@ def run_concurrent_streams( ...@@ -81,11 +86,11 @@ def run_concurrent_streams(
"""Launch concurrent aiperf subprocesses for each tier. """Launch concurrent aiperf subprocesses for each tier.
Args: Args:
tag_priority: If True, inject nvext.agent_hints.latency_sensitivity per tier. tag_priority: If True, inject nvext.agent_hints.priority per tier.
""" """
processes = [] processes = []
log_files = [] log_files = []
for tier, pj in zip(TIERS, priority_values): for tier, priority in zip(TIERS, priority_values):
tier_dir = os.path.join(run_dir, f"{tier}_priority") tier_dir = os.path.join(run_dir, f"{tier}_priority")
os.makedirs(tier_dir, exist_ok=True) os.makedirs(tier_dir, exist_ok=True)
...@@ -109,7 +114,7 @@ def run_concurrent_streams( ...@@ -109,7 +114,7 @@ def run_concurrent_streams(
cmd.extend( cmd.extend(
[ [
"--extra-inputs", "--extra-inputs",
json.dumps({"nvext": {"agent_hints": {"latency_sensitivity": pj}}}), json.dumps({"nvext": {"agent_hints": {"priority": priority}}}),
] ]
) )
...@@ -118,7 +123,7 @@ def run_concurrent_streams( ...@@ -118,7 +123,7 @@ def run_concurrent_streams(
log_files.append(log_file) log_files.append(log_file)
label = "priority" if tag_priority else "baseline" label = "priority" if tag_priority else "baseline"
logger.info(f"Launching {tier} tier ({label}, latency_sensitivity={pj})") logger.info(f"Launching {tier} tier ({label}, priority={priority})")
logger.info(f" Command: {' '.join(cmd)}") logger.info(f" Command: {' '.join(cmd)}")
proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT) proc = subprocess.Popen(cmd, stdout=log_file, stderr=subprocess.STDOUT)
...@@ -192,12 +197,13 @@ def plot_ttft_comparison(baseline_dir, priority_dir, output_path, priority_value ...@@ -192,12 +197,13 @@ def plot_ttft_comparison(baseline_dir, priority_dir, output_path, priority_value
priority_medians, priority_medians,
width, width,
yerr=[priority_lo, priority_hi], yerr=[priority_lo, priority_hi],
label="With latency_sensitivity", label="With priority",
capsize=4, capsize=4,
) )
tier_labels = [ tier_labels = [
f"{tier.capitalize()}\n(ls={pj})" for tier, pj in zip(TIERS, priority_values) f"{tier.capitalize()}\n(p={priority})"
for tier, priority in zip(TIERS, priority_values)
] ]
ax.set_xticks(x) ax.set_xticks(x)
ax.set_xticklabels(tier_labels) ax.set_xticklabels(tier_labels)
...@@ -230,14 +236,14 @@ def main(): ...@@ -230,14 +236,14 @@ def main():
"--priority-values", "--priority-values",
type=str, type=str,
default="0,1,2", default="0,1,2",
help="Comma-separated latency_sensitivity values for low/medium/high tiers (default: 0,1,2)", help="Comma-separated priority values for low/medium/high tiers (default: 0,1,2)",
) )
args = parser.parse_args() args = parser.parse_args()
resolve_tokenizer(args) resolve_tokenizer(args)
distribution = parse_float_list(args.priority_distribution) distribution = parse_float_list(args.priority_distribution)
priority_values = parse_float_list(args.priority_values) priority_values = parse_int_list(args.priority_values)
if len(distribution) != len(TIERS): if len(distribution) != len(TIERS):
parser.error( parser.error(
......
...@@ -368,6 +368,13 @@ async def parse_args(args: list[str]) -> Config: ...@@ -368,6 +368,13 @@ async def parse_args(args: list[str]) -> Config:
else: else:
server_args = ServerArgs.from_cli_args(parsed_args) server_args = ServerArgs.from_cli_args(parsed_args)
if getattr(server_args, "schedule_low_priority_values_first", False):
raise ValueError(
"--schedule-low-priority-values-first is not supported in Dynamo's "
"SGLang integration. Dynamo normalizes request priority so higher "
"values are always higher priority at the API layer."
)
# Dynamo's streaming handlers expect disjoint output_ids from SGLang (only new # Dynamo's streaming handlers expect disjoint output_ids from SGLang (only new
# tokens since last output), not cumulative tokens. When stream_output=True, # tokens since last output), not cumulative tokens. When stream_output=True,
# SGLang sends disjoint segments which Dynamo passes through directly. # SGLang sends disjoint segments which Dynamo passes through directly.
......
...@@ -211,7 +211,12 @@ class BaseWorkerHandler(BaseGenerativeHandler[RequestT, ResponseT]): ...@@ -211,7 +211,12 @@ class BaseWorkerHandler(BaseGenerativeHandler[RequestT, ResponseT]):
def _priority_kwargs(self, priority: Any) -> Dict[str, Any]: def _priority_kwargs(self, priority: Any) -> Dict[str, Any]:
if priority is not None and self._engine_supports_priority: if priority is not None and self._engine_supports_priority:
return {"priority": priority} normalized = int(priority)
if getattr(
self.config.server_args, "schedule_low_priority_values_first", False
):
normalized = -normalized
return {"priority": normalized}
return {} return {}
async def release_memory_occupation(self, body: dict) -> dict: async def release_memory_occupation(self, body: dict) -> dict:
......
...@@ -1528,7 +1528,7 @@ class DecodeWorkerHandler(BaseWorkerHandler): ...@@ -1528,7 +1528,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
) )
routing = request.get("routing") or {} routing = request.get("routing") or {}
dp_rank = self._to_local_dp_rank(routing.get("dp_rank")) dp_rank = self._to_local_dp_rank(routing.get("dp_rank"))
priority = routing.get("priority", 0) priority = -int(routing.get("priority", 0))
trace_headers = build_trace_headers(context) trace_headers = build_trace_headers(context)
...@@ -1575,7 +1575,7 @@ class DecodeWorkerHandler(BaseWorkerHandler): ...@@ -1575,7 +1575,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
routing = request.get("routing") or {} routing = request.get("routing") or {}
dp_rank = self._to_local_dp_rank(routing.get("dp_rank")) dp_rank = self._to_local_dp_rank(routing.get("dp_rank"))
priority = routing.get("priority", 0) priority = -int(routing.get("priority", 0))
openai_request_id = request.get("id") or request.get("request_id", request_id) openai_request_id = request.get("id") or request.get("request_id", request_id)
previous_text = "" previous_text = ""
...@@ -1742,7 +1742,7 @@ class PrefillWorkerHandler(BaseWorkerHandler): ...@@ -1742,7 +1742,7 @@ class PrefillWorkerHandler(BaseWorkerHandler):
routing = request.get("routing") or {} routing = request.get("routing") or {}
dp_rank = self._to_local_dp_rank(routing.get("dp_rank")) dp_rank = self._to_local_dp_rank(routing.get("dp_rank"))
priority = routing.get("priority", 0) priority = -int(routing.get("priority", 0))
trace_headers = build_trace_headers(context) trace_headers = build_trace_headers(context)
......
...@@ -29,14 +29,12 @@ Enable priority-based scheduling so the engine respects the `priority` value fro ...@@ -29,14 +29,12 @@ Enable priority-based scheduling so the engine respects the `priority` value fro
python -m dynamo.sglang \ python -m dynamo.sglang \
--model-path <model> \ --model-path <model> \
--enable-priority-scheduling \ --enable-priority-scheduling \
--schedule-low-priority-values-first \
... ...
``` ```
| Flag | Description | | Flag | Description |
|------|-------------| |------|-------------|
| `--enable-priority-scheduling` | Enables priority-based request scheduling instead of FCFS. | | `--enable-priority-scheduling` | Enables priority-based request scheduling instead of FCFS. |
| `--schedule-low-priority-values-first` | Inverts priority ordering so lower values are scheduled first (matches vLLM convention). Without this flag, higher values = higher priority. |
When priority scheduling is enabled, the engine uses the `priority` field from `nvext.agent_hints` to order requests in its internal queue. Requests with higher effective priority are scheduled before lower-priority ones. Ties are broken by arrival time. When priority scheduling is enabled, the engine uses the `priority` field from `nvext.agent_hints` to order requests in its internal queue. Requests with higher effective priority are scheduled before lower-priority ones. Ties are broken by arrival time.
...@@ -77,8 +75,7 @@ Dynamo's `nvext.agent_hints` fields are consumed by the router and forwarded to ...@@ -77,8 +75,7 @@ Dynamo's `nvext.agent_hints` fields are consumed by the router and forwarded to
| Agent Hint | Router Behavior | SGLang Engine Behavior | | Agent Hint | Router Behavior | SGLang Engine Behavior |
|------------|----------------|----------------------| |------------|----------------|----------------------|
| `priority` | No routing effect (forwarded to engine) | Queue ordering when `--enable-priority-scheduling` is set. Also affects radix cache eviction order when `--radix-eviction-policy priority` is set. | | `priority` | Raises router queue priority when `--router-queue-threshold` is set. | Queue ordering when `--enable-priority-scheduling` is set. Also affects radix cache eviction order when `--radix-eviction-policy priority` is set. |
| `latency_sensitivity` | Shifts request earlier in router queue (requires `--router-queue-threshold`) | No direct engine effect. |
| `osl` | Output block tracking for routing decisions (requires `--router-track-output-blocks`) | No direct engine effect. | | `osl` | Output block tracking for routing decisions (requires `--router-track-output-blocks`) | No direct engine effect. |
| `speculative_prefill` | After response completes, sends a `max_tokens=1` prefill to warm the KV cache for the predicted next turn. | SGLang processes the prefill request normally, populating the radix cache. | | `speculative_prefill` | After response completes, sends a `max_tokens=1` prefill to warm the KV cache for the predicted next turn. | SGLang processes the prefill request normally, populating the radix cache. |
...@@ -100,7 +97,6 @@ response = client.chat.completions.create( ...@@ -100,7 +97,6 @@ response = client.chat.completions.create(
"nvext": { "nvext": {
"agent_hints": { "agent_hints": {
"priority": 10, "priority": 10,
"latency_sensitivity": 2.0,
"speculative_prefill": True, "speculative_prefill": True,
"osl": 512 "osl": 512
} }
......
...@@ -68,7 +68,6 @@ Dynamo’s new agent hints extension was designed to bridge this gap. It allows ...@@ -68,7 +68,6 @@ Dynamo’s new agent hints extension was designed to bridge this gap. It allows
"tools": [...], "tools": [...],
"nvext": { "nvext": {
"agent_hints": { "agent_hints": {
"latency_sensitivity": 0.9,
"osl": 256, "osl": 256,
"speculative_prefill": true, "speculative_prefill": true,
"priority": 10 "priority": 10
...@@ -83,7 +82,7 @@ Dynamo’s new agent hints extension was designed to bridge this gap. It allows ...@@ -83,7 +82,7 @@ Dynamo’s new agent hints extension was designed to bridge this gap. It allows
The `agent_hints` fields: The `agent_hints` fields:
- **`latency_sensitivity`** and **`priority`** control scheduling at the router and engine respectively. We cover both in detail in the Priority Scheduling section below. - **`priority`** controls scheduling across both the router and engine. Higher values mean "more important" at the Dynamo API level; Dynamo translates that into router queue ordering and backend-specific engine priority.
- **`osl`** (output sequence length) is the harness's estimate of how many tokens this request will generate. The router uses this to gauge how long a worker will be occupied, which improves load balancing. A harness can learn this over time by tracking average output lengths per tool call type. - **`osl`** (output sequence length) is the harness's estimate of how many tokens this request will generate. The router uses this to gauge how long a worker will be occupied, which improves load balancing. A harness can learn this over time by tracking average output lengths per tool call type.
- **`speculative_prefill`** signals the orchestrator to begin caching this request's prefix on a likely worker before the full request is ready. This is useful when the harness knows a tool call is about to return and wants to warm the cache ahead of time. - **`speculative_prefill`** signals the orchestrator to begin caching this request's prefix on a likely worker before the full request is ready. This is useful when the harness knows a tool call is about to return and wants to warm the cache ahead of time.
...@@ -99,17 +98,16 @@ Without cache-aware routing, turn 2 of a conversation has a ~1/N chance of landi ...@@ -99,17 +98,16 @@ Without cache-aware routing, turn 2 of a conversation has a ~1/N chance of landi
### Priority Scheduling ### Priority Scheduling
Two fields in `agent_hints` control scheduling. They are separate knobs because they solve different problems at different layers: `priority` is the single user-facing scheduling knob. Higher values mean "more important" at the Dynamo API level. Dynamo uses that one hint at both layers:
- **`latency_sensitivity`** (0.0-1.0) controls **how soon the request gets dispatched** from the router queue. It answers: "how urgently does this request need to reach a worker?" A user-facing interactive turn (e.g., a lead agent responding to the developer) needs low latency; a background subagent doing a code search does not. The harness knows the difference and sets accordingly. - At the **router**, higher-priority requests are shifted earlier in the queue when `--router-queue-threshold` is enabled.
- At the **engine**, Dynamo normalizes backend-specific polarity and forwards the request for queue ordering, preemption, and KV cache eviction.
- **`priority`** (integer) controls **how the engine treats the request once it arrives**: scheduling order within the engine's batch and KV cache eviction policy. It answers: "how important is this request's compute and cache relative to other active requests on this worker?" A long-running synthesis request whose KV cache should survive memory pressure gets a high priority; a short lookup whose cache is disposable gets a low one. At the router, incoming requests enter a `BinaryHeap<QueueEntry>` ordered by effective arrival time. A higher `priority` makes the request appear as if it arrived earlier, placing it ahead of lower-priority work. Requests only enter the queue when all workers exceed a configurable load threshold. Below that threshold, they bypass the queue entirely and go straight to worker selection. When capacity frees up (prefill completes or a request finishes), the queue drains highest-priority entries first.
At the router, incoming requests enter a `BinaryHeap<QueueEntry>` ordered by effective arrival time. A higher `latency_sensitivity` makes the request appear as if it arrived earlier, placing it ahead of lower-priority work. Requests only enter the queue when all workers exceed a configurable load threshold. Below that threshold, they bypass the queue entirely and go straight to worker selection. When capacity frees up (prefill completes or a request finishes), the queue drains highest-priority entries first. Once dispatched, SGLang, vLLM, and TRT-LLM may interpret engine priority differently, so Dynamo normalizes the engine-facing value per backend. Engines like SGLang can also use priority-based radix cache eviction where lower-priority blocks are evicted first under memory pressure.
Once dispatched, Dynamo passes `priority` through to the engine directly. SGLang, vLLM, and TRT-LLM all support priority-based request scheduling, and engines like SGLang support priority-based radix cache eviction where lower-priority blocks are evicted first under memory pressure. ![How priority flows from harness through router dispatch to engine treatment](./two-gates.svg)
![How latency_sensitivity and priority flow from harness through router dispatch to engine treatment](./two-gates.svg)
### Agentic Workload Routing Strategies ### Agentic Workload Routing Strategies
......
...@@ -18,7 +18,6 @@ Include `nvext` as a top-level field alongside standard OpenAI-compatible fields ...@@ -18,7 +18,6 @@ Include `nvext` as a top-level field alongside standard OpenAI-compatible fields
"greed_sampling": true, "greed_sampling": true,
"extra_fields": ["worker_id", "timing"], "extra_fields": ["worker_id", "timing"],
"agent_hints": { "agent_hints": {
"latency_sensitivity": 5.0,
"osl": 1024, "osl": 1024,
"priority": 5 "priority": 5
} }
...@@ -57,20 +56,21 @@ The `agent_hints` sub-object carries per-request hints that the router uses for ...@@ -57,20 +56,21 @@ The `agent_hints` sub-object carries per-request hints that the router uses for
| Field | Type | Default | Description | | Field | Type | Default | Description |
|-------|------|---------|-------------| |-------|------|---------|-------------|
| `latency_sensitivity` | `f64` | `None` | Priority scheduling hint in seconds. Shifts the request's effective arrival time earlier in the router queue. Requires `--router-queue-threshold`. | | `priority` | `i32` | `None` | Unified request priority. Higher values mean higher priority at the Dynamo API level. Used for router queue ordering and backend scheduling/eviction. |
| `osl` | `u32` | `None` | Expected output sequence length (tokens). Used for output block tracking and resource estimation. | | `osl` | `u32` | `None` | Expected output sequence length (tokens). Used for output block tracking and resource estimation. |
| `speculative_prefill` | `bool` | `false` | When `true`, speculatively prefills the predicted next-turn prompt after the current turn completes to warm the KV cache. | | `speculative_prefill` | `bool` | `false` | When `true`, speculatively prefills the predicted next-turn prompt after the current turn completes to warm the KV cache. |
| `priority` | `i32` | `None` | Backend engine scheduling priority. Forwarded to the engine's generate call for queue ordering, preemption, and KV cache eviction. |
### `latency_sensitivity` ### `priority`
`priority` is the single user-facing scheduling hint. Higher values mean "more important" across Dynamo.
When `--router-queue-threshold` is set and the queue is active, this value shifts the request's effective arrival time earlier in the queue, giving it priority over requests with lower (or no) `latency_sensitivity`. A value of `5.0` means the request is treated as if it arrived 5 seconds earlier than it actually did. A recommended default is `1.2` for latency-sensitive agentic requests. Has no effect when queueing is disabled. When `--router-queue-threshold` is set and the queue is active, higher-priority requests are shifted earlier in the router queue. Once dispatched, Dynamo forwards the same semantic priority to the backend engine for queue ordering, preemption, and KV cache eviction. Dynamo normalizes backend-specific polarity internally, including vLLM's convention that lower values mean higher priority.
```json ```json
{ {
"nvext": { "nvext": {
"agent_hints": { "agent_hints": {
"latency_sensitivity": 5.0 "priority": 5
} }
} }
} }
...@@ -114,16 +114,11 @@ How it works: ...@@ -114,16 +114,11 @@ How it works:
} }
``` ```
### `priority` Backend details:
Backend engine scheduling priority forwarded to the engine's `generate` call. Influences queue ordering, KV cache eviction under memory pressure, and preemption of running requests.
The semantics of the priority value differ between backends:
- **SGLang**: By default, larger values = higher priority. This can be inverted with `--schedule-low-priority-values-first` to match vLLM's convention. Requires `--enable-priority-scheduling` on the engine.
- **vLLM**: Smaller values = higher priority. A request with `priority: 0` is scheduled before `priority: 10`. Ties are broken by arrival time. Requires `--scheduling-policy priority` on the engine.
When omitted, SGLang defaults to `None` (engine default); vLLM defaults to `0`. TensorRT-LLM does not currently support per-request priority. - **SGLang**: Requires `--enable-priority-scheduling` for queue ordering and `--radix-eviction-policy priority` for priority-based eviction.
- **vLLM**: Requires `--scheduling-policy priority`.
- **TensorRT-LLM**: Does not currently support per-request priority.
```json ```json
{ {
......
...@@ -21,7 +21,7 @@ For Kubernetes, set `DYN_ROUTER_MODE=kv` on the Frontend service. Workers automa ...@@ -21,7 +21,7 @@ For Kubernetes, set `DYN_ROUTER_MODE=kv` on the Frontend service. Workers automa
| `--router-mode kv` | `round_robin` | Enable KV cache-aware routing | | `--router-mode kv` | `round_robin` | Enable KV cache-aware routing |
| `--router-kv-overlap-score-weight` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) | | `--router-kv-overlap-score-weight` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) |
| `--no-router-kv-events` | enabled | Fall back to approximate routing (no event consumption from workers) | | `--no-router-kv-events` | enabled | Fall back to approximate routing (no event consumption from workers) |
| `--router-queue-threshold` | `2.0` | Backpressure queue threshold; enables priority scheduling via `nvext.agent_hints.latency_sensitivity` | | `--router-queue-threshold` | `2.0` | Backpressure queue threshold; enables priority scheduling via `nvext.agent_hints.priority` |
| `--router-queue-policy` | `fcfs` | Queue scheduling policy: `fcfs` (tail TTFT) or `wspt` (avg TTFT) | | `--router-queue-policy` | `fcfs` | Queue scheduling policy: `fcfs` (tail TTFT) or `wspt` (avg TTFT) |
### Standalone Router ### Standalone Router
......
...@@ -87,7 +87,7 @@ Backend workers register themselves using the `register_model` API, after which ...@@ -87,7 +87,7 @@ Backend workers register themselves using the `register_model` API, after which
| `--kv-cache-block-size <size>` | Backend-specific | KV cache block size (should match backend config) | | `--kv-cache-block-size <size>` | Backend-specific | KV cache block size (should match backend config) |
| `--router-kv-events` / `--no-router-kv-events` | `--router-kv-events` | Enable/disable real-time KV event tracking | | `--router-kv-events` / `--no-router-kv-events` | `--router-kv-events` | Enable/disable real-time KV event tracking |
| `--router-kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) | | `--router-kv-overlap-score-weight <float>` | `1.0` | Balance prefill vs decode optimization (higher = better TTFT) |
| `--router-queue-threshold <float>` | `2.0` | Queue threshold fraction; enables priority scheduling via `latency_sensitivity` | | `--router-queue-threshold <float>` | `2.0` | Queue threshold fraction; enables priority scheduling via `priority` |
| `--router-queue-policy <str>` | `fcfs` | Scheduling policy for the queue: `fcfs` (tail TTFT) or `wspt` (avg TTFT) | | `--router-queue-policy <str>` | `fcfs` | Scheduling policy for the queue: `fcfs` (tail TTFT) or `wspt` (avg TTFT) |
For all available options: `python -m dynamo.frontend --help` For all available options: `python -m dynamo.frontend --help`
...@@ -231,7 +231,7 @@ The main KV-aware routing arguments (frontend uses the same `--router-*` flag na ...@@ -231,7 +231,7 @@ The main KV-aware routing arguments (frontend uses the same `--router-*` flag na
- `--router-temperature`: Controls worker selection randomness through softmax sampling of router cost logits. A value of 0 (default) ensures deterministic selection of the lowest-cost worker, while higher values introduce more randomness. - `--router-temperature`: Controls worker selection randomness through softmax sampling of router cost logits. A value of 0 (default) ensures deterministic selection of the lowest-cost worker, while higher values introduce more randomness.
- `--router-queue-threshold`: Queue threshold fraction for prefill token capacity (default: 2.0). The router holds incoming requests in a priority queue while all workers exceed this fraction of `max_num_batched_tokens`, releasing them when capacity frees up. This defers dispatch (not rejection) so that routing decisions use the most up-to-date load metrics at the moment the request is actually sent to a worker. It also enables **priority scheduling** via `latency_sensitivity` hints in `nvext.agent_hints` — higher values shift a request's effective arrival time earlier in the queue, giving it priority over lower-valued requests. Must be > 0. Set to None to disable queueing (requests are dispatched immediately). - `--router-queue-threshold`: Queue threshold fraction for prefill token capacity (default: 2.0). The router holds incoming requests in a priority queue while all workers exceed this fraction of `max_num_batched_tokens`, releasing them when capacity frees up. This defers dispatch (not rejection) so that routing decisions use the most up-to-date load metrics at the moment the request is actually sent to a worker. It also enables **priority scheduling** via `priority` hints in `nvext.agent_hints` — higher values shift a request's effective arrival time earlier in the queue, giving it priority over lower-valued requests. Must be > 0. Set to None to disable queueing (requests are dispatched immediately).
- `--router-queue-policy`: Scheduling policy for the router queue (default: `fcfs`). Two policies are available: - `--router-queue-policy`: Scheduling policy for the router queue (default: `fcfs`). Two policies are available:
- **`fcfs`** (first-come first-served): Orders by adjusted arrival time (`priority_jump - arrival_offset`). Optimizes **tail TTFT** — no request waits longer than necessary. - **`fcfs`** (first-come first-served): Orders by adjusted arrival time (`priority_jump - arrival_offset`). Optimizes **tail TTFT** — no request waits longer than necessary.
...@@ -269,7 +269,7 @@ The main KV-aware routing arguments (frontend uses the same `--router-*` flag na ...@@ -269,7 +269,7 @@ The main KV-aware routing arguments (frontend uses the same `--router-*` flag na
To implement KV event publishing for custom inference engines, enabling them to participate in Dynamo's KV cache-aware routing, see [KV Event Publishing for Custom Engines](../../integrations/kv-events-custom-engines.md). To implement KV event publishing for custom inference engines, enabling them to participate in Dynamo's KV cache-aware routing, see [KV Event Publishing for Custom Engines](../../integrations/kv-events-custom-engines.md).
For details on per-request agent hints (`latency_sensitivity`, `osl`, `speculative_prefill`), see the [NVIDIA Request Extensions (`nvext`)](../frontend/nvext.md#agent-hints) documentation. For details on per-request agent hints (`priority`, `osl`, `speculative_prefill`), see the [NVIDIA Request Extensions (`nvext`)](../frontend/nvext.md#agent-hints) documentation.
### Tuning Guidelines ### Tuning Guidelines
...@@ -281,7 +281,7 @@ Use `--no-router-assume-kv-reuse` in disaggregated setups where the decode worke ...@@ -281,7 +281,7 @@ Use `--no-router-assume-kv-reuse` in disaggregated setups where the decode worke
Use `--router-track-output-blocks` **(experimental)** when your workload is output-heavy and you want the router to account for output-side KV cache growth in load balancing. This is useful in two scenarios: (1) workloads with long output sequences and little multi-turn reuse, where output blocks dominate the KV cache footprint; (2) agentic schedulers (e.g. NAT or other LLM routers) that can accurately predict the expected output sequence length per request. When enabled, the router adds placeholder blocks as tokens are generated. If you additionally pass `nvext.agent_hints.osl` (expected output sequence length in tokens) per request, the router applies fractional decay to output blocks — each output block's weight starts at 1.0 and decays linearly toward 0.0 as generation approaches the expected OSL. This lets the router predict that a request nearing completion will soon free its blocks, effectively modeling the future load trajectory rather than just the current snapshot. Without `osl`, output blocks are added at full weight with no decay. The flag requires `--router-track-active-blocks` (the default). Use `--router-track-output-blocks` **(experimental)** when your workload is output-heavy and you want the router to account for output-side KV cache growth in load balancing. This is useful in two scenarios: (1) workloads with long output sequences and little multi-turn reuse, where output blocks dominate the KV cache footprint; (2) agentic schedulers (e.g. NAT or other LLM routers) that can accurately predict the expected output sequence length per request. When enabled, the router adds placeholder blocks as tokens are generated. If you additionally pass `nvext.agent_hints.osl` (expected output sequence length in tokens) per request, the router applies fractional decay to output blocks — each output block's weight starts at 1.0 and decays linearly toward 0.0 as generation approaches the expected OSL. 
This lets the router predict that a request nearing completion will soon free its blocks, effectively modeling the future load trajectory rather than just the current snapshot. Without `osl`, output blocks are added at full weight with no decay. The flag requires `--router-track-active-blocks` (the default).
The `--router-queue-threshold` (default: 2.0) controls when incoming requests are held in a priority queue. The router holds requests while all workers exceed the given fraction of `max_num_batched_tokens`, releasing them as capacity frees up. This defers the routing decision so it is made with the freshest load metrics, rather than dispatching into an already-saturated system. It also enables priority scheduling via `nvext.agent_hints.latency_sensitivity`. Set to None to disable queueing entirely. The `--router-queue-threshold` (default: 2.0) controls when incoming requests are held in a priority queue. The router holds requests while all workers exceed the given fraction of `max_num_batched_tokens`, releasing them as capacity frees up. This defers the routing decision so it is made with the freshest load metrics, rather than dispatching into an already-saturated system. It also enables priority scheduling via `nvext.agent_hints.priority`. Set to None to disable queueing entirely.
Use `--router-queue-policy wspt` when your workload has a mix of short and long requests and you want to minimize **average** TTFT. WSPT (Smith's rule) schedules short or high-priority requests first, reducing mean latency across the batch. Use the default `fcfs` when you want to minimize **tail** TTFT — no request waits longer than necessary, since ordering is purely by (adjusted) arrival time. Use `--router-queue-policy wspt` when your workload has a mix of short and long requests and you want to minimize **average** TTFT. WSPT (Smith's rule) schedules short or high-priority requests first, reducing mean latency across the batch. Use the default `fcfs` when you want to minimize **tail** TTFT — no request waits longer than necessary, since ordering is purely by (adjusted) arrival time.
...@@ -449,7 +449,7 @@ When `--router-kv-overlap-score-weight` is set to 0, no KVIndexer is created and ...@@ -449,7 +449,7 @@ When `--router-kv-overlap-score-weight` is set to 0, no KVIndexer is created and
The CLI args `--router-ttl-secs`, `--router-max-tree-size`, and `--router-prune-target-ratio` control local cache management when the router operates without receiving events from workers. When workers are configured to publish KV events (via `--kv-events-config`), the router relies on worker-side eviction events and these parameters are ignored. The CLI args `--router-ttl-secs`, `--router-max-tree-size`, and `--router-prune-target-ratio` control local cache management when the router operates without receiving events from workers. When workers are configured to publish KV events (via `--kv-events-config`), the router relies on worker-side eviction events and these parameters are ignored.
**Queue threshold vs. busy rejection thresholds:** `--router-queue-threshold` and the busy thresholds (`--active-decode-blocks-threshold`, `--active-prefill-tokens-threshold`, `--active-prefill-tokens-threshold-frac`) serve different purposes. The busy thresholds **reject** a worker entirely from the candidate set when it exceeds a utilization limit — no traffic is sent until it drops below the threshold. In contrast, `--router-queue-threshold` does not reject workers; it **defers the entire routing decision** until at least one worker has capacity, so the request is routed with the freshest load metrics. The queue also enables priority scheduling via `nvext.agent_hints.latency_sensitivity`. The busy thresholds can be updated at runtime without restarting the frontend via the `/busy_threshold` HTTP endpoint. For details on busy detection, threshold tuning, and the runtime API, see [Request Rejection](../../fault-tolerance/request-rejection.md). **Queue threshold vs. busy rejection thresholds:** `--router-queue-threshold` and the busy thresholds (`--active-decode-blocks-threshold`, `--active-prefill-tokens-threshold`, `--active-prefill-tokens-threshold-frac`) serve different purposes. The busy thresholds **reject** a worker entirely from the candidate set when it exceeds a utilization limit — no traffic is sent until it drops below the threshold. In contrast, `--router-queue-threshold` does not reject workers; it **defers the entire routing decision** until at least one worker has capacity, so the request is routed with the freshest load metrics. The queue also enables priority scheduling via `nvext.agent_hints.priority`. The busy thresholds can be updated at runtime without restarting the frontend via the `/busy_threshold` HTTP endpoint. For details on busy detection, threshold tuning, and the runtime API, see [Request Rejection](../../fault-tolerance/request-rejection.md).
## See Also ## See Also
......
...@@ -34,8 +34,7 @@ The request body includes `nvext.agent_hints` (routing, scheduling) and `nvext.c ...@@ -34,8 +34,7 @@ The request body includes `nvext.agent_hints` (routing, scheduling) and `nvext.c
| Hint | Description | | Hint | Description |
|------|-------------| |------|-------------|
| `latency_sensitivity` | Router queue priority (requires `--router-queue-threshold`). Higher values shift the request earlier in the queue so user-facing turns run before background work. | | `priority` | Unified request priority. Higher values move the request earlier in the router queue and are forwarded to the backend for scheduling and priority-based eviction. |
| `priority` | Engine queue ordering and KV cache eviction. Forwarded to the backend for scheduling and priority-based eviction. |
| `osl` | Expected output sequence length (tokens). Used by the router for output block tracking and load-balancing accuracy when `--router-track-output-blocks` is enabled. | | `osl` | Expected output sequence length (tokens). Used by the router for output block tracking and load-balancing accuracy when `--router-track-output-blocks` is enabled. |
| `speculative_prefill` | When true, after the assistant turn completes the system prefills the predicted next-turn prefix (conversation history + assistant text, e.g. thinking stripped) to warm the KV cache for the next request. | | `speculative_prefill` | When true, after the assistant turn completes the system prefills the predicted next-turn prefix (conversation history + assistant text, e.g. thinking stripped) to warm the KV cache for the next request. |
| `program_id` | (Planned) Identifies the agentic program for program-level metrics and cache affinity. | | `program_id` | (Planned) Identifies the agentic program for program-level metrics and cache affinity. |
...@@ -53,7 +52,7 @@ The request body includes `nvext.agent_hints` (routing, scheduling) and `nvext.c ...@@ -53,7 +52,7 @@ The request body includes `nvext.agent_hints` (routing, scheduling) and `nvext.c
| Cache prefetching | | 🚧 | | | Cache prefetching | | 🚧 | |
| Subagent / thinking-aware cache eviction | | 🚧 | | | Subagent / thinking-aware cache eviction | | 🚧 | |
| Speculative prefill | ✅ | ✅ | ✅ | | Speculative prefill | ✅ | ✅ | ✅ |
| Latency-sensitivity-aware routing | ✅ | ✅ | ✅ | | Priority-aware routing | ✅ | ✅ | ✅ |
🚧 = Work in progress or experimental. 🚧 = Work in progress or experimental.
...@@ -80,9 +79,9 @@ Dynamo is now supported directly in LangChain using the [NVIDIA AI Endpoints int ...@@ -80,9 +79,9 @@ Dynamo is now supported directly in LangChain using the [NVIDIA AI Endpoints int
After a turn finishes, the system can send a **speculative** `max_tokens=1` prefill with the **predicted next-turn prefix** (conversation history + assistant text, e.g. thinking stripped) to the same worker. When the real next request arrives, it hits a warm KV cache. Per-turn TTFT on turns 2+ can drop significantly (e.g. up to ~3× in [multiturn benchmarks](https://github.com/ai-dynamo/dynamo/blob/main/lib/bench/src/bin/README.md)). This can be extended so that Dynamo automatically sends tools and system prompt for subagents to a worker in advance, so subagent requests always hit warm cache. After a turn finishes, the system can send a **speculative** `max_tokens=1` prefill with the **predicted next-turn prefix** (conversation history + assistant text, e.g. thinking stripped) to the same worker. When the real next request arrives, it hits a warm KV cache. Per-turn TTFT on turns 2+ can drop significantly (e.g. up to ~3× in [multiturn benchmarks](https://github.com/ai-dynamo/dynamo/blob/main/lib/bench/src/bin/README.md)). This can be extended so that Dynamo automatically sends tools and system prompt for subagents to a worker in advance, so subagent requests always hit warm cache.
### Latency-sensitivity-aware routing ### Priority-aware routing
When `--router-queue-threshold` is set, the router maintains a priority queue. Requests with higher `latency_sensitivity` are treated as if they arrived earlier, so they are scheduled ahead of bulk or background work. Under load, this keeps median latency low for user-facing agent turns while background work can tolerate higher latency. For a runnable demo and results, see [NeMo Agent Toolkit latency sensitivity demo](https://github.com/NVIDIA/NeMo-Agent-Toolkit/tree/develop/examples/dynamo_integration/latency_sensitivity_demo). When `--router-queue-threshold` is set, the router maintains a priority queue. Requests with higher `priority` are treated as if they arrived earlier, so they are scheduled ahead of bulk or background work. Under load, this keeps median latency low for user-facing agent turns while background work can tolerate higher latency. For a runnable demo and results, see [NeMo Agent Toolkit priority demo](https://github.com/NVIDIA/NeMo-Agent-Toolkit/tree/develop/examples/dynamo_integration/latency_sensitivity_demo).
--- ---
...@@ -91,4 +90,4 @@ When `--router-queue-threshold` is set, the router maintains a priority queue. R ...@@ -91,4 +90,4 @@ When `--router-queue-threshold` is set, the router maintains a priority queue. R
- [NeMo Agent Toolkit — Dynamo integration](https://github.com/NVIDIA/NeMo-Agent-Toolkit/tree/develop/examples/dynamo_integration) - [NeMo Agent Toolkit — Dynamo integration](https://github.com/NVIDIA/NeMo-Agent-Toolkit/tree/develop/examples/dynamo_integration)
- [Context engineering for AI agents (Manus)](https://manus.im/blog/Context-Engineering-for-AI-Agents-Lessons-from-Building-Manus) - [Context engineering for AI agents (Manus)](https://manus.im/blog/Context-Engineering-for-AI-Agents-Lessons-from-Building-Manus)
- [Stateful runtime for agents (OpenAI/Bedrock)](https://openai.com/index/introducing-the-stateful-runtime-environment-for-agents-in-amazon-bedrock/) - [Stateful runtime for agents (OpenAI/Bedrock)](https://openai.com/index/introducing-the-stateful-runtime-environment-for-agents-in-amazon-bedrock/)
- [Claude Code's Prompt Caching](https://platform.claude.com/docs/en/build-with-claude/prompt-caching) - [Claude Code's Prompt Caching](https://platform.claude.com/docs/en/build-with-claude/prompt-caching)
\ No newline at end of file
...@@ -1133,7 +1133,7 @@ class KvRouterConfig: ...@@ -1133,7 +1133,7 @@ class KvRouterConfig:
router_prune_target_ratio: Target size ratio after pruning (default: 0.8) router_prune_target_ratio: Target size ratio after pruning (default: 0.8)
router_queue_threshold: Queue threshold fraction for prefill token capacity (default: 2.0). router_queue_threshold: Queue threshold fraction for prefill token capacity (default: 2.0).
Requests are queued if all workers exceed this fraction of max_num_batched_tokens. Requests are queued if all workers exceed this fraction of max_num_batched_tokens.
Enables priority scheduling via latency_sensitivity hints. Enables priority scheduling via request priority hints.
Set to None to disable queueing (all requests go directly to the scheduler). Set to None to disable queueing (all requests go directly to the scheduler).
router_event_threads: Number of event processing threads (default: 4). router_event_threads: Number of event processing threads (default: 4).
When > 1, uses a concurrent radix tree with a thread pool. When > 1, uses a concurrent radix tree with a thread pool.
...@@ -1756,4 +1756,3 @@ class VirtualConnectorClient: ...@@ -1756,4 +1756,3 @@ class VirtualConnectorClient:
async def wait(self) -> None: async def wait(self) -> None:
"""Blocks until there is a new decision to fetch using 'get'""" """Blocks until there is a new decision to fetch using 'get'"""
... ...
...@@ -303,7 +303,11 @@ impl OpenAIPreprocessor { ...@@ -303,7 +303,11 @@ impl OpenAIPreprocessor {
decode_worker_id: nvext.decode_worker_id, decode_worker_id: nvext.decode_worker_id,
dp_rank: None, // dp_rank is set later in the pipeline dp_rank: None, // dp_rank is set later in the pipeline
expected_output_tokens: hints.and_then(|h| h.osl), expected_output_tokens: hints.and_then(|h| h.osl),
priority_jump: hints.and_then(|h| h.latency_sensitivity), priority_jump: hints.and_then(|h| {
h.priority
.map(|priority| priority.max(0) as f64)
.or(h.latency_sensitivity)
}),
priority: hints.and_then(|h| h.priority), priority: hints.and_then(|h| h.priority),
lora_name, lora_name,
cache_control_ttl: nvext.cache_control.as_ref().map(|cc| cc.ttl_seconds()), cache_control_ttl: nvext.cache_control.as_ref().map(|cc| cc.ttl_seconds()),
......
...@@ -179,11 +179,13 @@ pub struct NvExt { ...@@ -179,11 +179,13 @@ pub struct NvExt {
/// Hints from the agent/caller about request characteristics. /// Hints from the agent/caller about request characteristics.
#[derive(ToSchema, Serialize, Deserialize, Builder, Debug, Clone, Default, PartialEq)] #[derive(ToSchema, Serialize, Deserialize, Builder, Debug, Clone, Default, PartialEq)]
pub struct AgentHints { pub struct AgentHints {
/// Latency sensitivity in seconds for queue ordering. /// Unified request priority.
/// Higher values cause the request to be scheduled sooner when the router queue is enabled. /// Higher values mean "more important" at the Dynamo API level.
/// Dynamo uses this for router queue ordering and normalizes it per backend
/// before forwarding engine priority values.
#[builder(default, setter(strip_option))] #[builder(default, setter(strip_option))]
#[serde(default, skip_serializing_if = "Option::is_none")] #[serde(default, skip_serializing_if = "Option::is_none")]
pub latency_sensitivity: Option<f64>, pub priority: Option<i32>,
/// Expected output sequence length (number of output tokens). /// Expected output sequence length (number of output tokens).
/// Used as a hint for routing decisions to estimate resource requirements /// Used as a hint for routing decisions to estimate resource requirements
...@@ -199,13 +201,12 @@ pub struct AgentHints { ...@@ -199,13 +201,12 @@ pub struct AgentHints {
#[serde(default, skip_serializing_if = "Option::is_none")] #[serde(default, skip_serializing_if = "Option::is_none")]
pub speculative_prefill: Option<bool>, pub speculative_prefill: Option<bool>,
/// Backend engine scheduling priority. /// Deprecated alias for router-only priority.
/// Forwarded to the engine's generate call for queue ordering, KV cache eviction, /// Kept as an undocumented fallback while callers migrate to `priority`.
/// and preemption decisions. Interpretation is backend-specific:
/// vLLM uses lower-is-higher, SGLang uses higher-is-higher (configurable).
#[builder(default, setter(strip_option))] #[builder(default, setter(strip_option))]
#[serde(default, skip_serializing_if = "Option::is_none")] #[serde(default, skip_serializing_if = "Option::is_none")]
pub priority: Option<i32>, #[schema(ignore)]
pub latency_sensitivity: Option<f64>,
} }
/// Anthropic-style cache control hint for prefix pinning with TTL. /// Anthropic-style cache control hint for prefix pinning with TTL.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment