Unverified commit 1ffa489e, authored by Jacky, committed by GitHub
Browse files

refactor: Move --migration-limit flag from backend to frontend (#5918)


Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com>
parent 3842b244
......@@ -225,6 +225,12 @@ def parse_args():
default=False,
help="Enforce disaggregated prefill-decode. When set, unactivated prefill router will return an error instead of falling back to decode-only mode.",
)
parser.add_argument(
"--migration-limit",
type=int,
default=0,
help="Maximum number of times a request may be migrated to a different engine worker. When > 0, enables request migration on worker disconnect (default: 0).",
)
parser.add_argument(
"--active-decode-blocks-threshold",
type=float,
......@@ -304,6 +310,8 @@ def parse_args():
if bool(flags.tls_cert_path) ^ bool(flags.tls_key_path): # ^ is XOR
parser.error("--tls-cert-path and --tls-key-path must be provided together")
if flags.migration_limit < 0 or flags.migration_limit > 4294967295:
parser.error("--migration-limit must be between 0 and 4294967295 (0=disabled)")
return flags
......@@ -324,6 +332,10 @@ async def async_main():
flags = parse_args()
dump_config(flags.dump_config_to, flags)
os.environ["DYN_EVENT_PLANE"] = flags.event_plane
logger.info(
f"Request migration {'enabled' if flags.migration_limit > 0 else 'disabled'} "
f"(limit: {flags.migration_limit})"
)
# Warn if DYN_SYSTEM_PORT is set (frontend doesn't use system metrics server)
if os.environ.get("DYN_SYSTEM_PORT"):
logger.warning(
......@@ -393,6 +405,7 @@ async def async_main():
active_prefill_tokens_threshold_frac=flags.active_prefill_tokens_threshold_frac,
enforce_disagg=flags.enforce_disagg,
),
"migration_limit": flags.migration_limit,
}
if flags.model_name:
......
......@@ -35,12 +35,6 @@ DYNAMO_ARGS: Dict[str, Dict[str, Any]] = {
"type": str,
"help": f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Example: {DEFAULT_ENDPOINT}",
},
"migration-limit": {
"flags": ["--migration-limit"],
"type": int,
"default": 0,
"help": "Maximum number of times a request may be migrated to a different engine worker",
},
"tool-call-parser": {
"flags": ["--dyn-tool-call-parser"],
"type": str,
......@@ -159,7 +153,6 @@ class DynamoArgs:
namespace: str
component: str
endpoint: str
migration_limit: int
store_kv: str
request_plane: str
event_plane: str
......@@ -588,7 +581,6 @@ async def parse_args(args: list[str]) -> Config:
namespace=parsed_namespace,
component=parsed_component_name,
endpoint=parsed_endpoint_name,
migration_limit=parsed_args.migration_limit,
store_kv=parsed_args.store_kv,
request_plane=parsed_args.request_plane,
event_plane=parsed_args.event_plane,
......
......@@ -56,7 +56,6 @@ async def _register_llm_with_runtime_config(
server_args.model_path,
server_args.served_model_name,
kv_cache_block_size=server_args.page_size,
migration_limit=dynamo_args.migration_limit,
runtime_config=runtime_config,
custom_template_path=dynamo_args.custom_jinja_template,
)
......
......@@ -462,7 +462,6 @@ async def init(
config.model_path,
config.served_model_name,
kv_cache_block_size=config.kv_block_size,
migration_limit=config.migration_limit,
runtime_config=runtime_config,
custom_template_path=config.custom_jinja_template,
)
......
......@@ -40,7 +40,6 @@ class Config:
self.expert_parallel_size: Optional[int] = None
self.enable_attention_dp: bool = False
self.kv_block_size: int = 32
self.migration_limit: int = 0
self.gpus_per_node: Optional[int] = None
self.max_batch_size: int = BuildConfig.model_fields["max_batch_size"].default
self.max_num_tokens: int = BuildConfig.model_fields["max_num_tokens"].default
......@@ -88,7 +87,6 @@ class Config:
f"free_gpu_memory_fraction={self.free_gpu_memory_fraction}, "
f"extra_engine_args={self.extra_engine_args}, "
f"override_engine_args={self.override_engine_args}, "
f"migration_limit={self.migration_limit}, "
f"publish_events_and_metrics={self.publish_events_and_metrics}, "
f"disaggregation_mode={self.disaggregation_mode}, "
f"encode_endpoint={self.encode_endpoint}, "
......@@ -196,12 +194,6 @@ def cmd_line_args():
parser.add_argument(
"--kv-block-size", type=int, default=32, help="Size of a KV cache block."
)
parser.add_argument(
"--migration-limit",
type=int,
default=0,
help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
)
parser.add_argument(
"--gpus-per-node",
type=int,
......@@ -416,7 +408,6 @@ def cmd_line_args():
config.max_seq_len = args.max_seq_len
config.max_beam_width = args.max_beam_width
config.kv_block_size = args.kv_block_size
config.migration_limit = args.migration_limit
config.extra_engine_args = args.extra_engine_args
config.override_engine_args = args.override_engine_args
config.publish_events_and_metrics = args.publish_events_and_metrics
......
......@@ -36,7 +36,6 @@ class Config:
endpoint: str
is_prefill_worker: bool
is_decode_worker: bool
migration_limit: int = 0
custom_jinja_template: Optional[str] = None
store_kv: str
request_plane: str
......@@ -138,12 +137,6 @@ def parse_args() -> Config:
action="store_true",
help="Mark this as a decode worker which does not publish KV events.",
)
parser.add_argument(
"--migration-limit",
type=int,
default=0,
help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
)
parser.add_argument(
"--connector",
nargs="*",
......@@ -436,7 +429,6 @@ def parse_args() -> Config:
config.engine_args = engine_args
config.is_prefill_worker = args.is_prefill_worker
config.is_decode_worker = args.is_decode_worker
config.migration_limit = args.migration_limit
config.tool_call_parser = args.dyn_tool_call_parser
config.reasoning_parser = args.dyn_reasoning_parser
config.custom_jinja_template = args.custom_jinja_template
......
......@@ -499,7 +499,6 @@ async def register_vllm_model(
config: Config,
engine_client: AsyncLLM,
vllm_config,
migration_limit: int,
):
"""
Helper function to register a vLLM model with runtime configuration.
......@@ -511,7 +510,6 @@ async def register_vllm_model(
config: Configuration object
engine_client: vLLM engine client
vllm_config: vLLM configuration
migration_limit: Migration limit for the model
"""
runtime_config = ModelRuntimeConfig()
......@@ -559,7 +557,6 @@ async def register_vllm_model(
config.model,
config.served_model_name,
kv_cache_block_size=config.engine_args.block_size,
migration_limit=migration_limit,
runtime_config=runtime_config,
custom_template_path=config.custom_jinja_template,
media_decoder=media_decoder,
......@@ -660,7 +657,6 @@ async def init_prefill(
config,
engine_client,
vllm_config,
migration_limit=0, # Prefill doesn't support migration
)
health_check_payload = VllmPrefillHealthCheckPayload(
......@@ -813,7 +809,6 @@ async def init(
config,
engine_client,
vllm_config,
migration_limit=config.migration_limit,
)
health_check_payload = VllmHealthCheckPayload(
......@@ -827,7 +822,7 @@ async def init(
# because waiting them to finish can take a long time for long OSLs
generate_endpoint.serve_endpoint(
handler.generate,
graceful_shutdown=config.migration_limit <= 0,
graceful_shutdown=True,
metrics_labels=[("model", config.served_model_name or config.model)],
health_check_payload=health_check_payload,
),
......
......@@ -55,7 +55,6 @@ Dynamo SGLang uses SGLang's native argument parser, so **most SGLang engine argu
| Argument | Description | Default | SGLang Equivalent |
|----------|-------------|---------|-------------------|
| `--endpoint` | Dynamo endpoint in `dyn://namespace.component.endpoint` format | Auto-generated based on mode | N/A |
| `--migration-limit` | Max times a request can migrate between workers for fault tolerance. See [Request Migration Architecture](../../fault_tolerance/request_migration.md). | `0` (disabled) | N/A |
| `--dyn-tool-call-parser` | Tool call parser for structured outputs (takes precedence over `--tool-call-parser`) | `None` | `--tool-call-parser` |
| `--dyn-reasoning-parser` | Reasoning parser for CoT models (takes precedence over `--reasoning-parser`) | `None` | `--reasoning-parser` |
| `--use-sglang-tokenizer` | Use SGLang's tokenizer instead of Dynamo's | `False` | N/A |
......
......@@ -193,17 +193,7 @@ Dynamo with TensorRT-LLM supports two methods for transferring KV cache in disag
## Request Migration
You can enable [request migration](../../../docs/fault_tolerance/request_migration.md) to handle worker failures gracefully. Use the `--migration-limit` flag to specify how many times a request can be migrated to another worker:
```bash
# For decode and aggregated workers
python3 -m dynamo.trtllm ... --migration-limit=3
```
> [!IMPORTANT]
> **Prefill workers do not support request migration** and must use `--migration-limit=0` (the default). Prefill workers only process prompts and return KV cache state - they don't maintain long-running generation requests that would benefit from migration.
See the [Request Migration Architecture](../../../docs/fault_tolerance/request_migration.md) documentation for details on how this works.
Dynamo supports [request migration](../../../docs/fault_tolerance/request_migration.md) to handle worker failures gracefully. When enabled, requests can be automatically migrated to healthy workers if a worker fails mid-generation. See the [Request Migration Architecture](../../../docs/fault_tolerance/request_migration.md) documentation for configuration details.
## Request Cancellation
......
......@@ -183,13 +183,7 @@ See the high-level notes in [Router Design](../../design_docs/router_design.md#d
## Request Migration
You can enable [request migration](../../../docs/fault_tolerance/request_migration.md) to handle worker failures gracefully. Use the `--migration-limit` flag to specify how many times a request can be migrated to another worker:
```bash
python3 -m dynamo.vllm ... --migration-limit=3
```
This allows a request to be migrated up to 3 times before failing. See the [Request Migration Architecture](../../../docs/fault_tolerance/request_migration.md) documentation for details on how this works.
Dynamo supports [request migration](../../../docs/fault_tolerance/request_migration.md) to handle worker failures gracefully. When enabled, requests can be automatically migrated to healthy workers if a worker fails mid-generation. See the [Request Migration Architecture](../../../docs/fault_tolerance/request_migration.md) documentation for configuration details.
## Request Cancellation
......
......@@ -73,7 +73,6 @@ The `model_type` can be:
- `model_name`: The name to call the model. Your incoming HTTP requests model name must match this. Defaults to the hugging face repo name or the folder name.
- `context_length`: Max model length in tokens. Defaults to the model's set max. Only set this if you need to reduce KV cache allocation to fit into VRAM.
- `kv_cache_block_size`: Size of a KV block for the engine, in tokens. Defaults to 16.
- `migration_limit`: Maximum number of times a request may be [migrated to another Instance](../fault_tolerance/request_migration.md). Defaults to 0.
- `user_data`: Optional dictionary containing custom metadata for worker behavior (e.g., LoRA configuration). Defaults to None.
See `examples/backends` for full code examples.
......
......@@ -83,30 +83,16 @@ generate_endpoint.serve_endpoint(
|-----------|------------------|-----------|
| **Frontend** | N/A (HTTP server) | HTTP server handles its own shutdown |
| **Prefill Workers** | `graceful_shutdown=True` | Prefill operations must complete to avoid wasted computation |
| **Decode Workers** | Conditional | If migration is enabled (`migration_limit > 0`), shutdown immediately to allow migration; otherwise wait |
| **Decode Workers** | `graceful_shutdown=True` | Decode operations should complete to avoid wasted computation |
| **Router** | `graceful_shutdown=True` | Ensure routing decisions complete |
### Decode Worker Migration Integration
### Migration Integration
Decode workers use conditional draining based on whether request migration is supported:
Backend workers always use `graceful_shutdown=True`, meaning they wait for in-flight requests to complete until the engine is stopped. Request migration is configured at the **frontend** level via `--migration-limit`:
```python
generate_endpoint.serve_endpoint(
handler.generate,
graceful_shutdown=config.migration_limit <= 0, # If no migration, wait for requests
...
)
```
When `migration_limit > 0`:
- Worker shuts down immediately (`graceful_shutdown=False`)
- In-flight requests are migrated to healthy workers
- No request loss occurs
When `migration_limit <= 0`:
- Worker waits for in-flight requests (`graceful_shutdown=True`)
- Migration is not available
- Requests complete on the shutting-down worker
- When migration is enabled at the frontend, disconnected streams from failed workers are automatically retried on healthy workers
- Workers don't need to know about migration configuration - they simply complete their work or signal incomplete streams
- See [Request Migration Architecture](./request_migration.md) for details on how migration works
## Resource Cleanup
......@@ -233,15 +219,15 @@ Match `terminationGracePeriodSeconds` to your expected request completion time:
- Short requests (< 10s): 30s grace period
- Long generation (> 30s): 120s+ grace period
### 2. Enable Request Migration for Decode Workers
### 2. Enable Request Migration
If using disaggregated serving, enable migration for decode workers:
Enable migration at the frontend to allow request recovery when workers shut down:
```python
--migration-limit 3 # Allow up to 3 migration attempts
```bash
python3 -m dynamo.frontend ... --migration-limit 3 # Allow up to 3 migration attempts
```
This allows immediate shutdown while preserving request state.
This allows the frontend to automatically retry disconnected streams on healthy workers.
### 3. Monitor Shutdown Metrics
......
......@@ -20,12 +20,11 @@ Key responsibilities:
### Migration Limit Configuration
Each model can be configured with a migration limit parameter that specifies the maximum number of times a request can be migrated to another worker:
The migration limit is configured at the **frontend** level and applies globally to all models served by that frontend. This parameter specifies the maximum number of times a request can be migrated to another worker:
- Default behavior: no migration allowed
- Can be set independently for different engine types
- Applicable to LLM worker nodes that perform inference
- Allows engines to override user-specified limits for compatibility
- Default behavior: no migration allowed (migration_limit=0)
- Set via `--migration-limit` flag on the frontend
- Applies to all models served by the frontend
## Token State Tracking and Request Migration
......@@ -101,9 +100,7 @@ This token accumulation mechanism ensures that migrations are truly seamless, pr
The migration system is designed with several important architectural considerations:
**Engine Compatibility**: Different LLM engines may have varying capabilities for handling migrated requests. The system allows engines to override migration settings to ensure compatibility and correctness.
**Multi-Model Support**: Since a frontend may serve multiple models simultaneously, migration limits can be configured at the engine level, providing flexibility for different model types with varying reliability characteristics.
**Multi-Model Support**: A frontend may serve multiple models simultaneously. The migration limit is configured once at the frontend level and applies uniformly to every model that frontend serves, which simplifies operational management.
**State Management**: The system carefully tracks not only token sequences but also metadata such as remaining token budgets, stop conditions, and sampling parameters to ensure complete state preservation.
......
......@@ -95,8 +95,8 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full
| **SLA-Based Planner** | ✅ | ✅ | — | | | | | | | |
| **KV Block Manager** | ✅ | ✅ | ✅ | — | | | | | | |
| **Multimodal** | ✅<sup>1</sup> | <sup>2</sup> | — | ✅ | — | | | | | |
| **Request Migration** | 🚧<sup>3</sup> | ✅ | ✅ | ✅ | 🚧 | — | | | | |
| **Request Cancellation** | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | — | | | |
| **Request Migration** | | ✅ | ✅ | ✅ | 🚧 | — | | | | |
| **Request Cancellation** | ✅<sup>3</sup> | ✅<sup>3</sup> | ✅<sup>3</sup> | ✅<sup>3</sup> | ✅<sup>3</sup> | ✅<sup>3</sup> | — | | | |
| **LoRA** | | | | | | | | — | | |
| **Tool Calling** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | — | |
| **Speculative Decoding** | ✅ | ✅ | — | ✅ | — | ✅ | ✅ | | ✅ | — |
......@@ -104,9 +104,7 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full
> **Notes:**
> 1. **Multimodal Disaggregation**: Fully supports **EP/D** (Traditional) pattern. **E/P/D** (Full Disaggregation) is WIP and currently supports pre-computed embeddings only. ([Source][mm-trtllm])
> 2. **Multimodal + KV-Aware Routing**: Not supported. The KV router currently tracks token-based blocks only. ([Source][kv-routing])
> 3. **Request Migration**: Supported on **Decode/Aggregated** workers only. **Prefill** workers do not support migration. ([Source][trtllm-readme])
> 4. **Speculative Decoding**: Llama 4 + Eagle support documented. ([Source][trtllm-eagle])
> 5. **Request Cancellation**: Due to known issues, the TensorRT-LLM engine is temporarily not notified of request cancellations, meaning allocated resources for cancelled requests are not freed.
> 3. **Request Cancellation**: Due to known issues, the TensorRT-LLM engine is temporarily not notified of request cancellations, meaning allocated resources for cancelled requests are not freed.
---
......
......@@ -151,11 +151,6 @@ impl Flags {
"'--kv-cache-block-size' flag should only be used on the worker node, not on the ingress"
);
}
if self.migration_limit.is_some() {
anyhow::bail!(
"'--migration-limit' flag should only be used on the worker node, not on the ingress"
);
}
}
Output::Echo => {}
#[cfg(feature = "mistralrs")]
......
......@@ -62,8 +62,8 @@ pub async fn run(
.tls_cert_path(flags.tls_cert_path.take())
.tls_key_path(flags.tls_key_path.take())
.router_config(Some(flags.router_config()))
.request_template(flags.request_template.clone())
.migration_limit(flags.migration_limit)
.request_template(flags.request_template.clone())
.is_mocker(matches!(out_opt, Some(Output::Mocker)));
// Only the worker has a model path
......
......@@ -1377,6 +1377,7 @@ pub async fn create_worker_selection_pipeline_chat(
component.drt().clone(),
model_manager.clone(),
router_config,
0, // migration_limit - default to 0 for C bindings
None,
metrics.clone(),
);
......@@ -1498,6 +1499,7 @@ pub async fn create_worker_selection_pipeline_chat(
hf_tokenizer,
prefill_chooser,
enforce_disagg,
0, // migration_limit - default to 0 for C bindings
metrics,
)
.await?;
......
......@@ -225,7 +225,7 @@ fn lora_name_to_id(lora_name: &str) -> i32 {
/// For LoRA mode, both `lora_name` and `base_model_path` must be provided together.
/// Providing only one of them will result in an error.
#[pyfunction]
#[pyo3(signature = (model_input, model_type, endpoint, model_path, model_name=None, context_length=None, kv_cache_block_size=None, router_mode=None, migration_limit=0, runtime_config=None, user_data=None, custom_template_path=None, media_decoder=None, media_fetcher=None, lora_name=None, base_model_path=None))]
#[pyo3(signature = (model_input, model_type, endpoint, model_path, model_name=None, context_length=None, kv_cache_block_size=None, router_mode=None, runtime_config=None, user_data=None, custom_template_path=None, media_decoder=None, media_fetcher=None, lora_name=None, base_model_path=None))]
#[allow(clippy::too_many_arguments)]
fn register_llm<'p>(
py: Python<'p>,
......@@ -237,7 +237,6 @@ fn register_llm<'p>(
context_length: Option<u32>,
kv_cache_block_size: Option<u32>,
router_mode: Option<RouterMode>,
migration_limit: u32,
runtime_config: Option<ModelRuntimeConfig>,
user_data: Option<&Bound<'p, PyDict>>,
custom_template_path: Option<&str>,
......@@ -247,18 +246,13 @@ fn register_llm<'p>(
base_model_path: Option<&str>,
) -> PyResult<Bound<'p, PyAny>> {
// Validate Prefill model type requirements
if model_type.inner == llm_rs::model_type::ModelType::Prefill {
if !matches!(model_input, ModelInput::Tokens) {
if model_type.inner == llm_rs::model_type::ModelType::Prefill
&& !matches!(model_input, ModelInput::Tokens)
{
return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
"ModelType::Prefill requires model_input to be ModelInput::Tokens",
));
}
if migration_limit != 0 {
return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
"ModelType::Prefill requires migration_limit to be 0",
));
}
}
let model_input = match model_input {
ModelInput::Text => llm_rs::model_type::ModelInput::Text,
......@@ -364,7 +358,6 @@ fn register_llm<'p>(
.context_length(context_length)
.kv_cache_block_size(kv_cache_block_size)
.router_config(Some(router_config))
.migration_limit(Some(migration_limit))
.runtime_config(runtime_config.unwrap_or_default().inner)
.user_data(user_data_json)
.custom_template_path(custom_template_path_owned)
......
......@@ -172,6 +172,7 @@ pub(crate) struct EntrypointArgs {
extra_engine_args: Option<PathBuf>,
namespace: Option<String>,
is_prefill: bool,
migration_limit: u32,
engine_factory: Option<PyEngineFactory>,
}
......@@ -179,7 +180,7 @@ pub(crate) struct EntrypointArgs {
impl EntrypointArgs {
#[allow(clippy::too_many_arguments)]
#[new]
#[pyo3(signature = (engine_type, model_path=None, model_name=None, endpoint_id=None, context_length=None, template_file=None, router_config=None, kv_cache_block_size=None, http_host=None, http_port=None, http_metrics_port=None, tls_cert_path=None, tls_key_path=None, extra_engine_args=None, namespace=None, is_prefill=false, engine_factory=None))]
#[pyo3(signature = (engine_type, model_path=None, model_name=None, endpoint_id=None, context_length=None, template_file=None, router_config=None, kv_cache_block_size=None, http_host=None, http_port=None, http_metrics_port=None, tls_cert_path=None, tls_key_path=None, extra_engine_args=None, namespace=None, is_prefill=false, migration_limit=0, engine_factory=None))]
pub fn new(
py: Python<'_>,
engine_type: EngineType,
......@@ -198,6 +199,7 @@ impl EntrypointArgs {
extra_engine_args: Option<PathBuf>,
namespace: Option<String>,
is_prefill: bool,
migration_limit: u32,
engine_factory: Option<PyObject>,
) -> PyResult<Self> {
let endpoint_id_obj: Option<EndpointId> = endpoint_id.as_deref().map(EndpointId::from);
......@@ -242,6 +244,7 @@ impl EntrypointArgs {
extra_engine_args,
namespace,
is_prefill,
migration_limit,
engine_factory,
})
}
......@@ -274,6 +277,7 @@ pub fn make_engine<'p>(
.request_template(args.template_file.clone())
.kv_cache_block_size(args.kv_cache_block_size)
.router_config(args.router_config.clone().map(|rc| rc.into()))
.migration_limit(Some(args.migration_limit))
.http_host(args.http_host.clone())
.http_port(args.http_port)
.http_metrics_port(args.http_metrics_port)
......
......@@ -979,7 +979,6 @@ async def register_llm(
context_length: Optional[int] = None,
kv_cache_block_size: Optional[int] = None,
router_mode: Optional[RouterMode] = None,
migration_limit: int = 0,
runtime_config: Optional[ModelRuntimeConfig] = None,
user_data: Optional[Dict[str, Any]] = None,
custom_template_path: Optional[str] = None,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment