refactor: Move --migration-limit flag from backend to frontend (#5918)

Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com>

refactor: Move --migration-limit flag from backend to frontend (#5918)
Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com>
1ffa489e · Jacky · GitHub · 3842b244 · 1ffa489e · 1ffa489e
Unverified Commit 1ffa489e authored Feb 06, 2026 by Jacky Committed by GitHub Feb 06, 2026
20 changed files
--- a/components/src/dynamo/frontend/main.py
+++ b/components/src/dynamo/frontend/main.py
@@ -225,6 +225,12 @@ def parse_args():
        default=False,
        help="Enforce disaggregated prefill-decode. When set, unactivated prefill router will return an error instead of falling back to decode-only mode.",
    )
+    parser.add_argument(
+        "--migration-limit",
+        type=int,
+        default=0,
+        help="Maximum number of times a request may be migrated to a different engine worker. When > 0, enables request migration on worker disconnect (default: 0).",
+    )
    parser.add_argument(
        "--active-decode-blocks-threshold",
        type=float,
@@ -304,6 +310,8 @@ def parse_args():

    if bool(flags.tls_cert_path) ^ bool(flags.tls_key_path):  # ^ is XOR
        parser.error("--tls-cert-path and --tls-key-path must be provided together")
+    if flags.migration_limit < 0 or flags.migration_limit > 4294967295:
+        parser.error("--migration-limit must be between 0 and 4294967295 (0=disabled)")

    return flags

@@ -324,6 +332,10 @@ async def async_main():
    flags = parse_args()
    dump_config(flags.dump_config_to, flags)
    os.environ["DYN_EVENT_PLANE"] = flags.event_plane
+    logger.info(
+        f"Request migration {'enabled' if flags.migration_limit > 0 else 'disabled'} "
+        f"(limit: {flags.migration_limit})"
+    )
    # Warn if DYN_SYSTEM_PORT is set (frontend doesn't use system metrics server)
    if os.environ.get("DYN_SYSTEM_PORT"):
        logger.warning(
@@ -393,6 +405,7 @@ async def async_main():
            active_prefill_tokens_threshold_frac=flags.active_prefill_tokens_threshold_frac,
            enforce_disagg=flags.enforce_disagg,
        ),
+        "migration_limit": flags.migration_limit,
    }

    if flags.model_name:

--- a/components/src/dynamo/sglang/args.py
+++ b/components/src/dynamo/sglang/args.py
@@ -35,12 +35,6 @@ DYNAMO_ARGS: Dict[str, Dict[str, Any]] = {
        "type": str,
        "help": f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Example: {DEFAULT_ENDPOINT}",
    },
-    "migration-limit": {
-        "flags": ["--migration-limit"],
-        "type": int,
-        "default": 0,
-        "help": "Maximum number of times a request may be migrated to a different engine worker",
-    },
    "tool-call-parser": {
        "flags": ["--dyn-tool-call-parser"],
        "type": str,
@@ -159,7 +153,6 @@ class DynamoArgs:
    namespace: str
    component: str
    endpoint: str
-    migration_limit: int
    store_kv: str
    request_plane: str
    event_plane: str
@@ -588,7 +581,6 @@ async def parse_args(args: list[str]) -> Config:
        namespace=parsed_namespace,
        component=parsed_component_name,
        endpoint=parsed_endpoint_name,
-        migration_limit=parsed_args.migration_limit,
        store_kv=parsed_args.store_kv,
        request_plane=parsed_args.request_plane,
        event_plane=parsed_args.event_plane,

--- a/components/src/dynamo/sglang/register.py
+++ b/components/src/dynamo/sglang/register.py
@@ -56,7 +56,6 @@ async def _register_llm_with_runtime_config(
            server_args.model_path,
            server_args.served_model_name,
            kv_cache_block_size=server_args.page_size,
-            migration_limit=dynamo_args.migration_limit,
            runtime_config=runtime_config,
            custom_template_path=dynamo_args.custom_jinja_template,
        )

--- a/components/src/dynamo/trtllm/main.py
+++ b/components/src/dynamo/trtllm/main.py
@@ -462,7 +462,6 @@ async def init(
                config.model_path,
                config.served_model_name,
                kv_cache_block_size=config.kv_block_size,
-                migration_limit=config.migration_limit,
                runtime_config=runtime_config,
                custom_template_path=config.custom_jinja_template,
            )

--- a/components/src/dynamo/trtllm/utils/trtllm_utils.py
+++ b/components/src/dynamo/trtllm/utils/trtllm_utils.py
@@ -40,7 +40,6 @@ class Config:
        self.expert_parallel_size: Optional[int] = None
        self.enable_attention_dp: bool = False
        self.kv_block_size: int = 32
-        self.migration_limit: int = 0
        self.gpus_per_node: Optional[int] = None
        self.max_batch_size: int = BuildConfig.model_fields["max_batch_size"].default
        self.max_num_tokens: int = BuildConfig.model_fields["max_num_tokens"].default
@@ -88,7 +87,6 @@ class Config:
            f"free_gpu_memory_fraction={self.free_gpu_memory_fraction}, "
            f"extra_engine_args={self.extra_engine_args}, "
            f"override_engine_args={self.override_engine_args}, "
-            f"migration_limit={self.migration_limit}, "
            f"publish_events_and_metrics={self.publish_events_and_metrics}, "
            f"disaggregation_mode={self.disaggregation_mode}, "
            f"encode_endpoint={self.encode_endpoint}, "
@@ -196,12 +194,6 @@ def cmd_line_args():
    parser.add_argument(
        "--kv-block-size", type=int, default=32, help="Size of a KV cache block."
    )
-    parser.add_argument(
-        "--migration-limit",
-        type=int,
-        default=0,
-        help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
-    )
    parser.add_argument(
        "--gpus-per-node",
        type=int,
@@ -416,7 +408,6 @@ def cmd_line_args():
    config.max_seq_len = args.max_seq_len
    config.max_beam_width = args.max_beam_width
    config.kv_block_size = args.kv_block_size
-    config.migration_limit = args.migration_limit
    config.extra_engine_args = args.extra_engine_args
    config.override_engine_args = args.override_engine_args
    config.publish_events_and_metrics = args.publish_events_and_metrics

--- a/components/src/dynamo/vllm/args.py
+++ b/components/src/dynamo/vllm/args.py
@@ -36,7 +36,6 @@ class Config:
    endpoint: str
    is_prefill_worker: bool
    is_decode_worker: bool
-    migration_limit: int = 0
    custom_jinja_template: Optional[str] = None
    store_kv: str
    request_plane: str
@@ -138,12 +137,6 @@ def parse_args() -> Config:
        action="store_true",
        help="Mark this as a decode worker which does not publish KV events.",
    )
-    parser.add_argument(
-        "--migration-limit",
-        type=int,
-        default=0,
-        help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
-    )
    parser.add_argument(
        "--connector",
        nargs="*",
@@ -436,7 +429,6 @@ def parse_args() -> Config:
    config.engine_args = engine_args
    config.is_prefill_worker = args.is_prefill_worker
    config.is_decode_worker = args.is_decode_worker
-    config.migration_limit = args.migration_limit
    config.tool_call_parser = args.dyn_tool_call_parser
    config.reasoning_parser = args.dyn_reasoning_parser
    config.custom_jinja_template = args.custom_jinja_template

--- a/components/src/dynamo/vllm/main.py
+++ b/components/src/dynamo/vllm/main.py
@@ -499,7 +499,6 @@ async def register_vllm_model(
    config: Config,
    engine_client: AsyncLLM,
    vllm_config,
-    migration_limit: int,
 ):
    """
    Helper function to register a vLLM model with runtime configuration.
@@ -511,7 +510,6 @@ async def register_vllm_model(
        config: Configuration object
        engine_client: vLLM engine client
        vllm_config: vLLM configuration
-        migration_limit: Migration limit for the model
    """
    runtime_config = ModelRuntimeConfig()

@@ -559,7 +557,6 @@ async def register_vllm_model(
        config.model,
        config.served_model_name,
        kv_cache_block_size=config.engine_args.block_size,
-        migration_limit=migration_limit,
        runtime_config=runtime_config,
        custom_template_path=config.custom_jinja_template,
        media_decoder=media_decoder,
@@ -660,7 +657,6 @@ async def init_prefill(
        config,
        engine_client,
        vllm_config,
-        migration_limit=0,  # Prefill doesn't support migration
    )

    health_check_payload = VllmPrefillHealthCheckPayload(
@@ -813,7 +809,6 @@ async def init(
        config,
        engine_client,
        vllm_config,
-        migration_limit=config.migration_limit,
    )

    health_check_payload = VllmHealthCheckPayload(
@@ -827,7 +822,7 @@ async def init(
            # because waiting them to finish can take a long time for long OSLs
            generate_endpoint.serve_endpoint(
                handler.generate,
-                graceful_shutdown=config.migration_limit <= 0,
+                graceful_shutdown=True,
                metrics_labels=[("model", config.served_model_name or config.model)],
                health_check_payload=health_check_payload,
            ),

--- a/docs/backends/sglang/README.md
+++ b/docs/backends/sglang/README.md
@@ -55,7 +55,6 @@ Dynamo SGLang uses SGLang's native argument parser, so **most SGLang engine argu
 | Argument | Description | Default | SGLang Equivalent |
 |----------|-------------|---------|-------------------|
 | `--endpoint` | Dynamo endpoint in `dyn://namespace.component.endpoint` format | Auto-generated based on mode | N/A |
-| `--migration-limit` | Max times a request can migrate between workers for fault tolerance. See [Request Migration Architecture](../../fault_tolerance/request_migration.md). | `0` (disabled) | N/A |
 | `--dyn-tool-call-parser` | Tool call parser for structured outputs (takes precedence over `--tool-call-parser`) | `None` | `--tool-call-parser` |
 | `--dyn-reasoning-parser` | Reasoning parser for CoT models (takes precedence over `--reasoning-parser`) | `None` | `--reasoning-parser` |
 | `--use-sglang-tokenizer` | Use SGLang's tokenizer instead of Dynamo's | `False` | N/A |

--- a/docs/backends/trtllm/README.md
+++ b/docs/backends/trtllm/README.md
@@ -193,17 +193,7 @@ Dynamo with TensorRT-LLM supports two methods for transferring KV cache in disag

 ## Request Migration

-You can enable [request migration](../../../docs/fault_tolerance/request_migration.md) to handle worker failures gracefully. Use the `--migration-limit` flag to specify how many times a request can be migrated to another worker:
-
-```bash
-# For decode and aggregated workers
-python3 -m dynamo.trtllm ... --migration-limit=3
-```
-
-> [!IMPORTANT]
-> **Prefill workers do not support request migration** and must use `--migration-limit=0` (the default). Prefill workers only process prompts and return KV cache state - they don't maintain long-running generation requests that would benefit from migration.
-
-See the [Request Migration Architecture](../../../docs/fault_tolerance/request_migration.md) documentation for details on how this works.
+Dynamo supports [request migration](../../../docs/fault_tolerance/request_migration.md) to handle worker failures gracefully. When enabled, requests can be automatically migrated to healthy workers if a worker fails mid-generation. See the [Request Migration Architecture](../../../docs/fault_tolerance/request_migration.md) documentation for configuration details.

 ## Request Cancellation


--- a/docs/backends/vllm/README.md
+++ b/docs/backends/vllm/README.md
@@ -183,13 +183,7 @@ See the high-level notes in [Router Design](../../design_docs/router_design.md#d

 ## Request Migration

-You can enable [request migration](../../../docs/fault_tolerance/request_migration.md) to handle worker failures gracefully. Use the `--migration-limit` flag to specify how many times a request can be migrated to another worker:
-
-```bash
-python3 -m dynamo.vllm ... --migration-limit=3
-```
-
-This allows a request to be migrated up to 3 times before failing. See the [Request Migration Architecture](../../../docs/fault_tolerance/request_migration.md) documentation for details on how this works.
+Dynamo supports [request migration](../../../docs/fault_tolerance/request_migration.md) to handle worker failures gracefully. When enabled, requests can be automatically migrated to healthy workers if a worker fails mid-generation. See the [Request Migration Architecture](../../../docs/fault_tolerance/request_migration.md) documentation for configuration details.

 ## Request Cancellation


--- a/docs/development/backend-guide.md
+++ b/docs/development/backend-guide.md
@@ -73,7 +73,6 @@ The `model_type` can be:
 - `model_name`: The name to call the model. Your incoming HTTP requests model name must match this. Defaults to the hugging face repo name or the folder name.
 - `context_length`: Max model length in tokens. Defaults to the model's set max. Only set this if you need to reduce KV cache allocation to fit into VRAM.
 - `kv_cache_block_size`: Size of a KV block for the engine, in tokens. Defaults to 16.
-- `migration_limit`: Maximum number of times a request may be [migrated to another Instance](../fault_tolerance/request_migration.md). Defaults to 0.
 - `user_data`: Optional dictionary containing custom metadata for worker behavior (e.g., LoRA configuration). Defaults to None.

 See `examples/backends` for full code examples.

--- a/docs/fault_tolerance/graceful_shutdown.md
+++ b/docs/fault_tolerance/graceful_shutdown.md
@@ -83,30 +83,16 @@ generate_endpoint.serve_endpoint(
 |-----------|------------------|-----------|
 | **Frontend** | N/A (HTTP server) | HTTP server handles its own shutdown |
 | **Prefill Workers** | `graceful_shutdown=True` | Prefill operations must complete to avoid wasted computation |
-| **Decode Workers** | Conditional | If migration is enabled (`migration_limit > 0`), shutdown immediately to allow migration; otherwise wait |
+| **Decode Workers** | `graceful_shutdown=True` | Decode operations should complete to avoid wasted computation |
 | **Router** | `graceful_shutdown=True` | Ensure routing decisions complete |

-### Decode Worker Migration Integration
+### Migration Integration

-Decode workers use conditional draining based on whether request migration is supported:
+Backend workers always use `graceful_shutdown=True`, meaning they wait for in-flight requests to complete until the engine is stopped. Request migration is configured at the **frontend** level via `--migration-limit`:

-```python
-generate_endpoint.serve_endpoint(
-    handler.generate,
-    graceful_shutdown=config.migration_limit <= 0,  # If no migration, wait for requests
-    ...
-)
-```
-
-When `migration_limit > 0`:
-- Worker shuts down immediately (`graceful_shutdown=False`)
-- In-flight requests are migrated to healthy workers
-- No request loss occurs
-
-When `migration_limit <= 0`:
-- Worker waits for in-flight requests (`graceful_shutdown=True`)
-- Migration is not available
-- Requests complete on the shutting-down worker
+- When migration is enabled at the frontend, disconnected streams from failed workers are automatically retried on healthy workers
+- Workers don't need to know about migration configuration - they simply complete their work or signal incomplete streams
+- See [Request Migration Architecture](./request_migration.md) for details on how migration works

 ## Resource Cleanup

@@ -233,15 +219,15 @@ Match `terminationGracePeriodSeconds` to your expected request completion time:
 - Short requests (< 10s): 30s grace period
 - Long generation (> 30s): 120s+ grace period

-### 2. Enable Request Migration for Decode Workers
+### 2. Enable Request Migration

-If using disaggregated serving, enable migration for decode workers:
+Enable migration at the frontend to allow request recovery when workers shut down:

-```python
---migration-limit 3  # Allow up to 3 migration attempts
+```bash
+python3 -m dynamo.frontend ... --migration-limit 3  # Allow up to 3 migration attempts
 ```

-This allows immediate shutdown while preserving request state.
+This allows the frontend to automatically retry disconnected streams on healthy workers.

 ### 3. Monitor Shutdown Metrics


--- a/docs/fault_tolerance/request_migration.md
+++ b/docs/fault_tolerance/request_migration.md
@@ -20,12 +20,11 @@ Key responsibilities:

 ### Migration Limit Configuration

-Each model can be configured with a migration limit parameter that specifies the maximum number of times a request can be migrated to another worker:
+The migration limit is configured at the **frontend** level and applies globally to all models served by that frontend. This parameter specifies the maximum number of times a request can be migrated to another worker:

-- Default behavior: no migration allowed
-- Can be set independently for different engine types
-- Applicable to LLM worker nodes that perform inference
-- Allows engines to override user-specified limits for compatibility
+- Default behavior: no migration allowed (migration_limit=0)
+- Set via `--migration-limit` flag on the frontend
+- Applies to all models served by the frontend

 ## Token State Tracking and Request Migration

@@ -101,9 +100,7 @@ This token accumulation mechanism ensures that migrations are truly seamless, pr

 The migration system is designed with several important architectural considerations:

-**Engine Compatibility**: Different LLM engines may have varying capabilities for handling migrated requests. The system allows engines to override migration settings to ensure compatibility and correctness.
-
-**Multi-Model Support**: Since a frontend may serve multiple models simultaneously, migration limits can be configured at the engine level, providing flexibility for different model types with varying reliability characteristics.
+**Multi-Model Support**: Since a frontend may serve multiple models simultaneously, the migration limit is configured at the frontend level and applies uniformly to all models, simplifying operational management.

 **State Management**: The system carefully tracks not only token sequences but also metadata such as remaining token budgets, stop conditions, and sampling parameters to ensure complete state preservation.


--- a/docs/reference/feature-matrix.md
+++ b/docs/reference/feature-matrix.md
@@ -95,8 +95,8 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full
 | **SLA-Based Planner** | ✅ | ✅ | — | | | | | | | |
 | **KV Block Manager** | ✅ | ✅ | ✅ | — | | | | | | |
 | **Multimodal** | ✅<sup>1</sup> | <sup>2</sup> | — | ✅ | — | | | | | |
-| **Request Migration** | 🚧<sup>3</sup> | ✅ | ✅ | ✅ | 🚧 | — | | | | |
-| **Request Cancellation** | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | — | | | |
+| **Request Migration** | ✅ | ✅ | ✅ | ✅ | 🚧 | — | | | | |
+| **Request Cancellation** | ✅<sup>3</sup> | ✅<sup>3</sup> | ✅<sup>3</sup> | ✅<sup>3</sup> | ✅<sup>3</sup> | ✅<sup>3</sup> | — | | | |
 | **LoRA** | | | | | | | | — | | |
 | **Tool Calling** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | — | |
 | **Speculative Decoding** | ✅ | ✅ | — | ✅ | — | ✅ | ✅ | | ✅ | — |
@@ -104,9 +104,7 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full
 > **Notes:**
 > 1. **Multimodal Disaggregation**: Fully supports **EP/D** (Traditional) pattern. **E/P/D** (Full Disaggregation) is WIP and currently supports pre-computed embeddings only. ([Source][mm-trtllm])
 > 2. **Multimodal + KV-Aware Routing**: Not supported. The KV router currently tracks token-based blocks only. ([Source][kv-routing])
-> 3. **Request Migration**: Supported on **Decode/Aggregated** workers only. **Prefill** workers do not support migration. ([Source][trtllm-readme])
-> 4. **Speculative Decoding**: Llama 4 + Eagle support documented. ([Source][trtllm-eagle])
-> 5. **Request Cancellation**: Due to known issues, the TensorRT-LLM engine is temporarily not notified of request cancellations, meaning allocated resources for cancelled requests are not freed.
+> 3. **Request Cancellation**: Due to known issues, the TensorRT-LLM engine is temporarily not notified of request cancellations, meaning allocated resources for cancelled requests are not freed.

 ---


--- a/launch/dynamo-run/src/flags.rs
+++ b/launch/dynamo-run/src/flags.rs
@@ -151,11 +151,6 @@ impl Flags {
                        "'--kv-cache-block-size' flag should only be used on the worker node, not on the ingress"
                    );
                }
-                if self.migration_limit.is_some() {
-                    anyhow::bail!(
-                        "'--migration-limit' flag should only be used on the worker node, not on the ingress"
-                    );
-                }
            }
            Output::Echo => {}
            #[cfg(feature = "mistralrs")]

--- a/launch/dynamo-run/src/lib.rs
+++ b/launch/dynamo-run/src/lib.rs
@@ -62,8 +62,8 @@ pub async fn run(
        .tls_cert_path(flags.tls_cert_path.take())
        .tls_key_path(flags.tls_key_path.take())
        .router_config(Some(flags.router_config()))
-        .request_template(flags.request_template.clone())
        .migration_limit(flags.migration_limit)
+        .request_template(flags.request_template.clone())
        .is_mocker(matches!(out_opt, Some(Output::Mocker)));

    // Only the worker has a model path

--- a/lib/bindings/c/src/lib.rs
+++ b/lib/bindings/c/src/lib.rs
@@ -1377,6 +1377,7 @@ pub async fn create_worker_selection_pipeline_chat(
        component.drt().clone(),
        model_manager.clone(),
        router_config,
+        0, // migration_limit - default to 0 for C bindings
        None,
        metrics.clone(),
    );
@@ -1498,6 +1499,7 @@ pub async fn create_worker_selection_pipeline_chat(
        hf_tokenizer,
        prefill_chooser,
        enforce_disagg,
+        0, // migration_limit - default to 0 for C bindings
        metrics,
    )
    .await?;

--- a/lib/bindings/python/rust/lib.rs
+++ b/lib/bindings/python/rust/lib.rs
@@ -225,7 +225,7 @@ fn lora_name_to_id(lora_name: &str) -> i32 {
 /// For LoRA mode, both `lora_name` and `base_model_path` must be provided together.
 /// Providing only one of them will result in an error.
 #[pyfunction]
-#[pyo3(signature = (model_input, model_type, endpoint, model_path, model_name=None, context_length=None, kv_cache_block_size=None, router_mode=None, migration_limit=0, runtime_config=None, user_data=None, custom_template_path=None, media_decoder=None, media_fetcher=None, lora_name=None, base_model_path=None))]
+#[pyo3(signature = (model_input, model_type, endpoint, model_path, model_name=None, context_length=None, kv_cache_block_size=None, router_mode=None, runtime_config=None, user_data=None, custom_template_path=None, media_decoder=None, media_fetcher=None, lora_name=None, base_model_path=None))]
 #[allow(clippy::too_many_arguments)]
 fn register_llm<'p>(
    py: Python<'p>,
@@ -237,7 +237,6 @@ fn register_llm<'p>(
    context_length: Option<u32>,
    kv_cache_block_size: Option<u32>,
    router_mode: Option<RouterMode>,
-    migration_limit: u32,
    runtime_config: Option<ModelRuntimeConfig>,
    user_data: Option<&Bound<'p, PyDict>>,
    custom_template_path: Option<&str>,
@@ -247,18 +246,13 @@ fn register_llm<'p>(
    base_model_path: Option<&str>,
 ) -> PyResult<Bound<'p, PyAny>> {
    // Validate Prefill model type requirements
-    if model_type.inner == llm_rs::model_type::ModelType::Prefill {
-        if !matches!(model_input, ModelInput::Tokens) {
+    if model_type.inner == llm_rs::model_type::ModelType::Prefill
+        && !matches!(model_input, ModelInput::Tokens)
+    {
        return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
            "ModelType::Prefill requires model_input to be ModelInput::Tokens",
        ));
    }
-        if migration_limit != 0 {
-            return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
-                "ModelType::Prefill requires migration_limit to be 0",
-            ));
-        }
-    }

    let model_input = match model_input {
        ModelInput::Text => llm_rs::model_type::ModelInput::Text,
@@ -364,7 +358,6 @@ fn register_llm<'p>(
            .context_length(context_length)
            .kv_cache_block_size(kv_cache_block_size)
            .router_config(Some(router_config))
-            .migration_limit(Some(migration_limit))
            .runtime_config(runtime_config.unwrap_or_default().inner)
            .user_data(user_data_json)
            .custom_template_path(custom_template_path_owned)

--- a/lib/bindings/python/rust/llm/entrypoint.rs
+++ b/lib/bindings/python/rust/llm/entrypoint.rs
@@ -172,6 +172,7 @@ pub(crate) struct EntrypointArgs {
    extra_engine_args: Option<PathBuf>,
    namespace: Option<String>,
    is_prefill: bool,
+    migration_limit: u32,
    engine_factory: Option<PyEngineFactory>,
 }

@@ -179,7 +180,7 @@ pub(crate) struct EntrypointArgs {
 impl EntrypointArgs {
    #[allow(clippy::too_many_arguments)]
    #[new]
-    #[pyo3(signature = (engine_type, model_path=None, model_name=None, endpoint_id=None, context_length=None, template_file=None, router_config=None, kv_cache_block_size=None, http_host=None, http_port=None, http_metrics_port=None, tls_cert_path=None, tls_key_path=None, extra_engine_args=None, namespace=None, is_prefill=false, engine_factory=None))]
+    #[pyo3(signature = (engine_type, model_path=None, model_name=None, endpoint_id=None, context_length=None, template_file=None, router_config=None, kv_cache_block_size=None, http_host=None, http_port=None, http_metrics_port=None, tls_cert_path=None, tls_key_path=None, extra_engine_args=None, namespace=None, is_prefill=false, migration_limit=0, engine_factory=None))]
    pub fn new(
        py: Python<'_>,
        engine_type: EngineType,
@@ -198,6 +199,7 @@ impl EntrypointArgs {
        extra_engine_args: Option<PathBuf>,
        namespace: Option<String>,
        is_prefill: bool,
+        migration_limit: u32,
        engine_factory: Option<PyObject>,
    ) -> PyResult<Self> {
        let endpoint_id_obj: Option<EndpointId> = endpoint_id.as_deref().map(EndpointId::from);
@@ -242,6 +244,7 @@ impl EntrypointArgs {
            extra_engine_args,
            namespace,
            is_prefill,
+            migration_limit,
            engine_factory,
        })
    }
@@ -274,6 +277,7 @@ pub fn make_engine<'p>(
        .request_template(args.template_file.clone())
        .kv_cache_block_size(args.kv_cache_block_size)
        .router_config(args.router_config.clone().map(|rc| rc.into()))
+        .migration_limit(Some(args.migration_limit))
        .http_host(args.http_host.clone())
        .http_port(args.http_port)
        .http_metrics_port(args.http_metrics_port)

--- a/lib/bindings/python/src/dynamo/_core.pyi
+++ b/lib/bindings/python/src/dynamo/_core.pyi
@@ -979,7 +979,6 @@ async def register_llm(
    context_length: Optional[int] = None,
    kv_cache_block_size: Optional[int] = None,
    router_mode: Optional[RouterMode] = None,
-    migration_limit: int = 0,
    runtime_config: Optional[ModelRuntimeConfig] = None,
    user_data: Optional[Dict[str, Any]] = None,
    custom_template_path: Optional[str] = None,