"lib/llm/src/vscode:/vscode.git/clone" did not exist on "f437c8cf442e7169ad70a501cb13e645c116ea2a"
Unverified Commit 1ffa489e authored by Jacky, committed by GitHub
Browse files

refactor: Move --migration-limit flag from backend to frontend (#5918)


Signed-off-by: Jacky <18255193+kthui@users.noreply.github.com>
parent 3842b244
...@@ -225,6 +225,12 @@ def parse_args(): ...@@ -225,6 +225,12 @@ def parse_args():
default=False, default=False,
help="Enforce disaggregated prefill-decode. When set, unactivated prefill router will return an error instead of falling back to decode-only mode.", help="Enforce disaggregated prefill-decode. When set, unactivated prefill router will return an error instead of falling back to decode-only mode.",
) )
parser.add_argument(
"--migration-limit",
type=int,
default=0,
help="Maximum number of times a request may be migrated to a different engine worker. When > 0, enables request migration on worker disconnect (default: 0).",
)
parser.add_argument( parser.add_argument(
"--active-decode-blocks-threshold", "--active-decode-blocks-threshold",
type=float, type=float,
...@@ -304,6 +310,8 @@ def parse_args(): ...@@ -304,6 +310,8 @@ def parse_args():
if bool(flags.tls_cert_path) ^ bool(flags.tls_key_path): # ^ is XOR if bool(flags.tls_cert_path) ^ bool(flags.tls_key_path): # ^ is XOR
parser.error("--tls-cert-path and --tls-key-path must be provided together") parser.error("--tls-cert-path and --tls-key-path must be provided together")
if flags.migration_limit < 0 or flags.migration_limit > 4294967295:
parser.error("--migration-limit must be between 0 and 4294967295 (0=disabled)")
return flags return flags
...@@ -324,6 +332,10 @@ async def async_main(): ...@@ -324,6 +332,10 @@ async def async_main():
flags = parse_args() flags = parse_args()
dump_config(flags.dump_config_to, flags) dump_config(flags.dump_config_to, flags)
os.environ["DYN_EVENT_PLANE"] = flags.event_plane os.environ["DYN_EVENT_PLANE"] = flags.event_plane
logger.info(
f"Request migration {'enabled' if flags.migration_limit > 0 else 'disabled'} "
f"(limit: {flags.migration_limit})"
)
# Warn if DYN_SYSTEM_PORT is set (frontend doesn't use system metrics server) # Warn if DYN_SYSTEM_PORT is set (frontend doesn't use system metrics server)
if os.environ.get("DYN_SYSTEM_PORT"): if os.environ.get("DYN_SYSTEM_PORT"):
logger.warning( logger.warning(
...@@ -393,6 +405,7 @@ async def async_main(): ...@@ -393,6 +405,7 @@ async def async_main():
active_prefill_tokens_threshold_frac=flags.active_prefill_tokens_threshold_frac, active_prefill_tokens_threshold_frac=flags.active_prefill_tokens_threshold_frac,
enforce_disagg=flags.enforce_disagg, enforce_disagg=flags.enforce_disagg,
), ),
"migration_limit": flags.migration_limit,
} }
if flags.model_name: if flags.model_name:
......
...@@ -35,12 +35,6 @@ DYNAMO_ARGS: Dict[str, Dict[str, Any]] = { ...@@ -35,12 +35,6 @@ DYNAMO_ARGS: Dict[str, Dict[str, Any]] = {
"type": str, "type": str,
"help": f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Example: {DEFAULT_ENDPOINT}", "help": f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Example: {DEFAULT_ENDPOINT}",
}, },
"migration-limit": {
"flags": ["--migration-limit"],
"type": int,
"default": 0,
"help": "Maximum number of times a request may be migrated to a different engine worker",
},
"tool-call-parser": { "tool-call-parser": {
"flags": ["--dyn-tool-call-parser"], "flags": ["--dyn-tool-call-parser"],
"type": str, "type": str,
...@@ -159,7 +153,6 @@ class DynamoArgs: ...@@ -159,7 +153,6 @@ class DynamoArgs:
namespace: str namespace: str
component: str component: str
endpoint: str endpoint: str
migration_limit: int
store_kv: str store_kv: str
request_plane: str request_plane: str
event_plane: str event_plane: str
...@@ -588,7 +581,6 @@ async def parse_args(args: list[str]) -> Config: ...@@ -588,7 +581,6 @@ async def parse_args(args: list[str]) -> Config:
namespace=parsed_namespace, namespace=parsed_namespace,
component=parsed_component_name, component=parsed_component_name,
endpoint=parsed_endpoint_name, endpoint=parsed_endpoint_name,
migration_limit=parsed_args.migration_limit,
store_kv=parsed_args.store_kv, store_kv=parsed_args.store_kv,
request_plane=parsed_args.request_plane, request_plane=parsed_args.request_plane,
event_plane=parsed_args.event_plane, event_plane=parsed_args.event_plane,
......
...@@ -56,7 +56,6 @@ async def _register_llm_with_runtime_config( ...@@ -56,7 +56,6 @@ async def _register_llm_with_runtime_config(
server_args.model_path, server_args.model_path,
server_args.served_model_name, server_args.served_model_name,
kv_cache_block_size=server_args.page_size, kv_cache_block_size=server_args.page_size,
migration_limit=dynamo_args.migration_limit,
runtime_config=runtime_config, runtime_config=runtime_config,
custom_template_path=dynamo_args.custom_jinja_template, custom_template_path=dynamo_args.custom_jinja_template,
) )
......
...@@ -462,7 +462,6 @@ async def init( ...@@ -462,7 +462,6 @@ async def init(
config.model_path, config.model_path,
config.served_model_name, config.served_model_name,
kv_cache_block_size=config.kv_block_size, kv_cache_block_size=config.kv_block_size,
migration_limit=config.migration_limit,
runtime_config=runtime_config, runtime_config=runtime_config,
custom_template_path=config.custom_jinja_template, custom_template_path=config.custom_jinja_template,
) )
......
...@@ -40,7 +40,6 @@ class Config: ...@@ -40,7 +40,6 @@ class Config:
self.expert_parallel_size: Optional[int] = None self.expert_parallel_size: Optional[int] = None
self.enable_attention_dp: bool = False self.enable_attention_dp: bool = False
self.kv_block_size: int = 32 self.kv_block_size: int = 32
self.migration_limit: int = 0
self.gpus_per_node: Optional[int] = None self.gpus_per_node: Optional[int] = None
self.max_batch_size: int = BuildConfig.model_fields["max_batch_size"].default self.max_batch_size: int = BuildConfig.model_fields["max_batch_size"].default
self.max_num_tokens: int = BuildConfig.model_fields["max_num_tokens"].default self.max_num_tokens: int = BuildConfig.model_fields["max_num_tokens"].default
...@@ -88,7 +87,6 @@ class Config: ...@@ -88,7 +87,6 @@ class Config:
f"free_gpu_memory_fraction={self.free_gpu_memory_fraction}, " f"free_gpu_memory_fraction={self.free_gpu_memory_fraction}, "
f"extra_engine_args={self.extra_engine_args}, " f"extra_engine_args={self.extra_engine_args}, "
f"override_engine_args={self.override_engine_args}, " f"override_engine_args={self.override_engine_args}, "
f"migration_limit={self.migration_limit}, "
f"publish_events_and_metrics={self.publish_events_and_metrics}, " f"publish_events_and_metrics={self.publish_events_and_metrics}, "
f"disaggregation_mode={self.disaggregation_mode}, " f"disaggregation_mode={self.disaggregation_mode}, "
f"encode_endpoint={self.encode_endpoint}, " f"encode_endpoint={self.encode_endpoint}, "
...@@ -196,12 +194,6 @@ def cmd_line_args(): ...@@ -196,12 +194,6 @@ def cmd_line_args():
parser.add_argument( parser.add_argument(
"--kv-block-size", type=int, default=32, help="Size of a KV cache block." "--kv-block-size", type=int, default=32, help="Size of a KV cache block."
) )
parser.add_argument(
"--migration-limit",
type=int,
default=0,
help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
)
parser.add_argument( parser.add_argument(
"--gpus-per-node", "--gpus-per-node",
type=int, type=int,
...@@ -416,7 +408,6 @@ def cmd_line_args(): ...@@ -416,7 +408,6 @@ def cmd_line_args():
config.max_seq_len = args.max_seq_len config.max_seq_len = args.max_seq_len
config.max_beam_width = args.max_beam_width config.max_beam_width = args.max_beam_width
config.kv_block_size = args.kv_block_size config.kv_block_size = args.kv_block_size
config.migration_limit = args.migration_limit
config.extra_engine_args = args.extra_engine_args config.extra_engine_args = args.extra_engine_args
config.override_engine_args = args.override_engine_args config.override_engine_args = args.override_engine_args
config.publish_events_and_metrics = args.publish_events_and_metrics config.publish_events_and_metrics = args.publish_events_and_metrics
......
...@@ -36,7 +36,6 @@ class Config: ...@@ -36,7 +36,6 @@ class Config:
endpoint: str endpoint: str
is_prefill_worker: bool is_prefill_worker: bool
is_decode_worker: bool is_decode_worker: bool
migration_limit: int = 0
custom_jinja_template: Optional[str] = None custom_jinja_template: Optional[str] = None
store_kv: str store_kv: str
request_plane: str request_plane: str
...@@ -138,12 +137,6 @@ def parse_args() -> Config: ...@@ -138,12 +137,6 @@ def parse_args() -> Config:
action="store_true", action="store_true",
help="Mark this as a decode worker which does not publish KV events.", help="Mark this as a decode worker which does not publish KV events.",
) )
parser.add_argument(
"--migration-limit",
type=int,
default=0,
help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
)
parser.add_argument( parser.add_argument(
"--connector", "--connector",
nargs="*", nargs="*",
...@@ -436,7 +429,6 @@ def parse_args() -> Config: ...@@ -436,7 +429,6 @@ def parse_args() -> Config:
config.engine_args = engine_args config.engine_args = engine_args
config.is_prefill_worker = args.is_prefill_worker config.is_prefill_worker = args.is_prefill_worker
config.is_decode_worker = args.is_decode_worker config.is_decode_worker = args.is_decode_worker
config.migration_limit = args.migration_limit
config.tool_call_parser = args.dyn_tool_call_parser config.tool_call_parser = args.dyn_tool_call_parser
config.reasoning_parser = args.dyn_reasoning_parser config.reasoning_parser = args.dyn_reasoning_parser
config.custom_jinja_template = args.custom_jinja_template config.custom_jinja_template = args.custom_jinja_template
......
...@@ -499,7 +499,6 @@ async def register_vllm_model( ...@@ -499,7 +499,6 @@ async def register_vllm_model(
config: Config, config: Config,
engine_client: AsyncLLM, engine_client: AsyncLLM,
vllm_config, vllm_config,
migration_limit: int,
): ):
""" """
Helper function to register a vLLM model with runtime configuration. Helper function to register a vLLM model with runtime configuration.
...@@ -511,7 +510,6 @@ async def register_vllm_model( ...@@ -511,7 +510,6 @@ async def register_vllm_model(
config: Configuration object config: Configuration object
engine_client: vLLM engine client engine_client: vLLM engine client
vllm_config: vLLM configuration vllm_config: vLLM configuration
migration_limit: Migration limit for the model
""" """
runtime_config = ModelRuntimeConfig() runtime_config = ModelRuntimeConfig()
...@@ -559,7 +557,6 @@ async def register_vllm_model( ...@@ -559,7 +557,6 @@ async def register_vllm_model(
config.model, config.model,
config.served_model_name, config.served_model_name,
kv_cache_block_size=config.engine_args.block_size, kv_cache_block_size=config.engine_args.block_size,
migration_limit=migration_limit,
runtime_config=runtime_config, runtime_config=runtime_config,
custom_template_path=config.custom_jinja_template, custom_template_path=config.custom_jinja_template,
media_decoder=media_decoder, media_decoder=media_decoder,
...@@ -660,7 +657,6 @@ async def init_prefill( ...@@ -660,7 +657,6 @@ async def init_prefill(
config, config,
engine_client, engine_client,
vllm_config, vllm_config,
migration_limit=0, # Prefill doesn't support migration
) )
health_check_payload = VllmPrefillHealthCheckPayload( health_check_payload = VllmPrefillHealthCheckPayload(
...@@ -813,7 +809,6 @@ async def init( ...@@ -813,7 +809,6 @@ async def init(
config, config,
engine_client, engine_client,
vllm_config, vllm_config,
migration_limit=config.migration_limit,
) )
health_check_payload = VllmHealthCheckPayload( health_check_payload = VllmHealthCheckPayload(
...@@ -827,7 +822,7 @@ async def init( ...@@ -827,7 +822,7 @@ async def init(
# because waiting them to finish can take a long time for long OSLs # because waiting them to finish can take a long time for long OSLs
generate_endpoint.serve_endpoint( generate_endpoint.serve_endpoint(
handler.generate, handler.generate,
graceful_shutdown=config.migration_limit <= 0, graceful_shutdown=True,
metrics_labels=[("model", config.served_model_name or config.model)], metrics_labels=[("model", config.served_model_name or config.model)],
health_check_payload=health_check_payload, health_check_payload=health_check_payload,
), ),
......
...@@ -55,7 +55,6 @@ Dynamo SGLang uses SGLang's native argument parser, so **most SGLang engine argu ...@@ -55,7 +55,6 @@ Dynamo SGLang uses SGLang's native argument parser, so **most SGLang engine argu
| Argument | Description | Default | SGLang Equivalent | | Argument | Description | Default | SGLang Equivalent |
|----------|-------------|---------|-------------------| |----------|-------------|---------|-------------------|
| `--endpoint` | Dynamo endpoint in `dyn://namespace.component.endpoint` format | Auto-generated based on mode | N/A | | `--endpoint` | Dynamo endpoint in `dyn://namespace.component.endpoint` format | Auto-generated based on mode | N/A |
| `--migration-limit` | Max times a request can migrate between workers for fault tolerance. See [Request Migration Architecture](../../fault_tolerance/request_migration.md). | `0` (disabled) | N/A |
| `--dyn-tool-call-parser` | Tool call parser for structured outputs (takes precedence over `--tool-call-parser`) | `None` | `--tool-call-parser` | | `--dyn-tool-call-parser` | Tool call parser for structured outputs (takes precedence over `--tool-call-parser`) | `None` | `--tool-call-parser` |
| `--dyn-reasoning-parser` | Reasoning parser for CoT models (takes precedence over `--reasoning-parser`) | `None` | `--reasoning-parser` | | `--dyn-reasoning-parser` | Reasoning parser for CoT models (takes precedence over `--reasoning-parser`) | `None` | `--reasoning-parser` |
| `--use-sglang-tokenizer` | Use SGLang's tokenizer instead of Dynamo's | `False` | N/A | | `--use-sglang-tokenizer` | Use SGLang's tokenizer instead of Dynamo's | `False` | N/A |
......
...@@ -193,17 +193,7 @@ Dynamo with TensorRT-LLM supports two methods for transferring KV cache in disag ...@@ -193,17 +193,7 @@ Dynamo with TensorRT-LLM supports two methods for transferring KV cache in disag
## Request Migration ## Request Migration
You can enable [request migration](../../../docs/fault_tolerance/request_migration.md) to handle worker failures gracefully. Use the `--migration-limit` flag to specify how many times a request can be migrated to another worker: Dynamo supports [request migration](../../../docs/fault_tolerance/request_migration.md) to handle worker failures gracefully. When enabled, requests can be automatically migrated to healthy workers if a worker fails mid-generation. See the [Request Migration Architecture](../../../docs/fault_tolerance/request_migration.md) documentation for configuration details.
```bash
# For decode and aggregated workers
python3 -m dynamo.trtllm ... --migration-limit=3
```
> [!IMPORTANT]
> **Prefill workers do not support request migration** and must use `--migration-limit=0` (the default). Prefill workers only process prompts and return KV cache state - they don't maintain long-running generation requests that would benefit from migration.
See the [Request Migration Architecture](../../../docs/fault_tolerance/request_migration.md) documentation for details on how this works.
## Request Cancellation ## Request Cancellation
......
...@@ -183,13 +183,7 @@ See the high-level notes in [Router Design](../../design_docs/router_design.md#d ...@@ -183,13 +183,7 @@ See the high-level notes in [Router Design](../../design_docs/router_design.md#d
## Request Migration ## Request Migration
You can enable [request migration](../../../docs/fault_tolerance/request_migration.md) to handle worker failures gracefully. Use the `--migration-limit` flag to specify how many times a request can be migrated to another worker: Dynamo supports [request migration](../../../docs/fault_tolerance/request_migration.md) to handle worker failures gracefully. When enabled, requests can be automatically migrated to healthy workers if a worker fails mid-generation. See the [Request Migration Architecture](../../../docs/fault_tolerance/request_migration.md) documentation for configuration details.
```bash
python3 -m dynamo.vllm ... --migration-limit=3
```
This allows a request to be migrated up to 3 times before failing. See the [Request Migration Architecture](../../../docs/fault_tolerance/request_migration.md) documentation for details on how this works.
## Request Cancellation ## Request Cancellation
......
...@@ -73,7 +73,6 @@ The `model_type` can be: ...@@ -73,7 +73,6 @@ The `model_type` can be:
- `model_name`: The name to call the model. Your incoming HTTP requests model name must match this. Defaults to the hugging face repo name or the folder name. - `model_name`: The name to call the model. Your incoming HTTP requests model name must match this. Defaults to the hugging face repo name or the folder name.
- `context_length`: Max model length in tokens. Defaults to the model's set max. Only set this if you need to reduce KV cache allocation to fit into VRAM. - `context_length`: Max model length in tokens. Defaults to the model's set max. Only set this if you need to reduce KV cache allocation to fit into VRAM.
- `kv_cache_block_size`: Size of a KV block for the engine, in tokens. Defaults to 16. - `kv_cache_block_size`: Size of a KV block for the engine, in tokens. Defaults to 16.
- `migration_limit`: Maximum number of times a request may be [migrated to another Instance](../fault_tolerance/request_migration.md). Defaults to 0.
- `user_data`: Optional dictionary containing custom metadata for worker behavior (e.g., LoRA configuration). Defaults to None. - `user_data`: Optional dictionary containing custom metadata for worker behavior (e.g., LoRA configuration). Defaults to None.
See `examples/backends` for full code examples. See `examples/backends` for full code examples.
......
...@@ -83,30 +83,16 @@ generate_endpoint.serve_endpoint( ...@@ -83,30 +83,16 @@ generate_endpoint.serve_endpoint(
|-----------|------------------|-----------| |-----------|------------------|-----------|
| **Frontend** | N/A (HTTP server) | HTTP server handles its own shutdown | | **Frontend** | N/A (HTTP server) | HTTP server handles its own shutdown |
| **Prefill Workers** | `graceful_shutdown=True` | Prefill operations must complete to avoid wasted computation | | **Prefill Workers** | `graceful_shutdown=True` | Prefill operations must complete to avoid wasted computation |
| **Decode Workers** | Conditional | If migration is enabled (`migration_limit > 0`), shutdown immediately to allow migration; otherwise wait | | **Decode Workers** | `graceful_shutdown=True` | Decode operations should complete to avoid wasted computation |
| **Router** | `graceful_shutdown=True` | Ensure routing decisions complete | | **Router** | `graceful_shutdown=True` | Ensure routing decisions complete |
### Decode Worker Migration Integration ### Migration Integration
Decode workers use conditional draining based on whether request migration is supported: Backend workers always use `graceful_shutdown=True`, meaning they wait for in-flight requests to complete until the engine is stopped. Request migration is configured at the **frontend** level via `--migration-limit`:
```python - When migration is enabled at the frontend, disconnected streams from failed workers are automatically retried on healthy workers
generate_endpoint.serve_endpoint( - Workers don't need to know about migration configuration - they simply complete their work or signal incomplete streams
handler.generate, - See [Request Migration Architecture](./request_migration.md) for details on how migration works
graceful_shutdown=config.migration_limit <= 0, # If no migration, wait for requests
...
)
```
When `migration_limit > 0`:
- Worker shuts down immediately (`graceful_shutdown=False`)
- In-flight requests are migrated to healthy workers
- No request loss occurs
When `migration_limit <= 0`:
- Worker waits for in-flight requests (`graceful_shutdown=True`)
- Migration is not available
- Requests complete on the shutting-down worker
## Resource Cleanup ## Resource Cleanup
...@@ -233,15 +219,15 @@ Match `terminationGracePeriodSeconds` to your expected request completion time: ...@@ -233,15 +219,15 @@ Match `terminationGracePeriodSeconds` to your expected request completion time:
- Short requests (< 10s): 30s grace period - Short requests (< 10s): 30s grace period
- Long generation (> 30s): 120s+ grace period - Long generation (> 30s): 120s+ grace period
### 2. Enable Request Migration for Decode Workers ### 2. Enable Request Migration
If using disaggregated serving, enable migration for decode workers: Enable migration at the frontend to allow request recovery when workers shut down:
```python ```bash
--migration-limit 3 # Allow up to 3 migration attempts python3 -m dynamo.frontend ... --migration-limit 3 # Allow up to 3 migration attempts
``` ```
This allows immediate shutdown while preserving request state. This allows the frontend to automatically retry disconnected streams on healthy workers.
### 3. Monitor Shutdown Metrics ### 3. Monitor Shutdown Metrics
......
...@@ -20,12 +20,11 @@ Key responsibilities: ...@@ -20,12 +20,11 @@ Key responsibilities:
### Migration Limit Configuration ### Migration Limit Configuration
Each model can be configured with a migration limit parameter that specifies the maximum number of times a request can be migrated to another worker: The migration limit is configured at the **frontend** level and applies globally to all models served by that frontend. This parameter specifies the maximum number of times a request can be migrated to another worker:
- Default behavior: no migration allowed - Default behavior: no migration allowed (migration_limit=0)
- Can be set independently for different engine types - Set via `--migration-limit` flag on the frontend
- Applicable to LLM worker nodes that perform inference - Applies to all models served by the frontend
- Allows engines to override user-specified limits for compatibility
## Token State Tracking and Request Migration ## Token State Tracking and Request Migration
...@@ -101,9 +100,7 @@ This token accumulation mechanism ensures that migrations are truly seamless, pr ...@@ -101,9 +100,7 @@ This token accumulation mechanism ensures that migrations are truly seamless, pr
The migration system is designed with several important architectural considerations: The migration system is designed with several important architectural considerations:
**Engine Compatibility**: Different LLM engines may have varying capabilities for handling migrated requests. The system allows engines to override migration settings to ensure compatibility and correctness. **Multi-Model Support**: Since a frontend may serve multiple models simultaneously, the migration limit is configured at the frontend level and applies uniformly to all models, simplifying operational management.
**Multi-Model Support**: Since a frontend may serve multiple models simultaneously, migration limits can be configured at the engine level, providing flexibility for different model types with varying reliability characteristics.
**State Management**: The system carefully tracks not only token sequences but also metadata such as remaining token budgets, stop conditions, and sampling parameters to ensure complete state preservation. **State Management**: The system carefully tracks not only token sequences but also metadata such as remaining token budgets, stop conditions, and sampling parameters to ensure complete state preservation.
......
...@@ -95,8 +95,8 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full ...@@ -95,8 +95,8 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full
| **SLA-Based Planner** | ✅ | ✅ | — | | | | | | | | | **SLA-Based Planner** | ✅ | ✅ | — | | | | | | | |
| **KV Block Manager** | ✅ | ✅ | ✅ | — | | | | | | | | **KV Block Manager** | ✅ | ✅ | ✅ | — | | | | | | |
| **Multimodal** | ✅<sup>1</sup> | <sup>2</sup> | — | ✅ | — | | | | | | | **Multimodal** | ✅<sup>1</sup> | <sup>2</sup> | — | ✅ | — | | | | | |
| **Request Migration** | 🚧<sup>3</sup> | ✅ | ✅ | ✅ | 🚧 | — | | | | | | **Request Migration** | | ✅ | ✅ | ✅ | 🚧 | — | | | | |
| **Request Cancellation** | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | ✅<sup>5</sup> | — | | | | | **Request Cancellation** | ✅<sup>3</sup> | ✅<sup>3</sup> | ✅<sup>3</sup> | ✅<sup>3</sup> | ✅<sup>3</sup> | ✅<sup>3</sup> | — | | | |
| **LoRA** | | | | | | | | — | | | | **LoRA** | | | | | | | | — | | |
| **Tool Calling** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | — | | | **Tool Calling** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | — | |
| **Speculative Decoding** | ✅ | ✅ | — | ✅ | — | ✅ | ✅ | | ✅ | — | | **Speculative Decoding** | ✅ | ✅ | — | ✅ | — | ✅ | ✅ | | ✅ | — |
...@@ -104,9 +104,7 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full ...@@ -104,9 +104,7 @@ TensorRT-LLM delivers maximum inference performance and optimization, with full
> **Notes:** > **Notes:**
> 1. **Multimodal Disaggregation**: Fully supports **EP/D** (Traditional) pattern. **E/P/D** (Full Disaggregation) is WIP and currently supports pre-computed embeddings only. ([Source][mm-trtllm]) > 1. **Multimodal Disaggregation**: Fully supports **EP/D** (Traditional) pattern. **E/P/D** (Full Disaggregation) is WIP and currently supports pre-computed embeddings only. ([Source][mm-trtllm])
> 2. **Multimodal + KV-Aware Routing**: Not supported. The KV router currently tracks token-based blocks only. ([Source][kv-routing]) > 2. **Multimodal + KV-Aware Routing**: Not supported. The KV router currently tracks token-based blocks only. ([Source][kv-routing])
> 3. **Request Migration**: Supported on **Decode/Aggregated** workers only. **Prefill** workers do not support migration. ([Source][trtllm-readme]) > 3. **Request Cancellation**: Due to known issues, the TensorRT-LLM engine is temporarily not notified of request cancellations, meaning allocated resources for cancelled requests are not freed.
> 4. **Speculative Decoding**: Llama 4 + Eagle support documented. ([Source][trtllm-eagle])
> 5. **Request Cancellation**: Due to known issues, the TensorRT-LLM engine is temporarily not notified of request cancellations, meaning allocated resources for cancelled requests are not freed.
--- ---
......
...@@ -151,11 +151,6 @@ impl Flags { ...@@ -151,11 +151,6 @@ impl Flags {
"'--kv-cache-block-size' flag should only be used on the worker node, not on the ingress" "'--kv-cache-block-size' flag should only be used on the worker node, not on the ingress"
); );
} }
if self.migration_limit.is_some() {
anyhow::bail!(
"'--migration-limit' flag should only be used on the worker node, not on the ingress"
);
}
} }
Output::Echo => {} Output::Echo => {}
#[cfg(feature = "mistralrs")] #[cfg(feature = "mistralrs")]
......
...@@ -62,8 +62,8 @@ pub async fn run( ...@@ -62,8 +62,8 @@ pub async fn run(
.tls_cert_path(flags.tls_cert_path.take()) .tls_cert_path(flags.tls_cert_path.take())
.tls_key_path(flags.tls_key_path.take()) .tls_key_path(flags.tls_key_path.take())
.router_config(Some(flags.router_config())) .router_config(Some(flags.router_config()))
.request_template(flags.request_template.clone())
.migration_limit(flags.migration_limit) .migration_limit(flags.migration_limit)
.request_template(flags.request_template.clone())
.is_mocker(matches!(out_opt, Some(Output::Mocker))); .is_mocker(matches!(out_opt, Some(Output::Mocker)));
// Only the worker has a model path // Only the worker has a model path
......
...@@ -1377,6 +1377,7 @@ pub async fn create_worker_selection_pipeline_chat( ...@@ -1377,6 +1377,7 @@ pub async fn create_worker_selection_pipeline_chat(
component.drt().clone(), component.drt().clone(),
model_manager.clone(), model_manager.clone(),
router_config, router_config,
0, // migration_limit - default to 0 for C bindings
None, None,
metrics.clone(), metrics.clone(),
); );
...@@ -1498,6 +1499,7 @@ pub async fn create_worker_selection_pipeline_chat( ...@@ -1498,6 +1499,7 @@ pub async fn create_worker_selection_pipeline_chat(
hf_tokenizer, hf_tokenizer,
prefill_chooser, prefill_chooser,
enforce_disagg, enforce_disagg,
0, // migration_limit - default to 0 for C bindings
metrics, metrics,
) )
.await?; .await?;
......
...@@ -225,7 +225,7 @@ fn lora_name_to_id(lora_name: &str) -> i32 { ...@@ -225,7 +225,7 @@ fn lora_name_to_id(lora_name: &str) -> i32 {
/// For LoRA mode, both `lora_name` and `base_model_path` must be provided together. /// For LoRA mode, both `lora_name` and `base_model_path` must be provided together.
/// Providing only one of them will result in an error. /// Providing only one of them will result in an error.
#[pyfunction] #[pyfunction]
#[pyo3(signature = (model_input, model_type, endpoint, model_path, model_name=None, context_length=None, kv_cache_block_size=None, router_mode=None, migration_limit=0, runtime_config=None, user_data=None, custom_template_path=None, media_decoder=None, media_fetcher=None, lora_name=None, base_model_path=None))] #[pyo3(signature = (model_input, model_type, endpoint, model_path, model_name=None, context_length=None, kv_cache_block_size=None, router_mode=None, runtime_config=None, user_data=None, custom_template_path=None, media_decoder=None, media_fetcher=None, lora_name=None, base_model_path=None))]
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
fn register_llm<'p>( fn register_llm<'p>(
py: Python<'p>, py: Python<'p>,
...@@ -237,7 +237,6 @@ fn register_llm<'p>( ...@@ -237,7 +237,6 @@ fn register_llm<'p>(
context_length: Option<u32>, context_length: Option<u32>,
kv_cache_block_size: Option<u32>, kv_cache_block_size: Option<u32>,
router_mode: Option<RouterMode>, router_mode: Option<RouterMode>,
migration_limit: u32,
runtime_config: Option<ModelRuntimeConfig>, runtime_config: Option<ModelRuntimeConfig>,
user_data: Option<&Bound<'p, PyDict>>, user_data: Option<&Bound<'p, PyDict>>,
custom_template_path: Option<&str>, custom_template_path: Option<&str>,
...@@ -247,18 +246,13 @@ fn register_llm<'p>( ...@@ -247,18 +246,13 @@ fn register_llm<'p>(
base_model_path: Option<&str>, base_model_path: Option<&str>,
) -> PyResult<Bound<'p, PyAny>> { ) -> PyResult<Bound<'p, PyAny>> {
// Validate Prefill model type requirements // Validate Prefill model type requirements
if model_type.inner == llm_rs::model_type::ModelType::Prefill { if model_type.inner == llm_rs::model_type::ModelType::Prefill
if !matches!(model_input, ModelInput::Tokens) { && !matches!(model_input, ModelInput::Tokens)
{
return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>( return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
"ModelType::Prefill requires model_input to be ModelInput::Tokens", "ModelType::Prefill requires model_input to be ModelInput::Tokens",
)); ));
} }
if migration_limit != 0 {
return Err(PyErr::new::<pyo3::exceptions::PyValueError, _>(
"ModelType::Prefill requires migration_limit to be 0",
));
}
}
let model_input = match model_input { let model_input = match model_input {
ModelInput::Text => llm_rs::model_type::ModelInput::Text, ModelInput::Text => llm_rs::model_type::ModelInput::Text,
...@@ -364,7 +358,6 @@ fn register_llm<'p>( ...@@ -364,7 +358,6 @@ fn register_llm<'p>(
.context_length(context_length) .context_length(context_length)
.kv_cache_block_size(kv_cache_block_size) .kv_cache_block_size(kv_cache_block_size)
.router_config(Some(router_config)) .router_config(Some(router_config))
.migration_limit(Some(migration_limit))
.runtime_config(runtime_config.unwrap_or_default().inner) .runtime_config(runtime_config.unwrap_or_default().inner)
.user_data(user_data_json) .user_data(user_data_json)
.custom_template_path(custom_template_path_owned) .custom_template_path(custom_template_path_owned)
......
...@@ -172,6 +172,7 @@ pub(crate) struct EntrypointArgs { ...@@ -172,6 +172,7 @@ pub(crate) struct EntrypointArgs {
extra_engine_args: Option<PathBuf>, extra_engine_args: Option<PathBuf>,
namespace: Option<String>, namespace: Option<String>,
is_prefill: bool, is_prefill: bool,
migration_limit: u32,
engine_factory: Option<PyEngineFactory>, engine_factory: Option<PyEngineFactory>,
} }
...@@ -179,7 +180,7 @@ pub(crate) struct EntrypointArgs { ...@@ -179,7 +180,7 @@ pub(crate) struct EntrypointArgs {
impl EntrypointArgs { impl EntrypointArgs {
#[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)]
#[new] #[new]
#[pyo3(signature = (engine_type, model_path=None, model_name=None, endpoint_id=None, context_length=None, template_file=None, router_config=None, kv_cache_block_size=None, http_host=None, http_port=None, http_metrics_port=None, tls_cert_path=None, tls_key_path=None, extra_engine_args=None, namespace=None, is_prefill=false, engine_factory=None))] #[pyo3(signature = (engine_type, model_path=None, model_name=None, endpoint_id=None, context_length=None, template_file=None, router_config=None, kv_cache_block_size=None, http_host=None, http_port=None, http_metrics_port=None, tls_cert_path=None, tls_key_path=None, extra_engine_args=None, namespace=None, is_prefill=false, migration_limit=0, engine_factory=None))]
pub fn new( pub fn new(
py: Python<'_>, py: Python<'_>,
engine_type: EngineType, engine_type: EngineType,
...@@ -198,6 +199,7 @@ impl EntrypointArgs { ...@@ -198,6 +199,7 @@ impl EntrypointArgs {
extra_engine_args: Option<PathBuf>, extra_engine_args: Option<PathBuf>,
namespace: Option<String>, namespace: Option<String>,
is_prefill: bool, is_prefill: bool,
migration_limit: u32,
engine_factory: Option<PyObject>, engine_factory: Option<PyObject>,
) -> PyResult<Self> { ) -> PyResult<Self> {
let endpoint_id_obj: Option<EndpointId> = endpoint_id.as_deref().map(EndpointId::from); let endpoint_id_obj: Option<EndpointId> = endpoint_id.as_deref().map(EndpointId::from);
...@@ -242,6 +244,7 @@ impl EntrypointArgs { ...@@ -242,6 +244,7 @@ impl EntrypointArgs {
extra_engine_args, extra_engine_args,
namespace, namespace,
is_prefill, is_prefill,
migration_limit,
engine_factory, engine_factory,
}) })
} }
...@@ -274,6 +277,7 @@ pub fn make_engine<'p>( ...@@ -274,6 +277,7 @@ pub fn make_engine<'p>(
.request_template(args.template_file.clone()) .request_template(args.template_file.clone())
.kv_cache_block_size(args.kv_cache_block_size) .kv_cache_block_size(args.kv_cache_block_size)
.router_config(args.router_config.clone().map(|rc| rc.into())) .router_config(args.router_config.clone().map(|rc| rc.into()))
.migration_limit(Some(args.migration_limit))
.http_host(args.http_host.clone()) .http_host(args.http_host.clone())
.http_port(args.http_port) .http_port(args.http_port)
.http_metrics_port(args.http_metrics_port) .http_metrics_port(args.http_metrics_port)
......
...@@ -979,7 +979,6 @@ async def register_llm( ...@@ -979,7 +979,6 @@ async def register_llm(
context_length: Optional[int] = None, context_length: Optional[int] = None,
kv_cache_block_size: Optional[int] = None, kv_cache_block_size: Optional[int] = None,
router_mode: Optional[RouterMode] = None, router_mode: Optional[RouterMode] = None,
migration_limit: int = 0,
runtime_config: Optional[ModelRuntimeConfig] = None, runtime_config: Optional[ModelRuntimeConfig] = None,
user_data: Optional[Dict[str, Any]] = None, user_data: Optional[Dict[str, Any]] = None,
custom_template_path: Optional[str] = None, custom_template_path: Optional[str] = None,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment