// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 use std::collections::HashMap; use std::path::PathBuf; use clap::ValueEnum; use dynamo_llm::entrypoint::RouterConfig; use dynamo_llm::kv_router::KvRouterConfig; use dynamo_llm::mocker::protocols::MockEngineArgs; use dynamo_runtime::pipeline::RouterMode as RuntimeRouterMode; use crate::Output; /// Required options depend on the in and out choices #[derive(clap::Parser, Debug, Clone)] #[command(version, about, long_about = None)] pub struct Flags { /// The model. The options depend on the engine. /// /// The full list - only mistralrs supports all three currently: /// - Full path of a checked out Hugging Face repository containing safetensor files /// - Name of a Hugging Face repository, e.g 'google/flan-t5-small'. The model will be /// downloaded and cached. #[arg(index = 1)] pub model_path_pos: Option, // `--model-path`. The one above is `dynamo-run ` #[arg(long = "model-path")] pub model_path_flag: Option, /// HTTP port. `in=http` only /// If tls_cert_path and tls_key_path are provided, this will be TLS/HTTPS. #[arg(long, default_value = "8080")] pub http_port: u16, /// TLS certificate file #[arg(long, requires = "tls_key_path")] pub tls_cert_path: Option, /// TLS certificate key file #[arg(long, requires = "tls_cert_path")] pub tls_key_path: Option, /// The name of the model we are serving #[arg(long)] pub model_name: Option, /// Verbose output (-v for debug, -vv for trace) #[arg(short = 'v', action = clap::ArgAction::Count, default_value_t = 0)] pub verbosity: u8, /// If using `out=dyn` with multiple instances, this says how to route the requests. /// /// Mostly interesting for KV-aware routing. /// Defaults to RouterMode::RoundRobin #[arg(long, default_value = "round-robin")] pub router_mode: RouterMode, /// KV Router: Weight for overlap score in worker selection. /// Higher values prioritize KV cache reuse. Default: 1.0 #[arg(long)] pub kv_overlap_score_weight: Option, /// KV Router: Temperature for worker sampling via softmax. /// Higher values promote more randomness, and 0 fallbacks to deterministic. /// Default: 0.0 #[arg(long)] pub router_temperature: Option, /// KV Router: Whether to use KV events to maintain the view of cached blocks /// If false, would use ApproxKvRouter for predicting block creation / deletion /// based only on incoming requests at a timer. /// Default: true #[arg(long)] pub use_kv_events: Option, /// KV Router: Whether to enable replica synchronization across multiple router instances. /// When true, routers will publish and subscribe to events to maintain consistent state. /// Default: false #[arg(long)] pub router_replica_sync: Option, /// KV Router: Whether to track active blocks in the router for memory management. /// When false, the router will not maintain state about which blocks are active, /// reducing memory overhead but potentially affecting scheduling decisions. /// Default: true #[arg(long)] pub router_track_active_blocks: Option, /// Max model context length. Reduce this if you don't have enough VRAM for the full model /// context length (e.g. Llama 4). /// Defaults to the model's max, which is usually model_max_length in tokenizer_config.json. #[arg(long)] pub context_length: Option, /// KV cache block size (is this used? Maybe by Python vllm worker?) #[arg(long)] pub kv_cache_block_size: Option, /// Mocker engine only. /// Additional engine-specific arguments from a JSON file. /// Contains a mapping of parameter names to values. #[arg(long)] pub extra_engine_args: Option, /// Path to a JSON file containing default request fields. /// These fields will be merged with each request, but can be overridden by the request. /// Example file contents: /// { /// "model": "Qwen2.5-3B-Instruct", /// "temperature": 0.7, /// "max_completion_tokens": 4096 /// } #[arg(long)] pub request_template: Option, /// How many times a request can be migrated to another worker if the HTTP server lost /// connection to the current worker. #[arg(long, value_parser = clap::value_parser!(u32).range(0..1024))] pub migration_limit: Option, /// Which key-value backend to use: etcd, mem, file. /// Etcd uses the ETCD_* env vars (e.g. ETCD_ENPOINTS) for connection details. /// File uses root dir from env var DYN_FILE_KV or defaults to $TMPDIR/dynamo_store_kv. #[arg(long, default_value = "etcd", value_parser = ["etcd", "file", "mem"])] pub store_kv: String, /// Determines how requests are distributed from routers to workers. 'tcp' is fastest [nats|http|tcp]. #[arg(long, default_value = "nats", value_parser = ["nats", "http", "tcp"])] pub request_plane: String, /// Everything after a `--`. Not currently used. #[arg(index = 2, last = true, hide = true, allow_hyphen_values = true)] pub last: Vec, } impl Flags { /// For each Output variant, check if it would be able to run. /// This takes validation out of the main engine creation path. pub fn validate(&self, out_opt: &Output) -> anyhow::Result<()> { match out_opt { Output::Auto => { if self.context_length.is_some() { anyhow::bail!( "'--context-length' flag should only be used on the worker node, not on the ingress" ); } if self.kv_cache_block_size.is_some() { anyhow::bail!( "'--kv-cache-block-size' flag should only be used on the worker node, not on the ingress" ); } if self.migration_limit.is_some() { anyhow::bail!( "'--migration-limit' flag should only be used on the worker node, not on the ingress" ); } } Output::Echo => {} #[cfg(feature = "mistralrs")] Output::MistralRs => {} Output::Mocker => { // nothing to check here } } match out_opt { Output::Mocker => {} _ => { if self.extra_engine_args.is_some() { anyhow::bail!("`--extra-engine-args` is only for the mocker engine"); } } } Ok(()) } pub fn router_config(&self) -> RouterConfig { RouterConfig::new( self.router_mode.into(), KvRouterConfig::new( self.kv_overlap_score_weight, self.router_temperature, self.use_kv_events, self.router_replica_sync, self.router_track_active_blocks, // defaulting below args (no longer maintaining new flags for dynamo-run) None, None, ), ) } /// Load extra engine arguments from a JSON file /// Returns a HashMap of parameter names to values pub fn load_extra_engine_args( &self, ) -> anyhow::Result>> { if let Some(path) = &self.extra_engine_args { let file_content = std::fs::read_to_string(path)?; let args: HashMap = serde_json::from_str(&file_content)?; Ok(Some(args)) } else { Ok(None) } } pub fn mocker_config(&self) -> MockEngineArgs { let Some(path) = &self.extra_engine_args else { tracing::warn!("Did not specify extra engine args. Using default mocker args."); return MockEngineArgs::default(); }; MockEngineArgs::from_json_file(path) .unwrap_or_else(|e| panic!("Failed to build mocker engine args from {path:?}: {e}")) } } #[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug, Copy)] pub enum RouterMode { #[default] #[value(name = "round-robin")] RoundRobin, Random, #[value(name = "kv")] KV, } impl From for RuntimeRouterMode { fn from(r: RouterMode) -> RuntimeRouterMode { match r { RouterMode::RoundRobin => RuntimeRouterMode::RoundRobin, RouterMode::Random => RuntimeRouterMode::Random, RouterMode::KV => RuntimeRouterMode::KV, } } }