For development purposes, you can install the package in editable mode:
For development purposes, you can install the package in editable mode:
Warning: Using the editable Python binding can cause performance degradation. Build a fresh wheel after every update if you want to test performance.
help="Host address to bind the router server to. Default: 127.0.0.1"
)]
)]
host:String,
host:String,
#[arg(long,default_value_t=3001,help="Port number to listen on")]
#[arg(
long,
default_value_t=3001,
help="Port number to bind the router server to. Default: 3001"
)]
port:u16,
port:u16,
#[arg(
#[arg(
long,
long,
value_delimiter=',',
value_delimiter=',',
help="Comma-separated list of worker URLs to distribute requests to"
help="Comma-separated list of worker URLs that will handle the requests. Each URL should include the protocol, host, and port (e.g., http://worker1:8000,http://worker2:8000)"
)]
)]
worker_urls:Vec<String>,
worker_urls:Vec<String>,
#[arg(
#[arg(
long,
long,
default_value_t=PolicyType::RoundRobin,
default_value_t=PolicyType::CacheAware,
value_enum,
value_enum,
help="Load balancing policy to use: random, round_robin, or approx_tree"
help="Load balancing policy to use for request distribution:\n\
- random: Randomly select workers\n\
- round_robin: Distribute requests in round-robin fashion\n\
- cache_aware: Distribute requests in cache-aware fashion\n"
)]
)]
policy:PolicyType,
policy:PolicyType,
#[arg(
#[arg(
long,
long,
default_value_t=0.5,
requires="policy",
required_if_eq("policy","cache_aware"),
help="Cache threshold (0.0-1.0) for cache-aware routing. Routes to cached worker if the match rate exceeds threshold, otherwise routes to the worker with the smallest tree. Default: 0.5"
)]
cache_threshold:f32,
#[arg(
long,
default_value_t=1.0,
requires="policy",
required_if_eq("policy","cache_aware"),
help="Probability of using cache-aware routing (0.0-1.0). Default 1.0 for full cache-aware routing, suitable for perfectly divided prefix workloads. For uneven workloads, use a lower value to better distribute requests"
)]
cache_routing_prob:f32,
#[arg(
long,
default_value_t=60,
requires="policy",
requires="policy",
required_if_eq("policy","approx_tree"),
required_if_eq("policy","cache_aware"),
help="Path to the tokenizer file, required when using approx_tree policy"
help="Interval in seconds between cache eviction operations in cache-aware routing. Default: 60"
)]
)]
tokenizer_path:Option<String>,
eviction_interval_secs:u64,
#[arg(
#[arg(
long,
long,
default_value="0.50",
default_value_t=2usize.pow(24),
requires="policy",
requires="policy",
required_if_eq("policy","approx_tree"),
required_if_eq("policy","cache_aware"),
help="Cache threshold (0.0-1.0) for approx_tree routing. Routes to cached worker if match rate exceeds threshold, otherwise routes to shortest queue worker"
help="Maximum size of the approximation tree for cache-aware routing. Default: 2^24"
// Even though the tree is thread-safe, we still take a lock so that the whole operation (tree read + queue read + tree write + queue write) is atomic. This handles edge cases such as multiple requests with long prefixes entering at the same time.