help="Load balancing policy to use for request distribution:\n\
- random: Randomly select workers\n\
- round_robin: Distribute requests in round-robin fashion\n\
- cache_aware: Distribute requests in cache-aware fashion\n"
- cache_aware: Distribute requests based on cache state and load balance\n"
)]
policy:PolicyType,
...
...
@@ -57,12 +56,21 @@ struct Args {
#[arg(
long,
default_value_t=1.0,
default_value_t=32,
requires="policy",
required_if_eq("policy","cache_aware"),
help="Probability of using cache-aware routing (0.0-1.0). Default 1.0 for full cache-aware routing, suitable for perfectly divided prefix workloads. For uneven workloads, use a lower value to better distribute requests"
help="Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold. Otherwise, use cache aware. Default: 32"
)]
cache_routing_prob:f32,
balance_abs_threshold:usize,
#[arg(
long,
default_value_t=1.0001,
requires="policy",
required_if_eq("policy","cache_aware"),
help="Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold. Otherwise, use cache aware. Default: 1.0001"
// Even though the tree is thread-safe, we still take a lock to ensure that the whole operation (tree read + queue read + tree write + queue write) is atomic. This handles edge cases such as multiple requests with long prefixes arriving at the same time.
// TODO: Delay scheduling if the cache hit rate is high, because it may cause imbalance. Prioritize the low-hit-rate ones.