Unverified Commit 4b0a1c93 authored by Byron Hsu's avatar Byron Hsu Committed by GitHub
Browse files

Replace prob based with threshold based load balancing (#2170)

parent 8e1adb84
......@@ -25,6 +25,7 @@ import warnings
from argparse import ArgumentParser
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
import aiohttp
......@@ -693,6 +694,19 @@ def gen_prompt(tokenizer, token_num):
return tokenizer.decode(selected_tokens)
def get_gen_prefix_cache_path(args, tokenizer):
    """Return the cache file path under ~/.cache/sglang/benchmark for generated prefix data.

    The filename encodes every generation parameter plus the tokenizer class
    name so that different benchmark configurations never collide in the cache.
    """
    base_dir = Path.home() / ".cache" / "sglang" / "benchmark"

    # Fold all parameters that influence the generated data into the key.
    key_parts = (
        args.gen_num_groups,
        args.gen_prompts_per_group,
        args.gen_system_prompt_len,
        args.gen_question_len,
        args.gen_output_len,
        tokenizer.__class__.__name__,
    )
    filename = "gen_prefix_{}_{}_{}_{}_{}_{}.pkl".format(*key_parts)

    return base_dir / filename
def sample_generated_shared_prefix_requests(
num_groups: int,
prompts_per_group: int,
......@@ -701,12 +715,17 @@ def sample_generated_shared_prefix_requests(
output_len: int,
tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, int, int]]:
if args.generated_input_path and os.path.exists(args.generated_input_path):
print(f"\nloading generated input data from {args.generated_input_path}")
with open(args.generated_input_path, "rb") as f:
"""Generate benchmark requests with shared system prompts using random tokens and caching."""
cache_path = get_gen_prefix_cache_path(args, tokenizer)
# Try to load from cache first
if cache_path.exists():
print(f"\nLoading cached generated input data from {cache_path}")
with open(cache_path, "rb") as f:
return pickle.load(f)
"""Generate benchmark requests with shared system prompts using random tokens."""
print("\nGenerating new input data...")
# Generate system prompts for each group
system_prompts = []
for _ in range(num_groups):
......@@ -719,9 +738,6 @@ def sample_generated_shared_prefix_requests(
question = gen_prompt(tokenizer, question_len)
questions.append(question)
# Shuffle questions
random.shuffle(questions)
# Combine system prompts with questions
input_requests = []
total_input_tokens = 0
......@@ -729,7 +745,9 @@ def sample_generated_shared_prefix_requests(
for group_idx in tqdm(range(num_groups), desc="Generating system prompt"):
system_prompt = system_prompts[group_idx]
for prompt_idx in tqdm(range(prompts_per_group), desc="Generating questions"):
for prompt_idx in tqdm(
range(prompts_per_group), desc="Generating questions", leave=False
):
question = questions[group_idx * prompts_per_group + prompt_idx]
full_prompt = f"{system_prompt}\n\n{question}"
prompt_len = len(tokenizer.encode(full_prompt))
......@@ -738,6 +756,10 @@ def sample_generated_shared_prefix_requests(
total_input_tokens += prompt_len
total_output_tokens += output_len
# Shuffle questions
random.shuffle(input_requests)
# Print statistics
print(f"\nGenerated shared prefix dataset statistics:")
print(f"Number of groups: {num_groups}")
print(f"Prompts per group: {prompts_per_group}")
......@@ -750,11 +772,12 @@ def sample_generated_shared_prefix_requests(
print(
f"Average question length: {sum(len(tokenizer.encode(q)) for q in questions) / len(questions):.1f} tokens\n"
)
if args.generated_input_save_path:
print(f"Saving generated input data to {args.generated_input_save_path}")
os.makedirs(os.path.dirname(args.generated_input_save_path), exist_ok=True)
with open(args.generated_input_save_path, "wb") as f:
pickle.dump(input_requests, f)
# Save to cache
cache_path.parent.mkdir(parents=True, exist_ok=True)
print(f"Caching generated input data to {cache_path}")
with open(cache_path, "wb") as f:
pickle.dump(input_requests, f)
return input_requests
......@@ -1422,16 +1445,6 @@ if __name__ == "__main__":
default=256,
help="Target length in tokens for outputs in generated-shared-prefix dataset",
)
parser.add_argument(
"--generated-input-save-path",
type=str,
help="Path to save generated input data",
)
parser.add_argument(
"--generated-input-path",
type=str,
help="Path to load previously generated input data",
)
parser.add_argument(
"--profile",
action="store_true",
......
......@@ -20,33 +20,35 @@ $ python -m sglang_router.launch_server --model-path meta-llama/Meta-Llama-3.1-8
```
### 2. Launch only router
This is useful if you want multi-node DP. You can launch workers on different nodes, then connect the router to them.
This is useful for multi-node DP. You can launch workers on different nodes, then connect the router to them.
```bash
$ python -m sglang_router.launch_router --worker-urls http://worker1:8000 http://worker2:8000
$ python -m sglang_router.launch_router --help
usage: launch_router.py [-h] [--host HOST] [--port PORT] [--worker-urls WORKER_URLS [WORKER_URLS ...]]
[--policy {random,round_robin,cache_aware}] [--cache-threshold CACHE_THRESHOLD]
[--cache-routing-prob CACHE_ROUTING_PROB] [--eviction-interval EVICTION_INTERVAL]
[--max-tree-size MAX_TREE_SIZE]
[--policy {random,round_robin,cache_aware}] [--cache-threshold CACHE_THRESHOLD]
[--balance-abs-threshold BALANCE_ABS_THRESHOLD] [--balance-rel-threshold BALANCE_REL_THRESHOLD]
[--eviction-interval EVICTION_INTERVAL] [--max-tree-size MAX_TREE_SIZE]
options:
-h, --help show this help message and exit
--host HOST Host address to bind the router server (default: 127.0.0.1)
--port PORT Port number to bind the router server (default: 30000)
--host HOST Host address to bind the router server (default: 127.0.0.1)
--port PORT Port number to bind the router server (default: 30000)
--worker-urls WORKER_URLS [WORKER_URLS ...]
List of worker URLs (e.g., http://worker1:8000 http://worker2:8000) (default: None)
List of worker URLs (e.g., http://worker1:8000 http://worker2:8000) (default: None)
--policy {random,round_robin,cache_aware}
Load balancing policy to use (default: cache_aware)
Load balancing policy to use (default: cache_aware)
--cache-threshold CACHE_THRESHOLD
Cache threshold (0.0-1.0) for cache-aware routing (default: 0.5)
--cache-routing-prob CACHE_ROUTING_PROB
Probability of using cache-aware routing (0.0-1.0) (default: 1.0)
Cache threshold (0.0-1.0) for cache-aware routing (default: 0.5)
--balance-abs-threshold BALANCE_ABS_THRESHOLD
Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold (default: 32)
--balance-rel-threshold BALANCE_REL_THRESHOLD
Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold (default: 1.0001)
--eviction-interval EVICTION_INTERVAL
Interval in seconds between cache eviction operations (default: 60)
Interval in seconds between cache eviction operations (default: 60)
--max-tree-size MAX_TREE_SIZE
Maximum size of the approximation tree for cache-aware routing (default: 16777216)
Maximum size of the approximation tree for cache-aware routing (default: 16777216)
```
## Strategy
......@@ -56,7 +58,15 @@ options:
This router combines two strategies to optimize both cache utilization and request distribution:
1. Cache-Aware Routing (Approximate Tree)
2. Load-Balancing Routing (Shortest Queue)
2. Load-Balancing Routing (Shortest Queue with Balance Thresholds)
The router dynamically switches between these strategies based on load conditions:
- Uses load balancing when the system is imbalanced
- Uses cache-aware routing when the system is balanced
A system is considered imbalanced if both conditions are met:
1. (max_load - min_load) > balance_abs_threshold
2. max_load > balance_rel_threshold * min_load
#### 1. Cache-Aware Routing (Approximate Tree)
This strategy maintains an approximate radix tree for each worker based on request history,
......@@ -74,27 +84,32 @@ Process:
#### 2. Load-Balancing (Shortest Queue)
This strategy tracks pending request counts per worker and routes new requests
to the least busy worker for optimal load distribution.
to the least busy worker when the system is detected to be imbalanced. This helps
maintain optimal load distribution across workers.
### Configuration Parameters
1. `cache_routing_prob`: (float, 0.0 to 1.0)
- 0.0: Exclusively use load balancing
- 1.0: Exclusively use cache-aware routing
- Between 0-1: Probability of using cache-aware routing vs load balancing
2. `cache_threshold`: (float, 0.0 to 1.0)
1. `cache_threshold`: (float, 0.0 to 1.0, default: 0.5)
- Minimum prefix match ratio to use highest-match routing
- Below this threshold, routes to worker with most available cache space
3. `eviction_interval_secs`: (integer)
- Interval between LRU eviction cycles for the approximate trees
2. `balance_abs_threshold`: (integer, default: 32)
- Absolute difference threshold for load imbalance detection
- System is potentially imbalanced if (max_load - min_load) > abs_threshold
4. `max_tree_size`: (integer)
3. `balance_rel_threshold`: (float, default: 1.0001)
- Relative ratio threshold for load imbalance detection
- System is potentially imbalanced if max_load > min_load * rel_threshold
- Used in conjunction with abs_threshold to determine final imbalance state
4. `eviction_interval`: (integer, default: 60)
- Interval in seconds between LRU eviction cycles for the approximate trees
- Background thread periodically evicts least recently used nodes to maintain tree size
5. `max_tree_size`: (integer, default: 16777216)
- Maximum nodes per tree
- When exceeded, LRU leaf nodes are evicted during the next eviction cycle
## Development
- Rust and Cargo installed
......
......@@ -17,7 +17,8 @@ class RouterArgs:
# Routing policy
policy: str = "cache_aware"
cache_threshold: float = 0.5
cache_routing_prob: float = 1.0
balance_abs_threshold: int = 32
balance_rel_threshold: float = 1.0001
eviction_interval: int = 60
max_tree_size: int = 2**24
......@@ -74,10 +75,16 @@ class RouterArgs:
help="Cache threshold (0.0-1.0) for cache-aware routing",
)
parser.add_argument(
f"--{prefix}cache-routing-prob",
f"--{prefix}balance-abs-threshold",
type=int,
default=RouterArgs.balance_abs_threshold,
help="Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold. Otherwise, use cache aware",
)
parser.add_argument(
f"--{prefix}balance-rel-threshold",
type=float,
default=RouterArgs.cache_routing_prob,
help="Probability of using cache-aware routing (0.0-1.0)",
default=RouterArgs.balance_rel_threshold,
help="Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold. Otherwise, use cache aware",
)
parser.add_argument(
f"--{prefix}eviction-interval",
......@@ -110,7 +117,8 @@ class RouterArgs:
port=args.port,
policy=getattr(args, f"{prefix}policy"),
cache_threshold=getattr(args, f"{prefix}cache_threshold"),
cache_routing_prob=getattr(args, f"{prefix}cache_routing_prob"),
balance_abs_threshold=getattr(args, f"{prefix}balance_abs_threshold"),
balance_rel_threshold=getattr(args, f"{prefix}balance_rel_threshold"),
eviction_interval=getattr(args, f"{prefix}eviction_interval"),
max_tree_size=getattr(args, f"{prefix}max_tree_size"),
)
......@@ -150,7 +158,8 @@ def launch_router(args: argparse.Namespace) -> Optional[Router]:
host=router_args.host,
port=router_args.port,
cache_threshold=router_args.cache_threshold,
cache_routing_prob=router_args.cache_routing_prob,
balance_abs_threshold=router_args.balance_abs_threshold,
balance_rel_threshold=router_args.balance_rel_threshold,
eviction_interval_secs=router_args.eviction_interval,
max_tree_size=router_args.max_tree_size,
)
......@@ -182,7 +191,7 @@ multi-node setups or when you want to start workers and router separately.
Examples:
python -m sglang_router.launch_router --worker-urls http://worker1:8000 http://worker2:8000
python -m sglang_router.launch_router --worker-urls http://worker1:8000 http://worker2:8000 --cache-threshold 0.7 --cache-routing-prob 0.5
python -m sglang_router.launch_router --worker-urls http://worker1:8000 http://worker2:8000 --cache-threshold 0.7 --balance-abs-threshold 64 --balance-rel-threshold 1.2
""",
formatter_class=CustomHelpFormatter,
......
......@@ -14,15 +14,16 @@ class Router:
policy: Load balancing policy to use. Options:
- PolicyType.Random: Randomly select workers
- PolicyType.RoundRobin: Distribute requests in round-robin fashion
- PolicyType.CacheAware: Distribute requests in cache-aware fashion
- PolicyType.CacheAware: Distribute requests based on cache state and load balance
host: Host address to bind the router server. Default: '127.0.0.1'
port: Port number to bind the router server. Default: 3001
cache_threshold: Cache threshold (0.0-1.0) for cache-aware routing. Routes to cached worker
if the match rate exceeds threshold, otherwise routes to the worker with the smallest
tree. Default: 0.5
cache_routing_prob: Probability of using cache-aware routing (0.0-1.0). Default 1.0 for
full cache-aware routing, suitable for perfectly divided prefix workloads. For uneven
workloads, use a lower value to better distribute requests
balance_abs_threshold: Load balancing is triggered when (max_load - min_load) > abs_threshold
AND max_load > min_load * rel_threshold. Otherwise, use cache aware. Default: 32
balance_rel_threshold: Load balancing is triggered when (max_load - min_load) > abs_threshold
AND max_load > min_load * rel_threshold. Otherwise, use cache aware. Default: 1.0001
eviction_interval_secs: Interval in seconds between cache eviction operations in cache-aware
routing. Default: 60
max_tree_size: Maximum size of the approximation tree for cache-aware routing. Default: 2^24
......@@ -35,7 +36,8 @@ class Router:
host: str = "127.0.0.1",
port: int = 3001,
cache_threshold: float = 0.50,
cache_routing_prob: float = 1.0,
balance_abs_threshold: int = 32,
balance_rel_threshold: float = 1.0001,
eviction_interval_secs: int = 60,
max_tree_size: int = 2**24,
):
......@@ -45,7 +47,8 @@ class Router:
host=host,
port=port,
cache_threshold=cache_threshold,
cache_routing_prob=cache_routing_prob,
balance_abs_threshold=balance_abs_threshold,
balance_rel_threshold=balance_rel_threshold,
eviction_interval_secs=eviction_interval_secs,
max_tree_size=max_tree_size,
)
......
......@@ -18,7 +18,8 @@ struct Router {
worker_urls: Vec<String>,
policy: PolicyType,
cache_threshold: f32,
cache_routing_prob: f32,
balance_abs_threshold: usize,
balance_rel_threshold: f32,
eviction_interval_secs: u64,
max_tree_size: usize,
}
......@@ -32,7 +33,8 @@ impl Router {
host = String::from("127.0.0.1"),
port = 3001,
cache_threshold = 0.50,
cache_routing_prob = 1.0,
balance_abs_threshold = 32,
balance_rel_threshold = 1.0001,
eviction_interval_secs = 60,
max_tree_size = 2usize.pow(24)
))]
......@@ -42,7 +44,8 @@ impl Router {
host: String,
port: u16,
cache_threshold: f32,
cache_routing_prob: f32,
balance_abs_threshold: usize,
balance_rel_threshold: f32,
eviction_interval_secs: u64,
max_tree_size: usize,
) -> PyResult<Self> {
......@@ -52,7 +55,8 @@ impl Router {
worker_urls,
policy,
cache_threshold,
cache_routing_prob,
balance_abs_threshold,
balance_rel_threshold,
eviction_interval_secs,
max_tree_size,
})
......@@ -68,7 +72,8 @@ impl Router {
PolicyType::RoundRobin => router::PolicyConfig::RoundRobinConfig,
PolicyType::CacheAware => router::PolicyConfig::CacheAwareConfig {
cache_threshold: self.cache_threshold,
cache_routing_prob: self.cache_routing_prob,
balance_abs_threshold: self.balance_abs_threshold,
balance_rel_threshold: self.balance_rel_threshold,
eviction_interval_secs: self.eviction_interval_secs,
max_tree_size: self.max_tree_size,
},
......
// src/main.rs
use clap::Parser;
use clap::ValueEnum;
......@@ -42,7 +41,7 @@ struct Args {
help = "Load balancing policy to use for request distribution:\n\
- random: Randomly select workers\n\
- round_robin: Distribute requests in round-robin fashion\n\
- cache_aware: Distribute requests in cache-aware fashion\n"
- cache_aware: Distribute requests based on cache state and load balance\n"
)]
policy: PolicyType,
......@@ -57,12 +56,21 @@ struct Args {
#[arg(
long,
default_value_t = 1.0,
default_value_t = 32,
requires = "policy",
required_if_eq("policy", "cache_aware"),
help = "Probability of using cache-aware routing (0.0-1.0). Default 1.0 for full cache-aware routing, suitable for perfectly divided prefix workloads. For uneven workloads, use a lower value to better distribute requests"
help = "Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold. Otherwise, use cache aware. Default: 32"
)]
cache_routing_prob: f32,
balance_abs_threshold: usize,
#[arg(
long,
default_value_t = 1.0001,
requires = "policy",
required_if_eq("policy", "cache_aware"),
help = "Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold. Otherwise, use cache aware. Default: 1.0001"
)]
balance_rel_threshold: f32,
#[arg(
long,
......@@ -90,7 +98,8 @@ impl Args {
PolicyType::RoundRobin => PolicyConfig::RoundRobinConfig,
PolicyType::CacheAware => PolicyConfig::CacheAwareConfig {
cache_threshold: self.cache_threshold,
cache_routing_prob: self.cache_routing_prob,
balance_abs_threshold: self.balance_abs_threshold,
balance_rel_threshold: self.balance_rel_threshold,
eviction_interval_secs: self.eviction_interval_secs,
max_tree_size: self.max_tree_size,
},
......
......@@ -23,65 +23,73 @@ pub enum Router {
},
CacheAware {
/*
Cache-Aware Load Balancing Router
This router combines two strategies to optimize both cache utilization and request distribution:
1. Cache-Aware Routing (Approximate Tree)
2. Load Balancing (Shortest Queue)
For each incoming request, the router chooses between these strategies:
- With probability P: Uses cache-aware routing
- With probability (1-P): Uses load balancing
where P is configured via `cache_routing_prob`
Strategy Details:
1. Cache-Aware Routing (Approximate Tree)
-------------------------------------------
This strategy maintains an approximate radix tree for each worker based on request history,
eliminating the need for direct cache state queries. The tree stores raw text characters
instead of token IDs to avoid tokenization overhead.
Process:
a. For each request, find the worker with the highest prefix match
b. If match rate > cache_threshold:
Route to the worker with highest match (likely has relevant data cached)
c. If match rate ≤ cache_threshold:
Route to the worker with smallest tree size (most available cache capacity)
d. Background maintenance:
Periodically evict least recently used leaf nodes to prevent memory overflow
2. Load Balancing (Shortest Queue)
-------------------------------------------
This strategy tracks pending request counts per worker and routes new requests
to the least busy worker for optimal load distribution.
Configuration Parameters:
------------------------
1. cache_routing_prob: (float, 0.0 to 1.0)
- 0.0: Exclusively use load balancing
- 1.0: Exclusively use cache-aware routing
- Between 0-1: Probability of using cache-aware routing vs load balancing
2. cache_threshold: (float, 0.0 to 1.0)
Minimum prefix match ratio to use highest-match routing.
Below this threshold, routes to worker with most available cache space.
3. eviction_interval_secs: (integer)
Interval between LRU eviction cycles for the approximate trees.
4. max_tree_size: (integer)
Maximum nodes per tree. When exceeded, LRU leaf nodes are evicted
during the next eviction cycle.
Cache-Aware Load Balancing Router
This router combines two strategies to optimize both cache utilization and request distribution:
1. Cache-Aware Routing (Approximate Tree)
2. Load Balancing (Shortest Queue with Balance Thresholds)
The router dynamically switches between these strategies based on load conditions:
- Uses load balancing when the system is imbalanced
- Uses cache-aware routing when the system is balanced
A system is considered imbalanced if both conditions are met:
1. (max - min) > abs_threshold
2. max > rel_threshold * min
Strategy Details:
1. Cache-Aware Routing (Approximate Tree)
-------------------------------------------
This strategy maintains an approximate radix tree for each worker based on request history,
eliminating the need for direct cache state queries. The tree stores raw text characters
instead of token IDs to avoid tokenization overhead.
Process:
a. For each request, find the worker with the highest prefix match
b. If match rate > cache_threshold:
Route to the worker with highest match (likely has relevant data cached)
c. If match rate ≤ cache_threshold:
Route to the worker with smallest tree size (most available cache capacity)
d. Background maintenance:
Periodically evict least recently used leaf nodes to prevent memory overflow
2. Load Balancing (Shortest Queue)
-------------------------------------------
This strategy tracks pending request counts per worker and routes new requests
to the least busy worker when the system is detected to be imbalanced.
Configuration Parameters:
------------------------
1. cache_threshold: (float, 0.0 to 1.0)
Minimum prefix match ratio to use highest-match routing.
Below this threshold, routes to worker with most available cache space.
2. balance_abs_threshold: (integer)
Absolute difference threshold for load imbalance detection.
System is potentially imbalanced if (max_load - min_load) > abs_threshold
3. balance_rel_threshold: (float)
Relative ratio threshold for load imbalance detection.
System is potentially imbalanced if max_load > min_load * rel_threshold
Used in conjunction with abs_threshold to determine final imbalance state.
4. eviction_interval_secs: (integer)
Interval between LRU eviction cycles for the approximate trees.
5. max_tree_size: (integer)
Maximum nodes per tree. When exceeded, LRU leaf nodes are evicted
during the next eviction cycle.
*/
worker_urls: Vec<String>,
tree: Arc<Mutex<Tree>>,
running_queue: Arc<Mutex<HashMap<String, usize>>>,
processed_queue: Arc<Mutex<HashMap<String, usize>>>,
cache_threshold: f32,
cache_routing_prob: f32,
_eviction_thread: Option<thread::JoinHandle<()>>, // Store thread handle
balance_abs_threshold: usize,
balance_rel_threshold: f32,
_eviction_thread: Option<thread::JoinHandle<()>>,
},
}
......@@ -91,7 +99,8 @@ pub enum PolicyConfig {
RoundRobinConfig,
CacheAwareConfig {
cache_threshold: f32,
cache_routing_prob: f32,
balance_abs_threshold: usize,
balance_rel_threshold: f32,
eviction_interval_secs: u64,
max_tree_size: usize,
},
......@@ -128,7 +137,8 @@ impl Router {
},
PolicyConfig::CacheAwareConfig {
cache_threshold,
cache_routing_prob,
balance_abs_threshold,
balance_rel_threshold,
eviction_interval_secs,
max_tree_size,
} => {
......@@ -149,6 +159,7 @@ impl Router {
// Create background eviction thread
let tree_clone = Arc::clone(&tree);
let processed_queue_clone = Arc::clone(&processed_queue);
let running_queue_clone = Arc::clone(&running_queue);
let eviction_thread = thread::spawn(move || {
loop {
// Sleep for the specified interval
......@@ -161,6 +172,10 @@ impl Router {
// Print the process queue
let locked_processed_queue = processed_queue_clone.lock().unwrap();
println!("Processed Queue: {:?}", locked_processed_queue);
// Print the running queue
let locked_running_queue = running_queue_clone.lock().unwrap();
println!("Running Queue: {:?}", locked_running_queue);
}
});
......@@ -174,7 +189,8 @@ impl Router {
running_queue,
processed_queue,
cache_threshold,
cache_routing_prob,
balance_abs_threshold,
balance_rel_threshold,
_eviction_thread: Some(eviction_thread),
}
}
......@@ -203,8 +219,6 @@ impl Router {
route: &str,
) -> HttpResponse {
let text = get_text_from_request(&body, route);
// For Debug
// println!("text: {:?}, route: {:?}", text, route);
let worker_url = match self {
Router::RoundRobin {
......@@ -218,7 +232,6 @@ impl Router {
|x| Some((x + 1) % worker_urls.len()),
)
.unwrap();
worker_urls[idx].clone()
}
......@@ -232,19 +245,42 @@ impl Router {
running_queue,
processed_queue,
cache_threshold,
cache_routing_prob,
balance_abs_threshold,
balance_rel_threshold,
..
} => {
// even though the tree is thread-safe, we still put a lock to ensure the whole op (tree read + queue read + tree write + queue write) is atomic to handle some edge cases (e.g. multiple requests with long prefix entering at the same time)
// TODO: delay scheduling if cache hit rate is high because it may cause imbalance. prioritize low hit rate ones
let mut tree = tree.lock().unwrap();
let mut running_queue = running_queue.lock().unwrap();
// Generate a random float between 0 and 1 for probability check
let sampled_p: f32 = rand::random();
let selected_url = if sampled_p < *cache_routing_prob {
// Cache-aware routing logic
// Get current load statistics
let max_load = *running_queue.values().max().unwrap_or(&0);
let min_load = *running_queue.values().min().unwrap_or(&0);
// Load is considered imbalanced if:
// 1. (max - min) > abs_threshold AND
// 2. max > rel_threshold * min
let is_imbalanced = max_load.saturating_sub(min_load) > *balance_abs_threshold
&& (max_load as f32) > (min_load as f32 * balance_rel_threshold);
let selected_url = if is_imbalanced {
// Log load balancing trigger and current queue state
println!(
"Load balancing triggered due to workload imbalance:\n\
Max load: {}, Min load: {}\n\
Current running queue: {:?}",
max_load, min_load, running_queue
);
// Use shortest queue routing when load is imbalanced
running_queue
.iter()
.min_by_key(|(_url, &count)| count)
.map(|(url, _)| url.clone())
.unwrap_or_else(|| worker_urls[0].clone())
} else {
// Use cache-aware routing when load is balanced
let (matched_text, matched_worker) = tree.prefix_match(&text);
let matched_rate =
matched_text.chars().count() as f32 / text.chars().count() as f32;
......@@ -252,36 +288,18 @@ impl Router {
if matched_rate > *cache_threshold {
matched_worker.to_string()
} else {
// For Debug
// let m_map: HashMap<String, usize> = tree
// .tenant_char_count
// .iter()
// .map(|entry| (entry.key().clone(), *entry.value()))
// .collect();
// println!("map: {:?}, mmap: {:?}", tree.get_tenant_char_count(), m_map);
tree.get_smallest_tenant()
}
} else {
// Shortest queue routing logic
running_queue
.iter()
.min_by_key(|(_url, &count)| count)
.map(|(url, _)| url.clone())
.unwrap_or_else(|| worker_urls[0].clone())
};
// Update running queue
let count = running_queue.get_mut(&selected_url).unwrap();
*count += 1;
// Update processed queue
let mut locked_processed_queue = processed_queue.lock().unwrap();
let count = locked_processed_queue.get_mut(&selected_url).unwrap();
*count += 1;
// Update queues and tree
*running_queue.get_mut(&selected_url).unwrap() += 1;
// Update tree with the new request
*processed_queue
.lock()
.unwrap()
.get_mut(&selected_url)
.unwrap() += 1;
tree.insert(&text, &selected_url);
selected_url
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment