Unverified Commit 4b0a1c93 authored by Byron Hsu's avatar Byron Hsu Committed by GitHub
Browse files

Replace prob based with threshold based load balancing (#2170)

parent 8e1adb84
......@@ -25,6 +25,7 @@ import warnings
from argparse import ArgumentParser
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
import aiohttp
......@@ -693,6 +694,19 @@ def gen_prompt(tokenizer, token_num):
return tokenizer.decode(selected_tokens)
def get_gen_prefix_cache_path(args, tokenizer):
    """Return the cache file path under ~/.cache/sglang/benchmark for generated prefix data.

    The filename encodes every generation parameter plus the tokenizer class
    name so that different benchmark configurations never collide in the cache.
    """
    base_dir = Path.home() / ".cache" / "sglang" / "benchmark"

    # Fold all parameters that influence the generated data into the key.
    key_parts = (
        args.gen_num_groups,
        args.gen_prompts_per_group,
        args.gen_system_prompt_len,
        args.gen_question_len,
        args.gen_output_len,
        tokenizer.__class__.__name__,
    )
    filename = "gen_prefix_{}_{}_{}_{}_{}_{}.pkl".format(*key_parts)

    return base_dir / filename
def sample_generated_shared_prefix_requests(
num_groups: int,
prompts_per_group: int,
......@@ -701,12 +715,17 @@ def sample_generated_shared_prefix_requests(
output_len: int,
tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, int, int]]:
if args.generated_input_path and os.path.exists(args.generated_input_path):
print(f"\nloading generated input data from {args.generated_input_path}")
with open(args.generated_input_path, "rb") as f:
"""Generate benchmark requests with shared system prompts using random tokens and caching."""
cache_path = get_gen_prefix_cache_path(args, tokenizer)
# Try to load from cache first
if cache_path.exists():
print(f"\nLoading cached generated input data from {cache_path}")
with open(cache_path, "rb") as f:
return pickle.load(f)
"""Generate benchmark requests with shared system prompts using random tokens."""
print("\nGenerating new input data...")
# Generate system prompts for each group
system_prompts = []
for _ in range(num_groups):
......@@ -719,9 +738,6 @@ def sample_generated_shared_prefix_requests(
question = gen_prompt(tokenizer, question_len)
questions.append(question)
# Shuffle questions
random.shuffle(questions)
# Combine system prompts with questions
input_requests = []
total_input_tokens = 0
......@@ -729,7 +745,9 @@ def sample_generated_shared_prefix_requests(
for group_idx in tqdm(range(num_groups), desc="Generating system prompt"):
system_prompt = system_prompts[group_idx]
for prompt_idx in tqdm(range(prompts_per_group), desc="Generating questions"):
for prompt_idx in tqdm(
range(prompts_per_group), desc="Generating questions", leave=False
):
question = questions[group_idx * prompts_per_group + prompt_idx]
full_prompt = f"{system_prompt}\n\n{question}"
prompt_len = len(tokenizer.encode(full_prompt))
......@@ -738,6 +756,10 @@ def sample_generated_shared_prefix_requests(
total_input_tokens += prompt_len
total_output_tokens += output_len
# Shuffle questions
random.shuffle(input_requests)
# Print statistics
print(f"\nGenerated shared prefix dataset statistics:")
print(f"Number of groups: {num_groups}")
print(f"Prompts per group: {prompts_per_group}")
......@@ -750,11 +772,12 @@ def sample_generated_shared_prefix_requests(
print(
f"Average question length: {sum(len(tokenizer.encode(q)) for q in questions) / len(questions):.1f} tokens\n"
)
if args.generated_input_save_path:
print(f"Saving generated input data to {args.generated_input_save_path}")
os.makedirs(os.path.dirname(args.generated_input_save_path), exist_ok=True)
with open(args.generated_input_save_path, "wb") as f:
pickle.dump(input_requests, f)
# Save to cache
cache_path.parent.mkdir(parents=True, exist_ok=True)
print(f"Caching generated input data to {cache_path}")
with open(cache_path, "wb") as f:
pickle.dump(input_requests, f)
return input_requests
......@@ -1422,16 +1445,6 @@ if __name__ == "__main__":
default=256,
help="Target length in tokens for outputs in generated-shared-prefix dataset",
)
parser.add_argument(
"--generated-input-save-path",
type=str,
help="Path to save generated input data",
)
parser.add_argument(
"--generated-input-path",
type=str,
help="Path to load previously generated input data",
)
parser.add_argument(
"--profile",
action="store_true",
......
......@@ -20,33 +20,35 @@ $ python -m sglang_router.launch_server --model-path meta-llama/Meta-Llama-3.1-8
```
### 2. Launch only router
This is useful if you want multi-node DP. You can launch workers on different nodes, then connect the router to them.
This is useful for multi-node DP. You can launch workers on different nodes, then connect the router to them.
```bash
$ python -m sglang_router.launch_router --worker-urls http://worker1:8000 http://worker2:8000
$ python -m sglang_router.launch_router --help
usage: launch_router.py [-h] [--host HOST] [--port PORT] [--worker-urls WORKER_URLS [WORKER_URLS ...]]
[--policy {random,round_robin,cache_aware}] [--cache-threshold CACHE_THRESHOLD]
[--cache-routing-prob CACHE_ROUTING_PROB] [--eviction-interval EVICTION_INTERVAL]
[--max-tree-size MAX_TREE_SIZE]
[--policy {random,round_robin,cache_aware}] [--cache-threshold CACHE_THRESHOLD]
[--balance-abs-threshold BALANCE_ABS_THRESHOLD] [--balance-rel-threshold BALANCE_REL_THRESHOLD]
[--eviction-interval EVICTION_INTERVAL] [--max-tree-size MAX_TREE_SIZE]
options:
-h, --help show this help message and exit
--host HOST Host address to bind the router server (default: 127.0.0.1)
--port PORT Port number to bind the router server (default: 30000)
--host HOST Host address to bind the router server (default: 127.0.0.1)
--port PORT Port number to bind the router server (default: 30000)
--worker-urls WORKER_URLS [WORKER_URLS ...]
List of worker URLs (e.g., http://worker1:8000 http://worker2:8000) (default: None)
List of worker URLs (e.g., http://worker1:8000 http://worker2:8000) (default: None)
--policy {random,round_robin,cache_aware}
Load balancing policy to use (default: cache_aware)
Load balancing policy to use (default: cache_aware)
--cache-threshold CACHE_THRESHOLD
Cache threshold (0.0-1.0) for cache-aware routing (default: 0.5)
--cache-routing-prob CACHE_ROUTING_PROB
Probability of using cache-aware routing (0.0-1.0) (default: 1.0)
Cache threshold (0.0-1.0) for cache-aware routing (default: 0.5)
--balance-abs-threshold BALANCE_ABS_THRESHOLD
Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold (default: 32)
--balance-rel-threshold BALANCE_REL_THRESHOLD
Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold (default: 1.0001)
--eviction-interval EVICTION_INTERVAL
Interval in seconds between cache eviction operations (default: 60)
Interval in seconds between cache eviction operations (default: 60)
--max-tree-size MAX_TREE_SIZE
Maximum size of the approximation tree for cache-aware routing (default: 16777216)
Maximum size of the approximation tree for cache-aware routing (default: 16777216)
```
## Strategy
......@@ -56,7 +58,15 @@ options:
This router combines two strategies to optimize both cache utilization and request distribution:
1. Cache-Aware Routing (Approximate Tree)
2. Load-Balancing Routing (Shortest Queue)
2. Load-Balancing Routing (Shortest Queue with Balance Thresholds)
The router dynamically switches between these strategies based on load conditions:
- Uses load balancing when the system is imbalanced
- Uses cache-aware routing when the system is balanced
A system is considered imbalanced if both conditions are met:
1. (max_load - min_load) > balance_abs_threshold
2. max_load > balance_rel_threshold * min_load
#### 1. Cache-Aware Routing (Approximate Tree)
This strategy maintains an approximate radix tree for each worker based on request history,
......@@ -74,27 +84,32 @@ Process:
#### 2. Load-Balancing (Shortest Queue)
This strategy tracks pending request counts per worker and routes new requests
to the least busy worker for optimal load distribution.
to the least busy worker when the system is detected to be imbalanced. This helps
maintain optimal load distribution across workers.
### Configuration Parameters
1. `cache_routing_prob`: (float, 0.0 to 1.0)
- 0.0: Exclusively use load balancing
- 1.0: Exclusively use cache-aware routing
- Between 0-1: Probability of using cache-aware routing vs load balancing
2. `cache_threshold`: (float, 0.0 to 1.0)
1. `cache_threshold`: (float, 0.0 to 1.0, default: 0.5)
- Minimum prefix match ratio to use highest-match routing
- Below this threshold, routes to worker with most available cache space
3. `eviction_interval_secs`: (integer)
- Interval between LRU eviction cycles for the approximate trees
2. `balance_abs_threshold`: (integer, default: 32)
- Absolute difference threshold for load imbalance detection
- System is potentially imbalanced if (max_load - min_load) > abs_threshold
4. `max_tree_size`: (integer)
3. `balance_rel_threshold`: (float, default: 1.0001)
- Relative ratio threshold for load imbalance detection
- System is potentially imbalanced if max_load > min_load * rel_threshold
- Used in conjunction with abs_threshold to determine final imbalance state
4. `eviction_interval`: (integer, default: 60)
- Interval in seconds between LRU eviction cycles for the approximate trees
- Background thread periodically evicts least recently used nodes to maintain tree size
5. `max_tree_size`: (integer, default: 16777216)
- Maximum nodes per tree
- When exceeded, LRU leaf nodes are evicted during the next eviction cycle
## Development
- Rust and Cargo installed
......
......@@ -17,7 +17,8 @@ class RouterArgs:
# Routing policy
policy: str = "cache_aware"
cache_threshold: float = 0.5
cache_routing_prob: float = 1.0
balance_abs_threshold: int = 32
balance_rel_threshold: float = 1.0001
eviction_interval: int = 60
max_tree_size: int = 2**24
......@@ -74,10 +75,16 @@ class RouterArgs:
help="Cache threshold (0.0-1.0) for cache-aware routing",
)
parser.add_argument(
f"--{prefix}cache-routing-prob",
f"--{prefix}balance-abs-threshold",
type=int,
default=RouterArgs.balance_abs_threshold,
help="Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold. Otherwise, use cache aware",
)
parser.add_argument(
f"--{prefix}balance-rel-threshold",
type=float,
default=RouterArgs.cache_routing_prob,
help="Probability of using cache-aware routing (0.0-1.0)",
default=RouterArgs.balance_rel_threshold,
help="Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold. Otherwise, use cache aware",
)
parser.add_argument(
f"--{prefix}eviction-interval",
......@@ -110,7 +117,8 @@ class RouterArgs:
port=args.port,
policy=getattr(args, f"{prefix}policy"),
cache_threshold=getattr(args, f"{prefix}cache_threshold"),
cache_routing_prob=getattr(args, f"{prefix}cache_routing_prob"),
balance_abs_threshold=getattr(args, f"{prefix}balance_abs_threshold"),
balance_rel_threshold=getattr(args, f"{prefix}balance_rel_threshold"),
eviction_interval=getattr(args, f"{prefix}eviction_interval"),
max_tree_size=getattr(args, f"{prefix}max_tree_size"),
)
......@@ -150,7 +158,8 @@ def launch_router(args: argparse.Namespace) -> Optional[Router]:
host=router_args.host,
port=router_args.port,
cache_threshold=router_args.cache_threshold,
cache_routing_prob=router_args.cache_routing_prob,
balance_abs_threshold=router_args.balance_abs_threshold,
balance_rel_threshold=router_args.balance_rel_threshold,
eviction_interval_secs=router_args.eviction_interval,
max_tree_size=router_args.max_tree_size,
)
......@@ -182,7 +191,7 @@ multi-node setups or when you want to start workers and router separately.
Examples:
python -m sglang_router.launch_router --worker-urls http://worker1:8000 http://worker2:8000
python -m sglang_router.launch_router --worker-urls http://worker1:8000 http://worker2:8000 --cache-threshold 0.7 --cache-routing-prob 0.5
python -m sglang_router.launch_router --worker-urls http://worker1:8000 http://worker2:8000 --cache-threshold 0.7 --balance-abs-threshold 64 --balance-rel-threshold 1.2
""",
formatter_class=CustomHelpFormatter,
......
......@@ -14,15 +14,16 @@ class Router:
policy: Load balancing policy to use. Options:
- PolicyType.Random: Randomly select workers
- PolicyType.RoundRobin: Distribute requests in round-robin fashion
- PolicyType.CacheAware: Distribute requests in cache-aware fashion
- PolicyType.CacheAware: Distribute requests based on cache state and load balance
host: Host address to bind the router server. Default: '127.0.0.1'
port: Port number to bind the router server. Default: 3001
cache_threshold: Cache threshold (0.0-1.0) for cache-aware routing. Routes to cached worker
if the match rate exceeds threshold, otherwise routes to the worker with the smallest
tree. Default: 0.5
cache_routing_prob: Probability of using cache-aware routing (0.0-1.0). Default 1.0 for
full cache-aware routing, suitable for perfectly divided prefix workloads. For uneven
workloads, use a lower value to better distribute requests
balance_abs_threshold: Load balancing is triggered when (max_load - min_load) > abs_threshold
AND max_load > min_load * rel_threshold. Otherwise, use cache aware. Default: 32
balance_rel_threshold: Load balancing is triggered when (max_load - min_load) > abs_threshold
AND max_load > min_load * rel_threshold. Otherwise, use cache aware. Default: 1.0001
eviction_interval_secs: Interval in seconds between cache eviction operations in cache-aware
routing. Default: 60
max_tree_size: Maximum size of the approximation tree for cache-aware routing. Default: 2^24
......@@ -35,7 +36,8 @@ class Router:
host: str = "127.0.0.1",
port: int = 3001,
cache_threshold: float = 0.50,
cache_routing_prob: float = 1.0,
balance_abs_threshold: int = 32,
balance_rel_threshold: float = 1.0001,
eviction_interval_secs: int = 60,
max_tree_size: int = 2**24,
):
......@@ -45,7 +47,8 @@ class Router:
host=host,
port=port,
cache_threshold=cache_threshold,
cache_routing_prob=cache_routing_prob,
balance_abs_threshold=balance_abs_threshold,
balance_rel_threshold=balance_rel_threshold,
eviction_interval_secs=eviction_interval_secs,
max_tree_size=max_tree_size,
)
......
......@@ -18,7 +18,8 @@ struct Router {
worker_urls: Vec<String>,
policy: PolicyType,
cache_threshold: f32,
cache_routing_prob: f32,
balance_abs_threshold: usize,
balance_rel_threshold: f32,
eviction_interval_secs: u64,
max_tree_size: usize,
}
......@@ -32,7 +33,8 @@ impl Router {
host = String::from("127.0.0.1"),
port = 3001,
cache_threshold = 0.50,
cache_routing_prob = 1.0,
balance_abs_threshold = 32,
balance_rel_threshold = 1.0001,
eviction_interval_secs = 60,
max_tree_size = 2usize.pow(24)
))]
......@@ -42,7 +44,8 @@ impl Router {
host: String,
port: u16,
cache_threshold: f32,
cache_routing_prob: f32,
balance_abs_threshold: usize,
balance_rel_threshold: f32,
eviction_interval_secs: u64,
max_tree_size: usize,
) -> PyResult<Self> {
......@@ -52,7 +55,8 @@ impl Router {
worker_urls,
policy,
cache_threshold,
cache_routing_prob,
balance_abs_threshold,
balance_rel_threshold,
eviction_interval_secs,
max_tree_size,
})
......@@ -68,7 +72,8 @@ impl Router {
PolicyType::RoundRobin => router::PolicyConfig::RoundRobinConfig,
PolicyType::CacheAware => router::PolicyConfig::CacheAwareConfig {
cache_threshold: self.cache_threshold,
cache_routing_prob: self.cache_routing_prob,
balance_abs_threshold: self.balance_abs_threshold,
balance_rel_threshold: self.balance_rel_threshold,
eviction_interval_secs: self.eviction_interval_secs,
max_tree_size: self.max_tree_size,
},
......
// src/main.rs
use clap::Parser;
use clap::ValueEnum;
......@@ -42,7 +41,7 @@ struct Args {
help = "Load balancing policy to use for request distribution:\n\
- random: Randomly select workers\n\
- round_robin: Distribute requests in round-robin fashion\n\
- cache_aware: Distribute requests in cache-aware fashion\n"
- cache_aware: Distribute requests based on cache state and load balance\n"
)]
policy: PolicyType,
......@@ -57,12 +56,21 @@ struct Args {
#[arg(
long,
default_value_t = 1.0,
default_value_t = 32,
requires = "policy",
required_if_eq("policy", "cache_aware"),
help = "Probability of using cache-aware routing (0.0-1.0). Default 1.0 for full cache-aware routing, suitable for perfectly divided prefix workloads. For uneven workloads, use a lower value to better distribute requests"
help = "Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold. Otherwise, use cache aware. Default: 32"
)]
cache_routing_prob: f32,
balance_abs_threshold: usize,
#[arg(
long,
default_value_t = 1.0001,
requires = "policy",
required_if_eq("policy", "cache_aware"),
help = "Load balancing is triggered when (max_load - min_load) > abs_threshold AND max_load > min_load * rel_threshold. Otherwise, use cache aware. Default: 1.0001"
)]
balance_rel_threshold: f32,
#[arg(
long,
......@@ -90,7 +98,8 @@ impl Args {
PolicyType::RoundRobin => PolicyConfig::RoundRobinConfig,
PolicyType::CacheAware => PolicyConfig::CacheAwareConfig {
cache_threshold: self.cache_threshold,
cache_routing_prob: self.cache_routing_prob,
balance_abs_threshold: self.balance_abs_threshold,
balance_rel_threshold: self.balance_rel_threshold,
eviction_interval_secs: self.eviction_interval_secs,
max_tree_size: self.max_tree_size,
},
......
......@@ -23,65 +23,73 @@ pub enum Router {
},
CacheAware {
/*
Cache-Aware Load Balancing Router
This router combines two strategies to optimize both cache utilization and request distribution:
1. Cache-Aware Routing (Approximate Tree)
2. Load Balancing (Shortest Queue)
For each incoming request, the router chooses between these strategies:
- With probability P: Uses cache-aware routing
- With probability (1-P): Uses load balancing
where P is configured via `cache_routing_prob`
Strategy Details:
1. Cache-Aware Routing (Approximate Tree)
-------------------------------------------
This strategy maintains an approximate radix tree for each worker based on request history,
eliminating the need for direct cache state queries. The tree stores raw text characters
instead of token IDs to avoid tokenization overhead.
Process:
a. For each request, find the worker with the highest prefix match
b. If match rate > cache_threshold:
Route to the worker with highest match (likely has relevant data cached)
c. If match rate ≤ cache_threshold:
Route to the worker with smallest tree size (most available cache capacity)
d. Background maintenance:
Periodically evict least recently used leaf nodes to prevent memory overflow
2. Load Balancing (Shortest Queue)
-------------------------------------------
This strategy tracks pending request counts per worker and routes new requests
to the least busy worker for optimal load distribution.
Configuration Parameters:
------------------------
1. cache_routing_prob: (float, 0.0 to 1.0)
- 0.0: Exclusively use load balancing
- 1.0: Exclusively use cache-aware routing
- Between 0-1: Probability of using cache-aware routing vs load balancing
2. cache_threshold: (float, 0.0 to 1.0)
Minimum prefix match ratio to use highest-match routing.
Below this threshold, routes to worker with most available cache space.
3. eviction_interval_secs: (integer)
Interval between LRU eviction cycles for the approximate trees.
4. max_tree_size: (integer)
Maximum nodes per tree. When exceeded, LRU leaf nodes are evicted
during the next eviction cycle.
Cache-Aware Load Balancing Router
This router combines two strategies to optimize both cache utilization and request distribution:
1. Cache-Aware Routing (Approximate Tree)
2. Load Balancing (Shortest Queue with Balance Thresholds)
The router dynamically switches between these strategies based on load conditions:
- Uses load balancing when the system is imbalanced
- Uses cache-aware routing when the system is balanced
A system is considered imbalanced if both conditions are met:
1. (max - min) > abs_threshold
2. max > rel_threshold * min
Strategy Details:
1. Cache-Aware Routing (Approximate Tree)
-------------------------------------------
This strategy maintains an approximate radix tree for each worker based on request history,
eliminating the need for direct cache state queries. The tree stores raw text characters
instead of token IDs to avoid tokenization overhead.
Process:
a. For each request, find the worker with the highest prefix match
b. If match rate > cache_threshold:
Route to the worker with highest match (likely has relevant data cached)
c. If match rate ≤ cache_threshold:
Route to the worker with smallest tree size (most available cache capacity)
d. Background maintenance:
Periodically evict least recently used leaf nodes to prevent memory overflow
2. Load Balancing (Shortest Queue)
-------------------------------------------
This strategy tracks pending request counts per worker and routes new requests
to the least busy worker when the system is detected to be imbalanced.
Configuration Parameters:
------------------------
1. cache_threshold: (float, 0.0 to 1.0)
Minimum prefix match ratio to use highest-match routing.
Below this threshold, routes to worker with most available cache space.
2. balance_abs_threshold: (integer)
Absolute difference threshold for load imbalance detection.
System is potentially imbalanced if (max_load - min_load) > abs_threshold
3. balance_rel_threshold: (float)
Relative ratio threshold for load imbalance detection.
System is potentially imbalanced if max_load > min_load * rel_threshold
Used in conjunction with abs_threshold to determine final imbalance state.
4. eviction_interval_secs: (integer)
Interval between LRU eviction cycles for the approximate trees.
5. max_tree_size: (integer)
Maximum nodes per tree. When exceeded, LRU leaf nodes are evicted
during the next eviction cycle.
*/
worker_urls: Vec<String>,
tree: Arc<Mutex<Tree>>,
running_queue: Arc<Mutex<HashMap<String, usize>>>,
processed_queue: Arc<Mutex<HashMap<String, usize>>>,
cache_threshold: f32,
cache_routing_prob: f32,
_eviction_thread: Option<thread::JoinHandle<()>>, // Store thread handle
balance_abs_threshold: usize,
balance_rel_threshold: f32,
_eviction_thread: Option<thread::JoinHandle<()>>,
},
}
......@@ -91,7 +99,8 @@ pub enum PolicyConfig {
RoundRobinConfig,
CacheAwareConfig {
cache_threshold: f32,
cache_routing_prob: f32,
balance_abs_threshold: usize,
balance_rel_threshold: f32,
eviction_interval_secs: u64,
max_tree_size: usize,
},
......@@ -128,7 +137,8 @@ impl Router {
},
PolicyConfig::CacheAwareConfig {
cache_threshold,
cache_routing_prob,
balance_abs_threshold,
balance_rel_threshold,
eviction_interval_secs,
max_tree_size,
} => {
......@@ -149,6 +159,7 @@ impl Router {
// Create background eviction thread
let tree_clone = Arc::clone(&tree);
let processed_queue_clone = Arc::clone(&processed_queue);
let running_queue_clone = Arc::clone(&running_queue);
let eviction_thread = thread::spawn(move || {
loop {
// Sleep for the specified interval
......@@ -161,6 +172,10 @@ impl Router {
// Print the process queue
let locked_processed_queue = processed_queue_clone.lock().unwrap();
println!("Processed Queue: {:?}", locked_processed_queue);
// Print the running queue
let locked_running_queue = running_queue_clone.lock().unwrap();
println!("Running Queue: {:?}", locked_running_queue);
}
});
......@@ -174,7 +189,8 @@ impl Router {
running_queue,
processed_queue,
cache_threshold,
cache_routing_prob,
balance_abs_threshold,
balance_rel_threshold,
_eviction_thread: Some(eviction_thread),
}
}
......@@ -203,8 +219,6 @@ impl Router {
route: &str,
) -> HttpResponse {
let text = get_text_from_request(&body, route);
// For Debug
// println!("text: {:?}, route: {:?}", text, route);
let worker_url = match self {
Router::RoundRobin {
......@@ -218,7 +232,6 @@ impl Router {
|x| Some((x + 1) % worker_urls.len()),
)
.unwrap();
worker_urls[idx].clone()
}
......@@ -232,19 +245,42 @@ impl Router {
running_queue,
processed_queue,
cache_threshold,
cache_routing_prob,
balance_abs_threshold,
balance_rel_threshold,
..
} => {
// even though the tree is thread-safe, we still put a lock to ensure the whole op (tree read + queue read + tree write + queue write) is atomic to handle some edge cases (e.g. multiple requests with long prefix entering at the same time)
// TODO: delay scheduling if cache hit rate is high because it may cause imbalance. prioritize low hit rate ones
let mut tree = tree.lock().unwrap();
let mut running_queue = running_queue.lock().unwrap();
// Generate a random float between 0 and 1 for probability check
let sampled_p: f32 = rand::random();
let selected_url = if sampled_p < *cache_routing_prob {
// Cache-aware routing logic
// Get current load statistics
let max_load = *running_queue.values().max().unwrap_or(&0);
let min_load = *running_queue.values().min().unwrap_or(&0);
// Load is considered imbalanced if:
// 1. (max - min) > abs_threshold AND
// 2. max > rel_threshold * min
let is_imbalanced = max_load.saturating_sub(min_load) > *balance_abs_threshold
&& (max_load as f32) > (min_load as f32 * balance_rel_threshold);
let selected_url = if is_imbalanced {
// Log load balancing trigger and current queue state
println!(
"Load balancing triggered due to workload imbalance:\n\
Max load: {}, Min load: {}\n\
Current running queue: {:?}",
max_load, min_load, running_queue
);
// Use shortest queue routing when load is imbalanced
running_queue
.iter()
.min_by_key(|(_url, &count)| count)
.map(|(url, _)| url.clone())
.unwrap_or_else(|| worker_urls[0].clone())
} else {
// Use cache-aware routing when load is balanced
let (matched_text, matched_worker) = tree.prefix_match(&text);
let matched_rate =
matched_text.chars().count() as f32 / text.chars().count() as f32;
......@@ -252,36 +288,18 @@ impl Router {
if matched_rate > *cache_threshold {
matched_worker.to_string()
} else {
// For Debug
// let m_map: HashMap<String, usize> = tree
// .tenant_char_count
// .iter()
// .map(|entry| (entry.key().clone(), *entry.value()))
// .collect();
// println!("map: {:?}, mmap: {:?}", tree.get_tenant_char_count(), m_map);
tree.get_smallest_tenant()
}
} else {
// Shortest queue routing logic
running_queue
.iter()
.min_by_key(|(_url, &count)| count)
.map(|(url, _)| url.clone())
.unwrap_or_else(|| worker_urls[0].clone())
};
// Update running queue
let count = running_queue.get_mut(&selected_url).unwrap();
*count += 1;
// Update processed queue
let mut locked_processed_queue = processed_queue.lock().unwrap();
let count = locked_processed_queue.get_mut(&selected_url).unwrap();
*count += 1;
// Update queues and tree
*running_queue.get_mut(&selected_url).unwrap() += 1;
// Update tree with the new request
*processed_queue
.lock()
.unwrap()
.get_mut(&selected_url)
.unwrap() += 1;
tree.insert(&text, &selected_url);
selected_url
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment