Make request payload size configurable (#2444)

Signed-off-by: Ata Fatahi <immrata@gmail.com>

Make request payload size configurable (#2444)
Signed-off-by: Ata Fatahi <immrata@gmail.com>
2ac36b9a · Ata Fatahi · GitHub · 2d60a5ee · 2ac36b9a · 2ac36b9a
Unverified Commit 2ac36b9a authored Dec 11, 2024 by Ata Fatahi Committed by GitHub Dec 11, 2024
7 changed files
--- a/.github/workflows/pr-test-rust.yml
+++ b/.github/workflows/pr-test-rust.yml
@@ -60,6 +60,7 @@ jobs:
          pip install --force-reinstall dist/*.whl
      - name: Run e2e test
        run: |
+          bash scripts/killall_sglang.sh
          cd rust/py_test
          python3 run_suite.py

--- a/rust/py_src/sglang_router/launch_router.py
+++ b/rust/py_src/sglang_router/launch_router.py
@@ -38,6 +38,7 @@ class RouterArgs:
    balance_rel_threshold: float = 1.0001
    eviction_interval: int = 60
    max_tree_size: int = 2**24
+    max_payload_size: int = 4 * 1024 * 1024  # 4MB
    verbose: bool = False
    @staticmethod
@@ -116,6 +117,12 @@ class RouterArgs:
            default=RouterArgs.max_tree_size,
            help="Maximum size of the approximation tree for cache-aware routing",
        )
+        parser.add_argument(
+            f"--{prefix}max-payload-size",
+            type=int,
+            default=RouterArgs.max_payload_size,
+            help="Maximum payload size in bytes",
+        )
        parser.add_argument(
            f"--{prefix}verbose",
            action="store_true",
@@ -144,6 +151,7 @@ class RouterArgs:
            balance_rel_threshold=getattr(args, f"{prefix}balance_rel_threshold"),
            eviction_interval=getattr(args, f"{prefix}eviction_interval"),
            max_tree_size=getattr(args, f"{prefix}max_tree_size"),
+            max_payload_size=getattr(args, f"{prefix}max_payload_size"),
            verbose=getattr(args, f"{prefix}verbose", False),
        )
@@ -187,6 +195,7 @@ def launch_router(args: argparse.Namespace) -> Optional[Router]:
            balance_rel_threshold=router_args.balance_rel_threshold,
            eviction_interval_secs=router_args.eviction_interval,
            max_tree_size=router_args.max_tree_size,
+            max_payload_size=router_args.max_payload_size,
            verbose=router_args.verbose,
        )
@@ -194,7 +203,7 @@ def launch_router(args: argparse.Namespace) -> Optional[Router]:
        return router
    except Exception as e:
-        logger.error(f"Error starting router: {e}", file=sys.stderr)
+        logger.error(f"Error starting router: {e}")
        return None

--- a/rust/py_src/sglang_router/router.py
+++ b/rust/py_src/sglang_router/router.py
@@ -26,6 +26,7 @@ class Router:
            AND max_load > min_load * rel_threshold. Otherwise, use cache aware. Default: 1.0001
        eviction_interval_secs: Interval in seconds between cache eviction operations in cache-aware
            routing. Default: 60
+        max_payload_size: Maximum payload size in bytes. Default: 4MB
        max_tree_size: Maximum size of the approximation tree for cache-aware routing. Default: 2^24
        verbose: Enable verbose logging. Default: False
    """
@@ -41,6 +42,7 @@ class Router:
        balance_rel_threshold: float = 1.0001,
        eviction_interval_secs: int = 60,
        max_tree_size: int = 2**24,
+        max_payload_size: int = 4 * 1024 * 1024,  # 4MB
        verbose: bool = False,
    ):
        self._router = _Router(
@@ -53,6 +55,7 @@ class Router:
            balance_rel_threshold=balance_rel_threshold,
            eviction_interval_secs=eviction_interval_secs,
            max_tree_size=max_tree_size,
+            max_payload_size=max_payload_size,
            verbose=verbose,
        )

--- a/rust/py_test/test_launch_router.py
+++ b/rust/py_test/test_launch_router.py
@@ -35,6 +35,7 @@ class TestLaunchRouter(unittest.TestCase):
            balance_rel_threshold=1.0001,
            eviction_interval=60,
            max_tree_size=2**24,
+            max_payload_size=4 * 1024 * 1024,  # 4MB
            verbose=False,
        )

--- a/rust/py_test/test_launch_server.py
+++ b/rust/py_test/test_launch_server.py
@@ -21,6 +21,7 @@ def popen_launch_router(
    dp_size: int,
    timeout: float,
    policy: str = "cache_aware",
+    max_payload_size: int = None,
 ):
    """
    Launch the router server process.
@@ -31,6 +32,7 @@ def popen_launch_router(
        dp_size: Data parallel size
        timeout: Server launch timeout
        policy: Router policy, one of "cache_aware", "round_robin", "random"
+        max_payload_size: Maximum payload size in bytes
    """
    _, host, port = base_url.split(":")
    host = host[2:]
@@ -46,13 +48,16 @@ def popen_launch_router(
        "--port",
        port,
        "--dp",
-        str(dp_size),  # Convert dp_size to string
+        str(dp_size),
        "--router-eviction-interval",
-        "5",  # frequent eviction for testing
+        "5",
        "--router-policy",
        policy,
    ]
+    if max_payload_size is not None:
+        command.extend(["--router-max-payload-size", str(max_payload_size)])
    process = subprocess.Popen(command, stdout=None, stderr=None)
    start_time = time.time()
@@ -280,6 +285,54 @@ class TestLaunchServer(unittest.TestCase):
        msg = f"MMLU test {'passed' if passed else 'failed'} with score {score:.3f} (threshold: {THRESHOLD})"
        self.assertGreaterEqual(score, THRESHOLD, msg)
+    def test_4_payload_size(self):
+        print("Running test_4_payload_size...")
+        # Start router with 3MB limit
+        self.process = popen_launch_router(
+            self.model,
+            self.base_url,
+            dp_size=1,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            policy="round_robin",
+            max_payload_size=1 * 1024 * 1024,  # 1MB limit
+        )
+        # Test case 1: Payload just under 1MB should succeed
+        payload_0_5_mb = {
+            "text": "x" * int(0.5 * 1024 * 1024),  # 0.5MB of text
+            "temperature": 0.0,
+        }
+        with requests.Session() as session:
+            response = session.post(
+                f"{self.base_url}/generate",
+                json=payload_0_5_mb,
+                headers={"Content-Type": "application/json"},
+            )
+            self.assertEqual(
+                response.status_code,
+                200,
+                f"0.5MB payload should succeed but got status {response.status_code}",
+            )
+        # Test case 2: Payload over 1MB should fail
+        payload_1_plus_mb = {
+            "text": "x" * int((1.2 * 1024 * 1024)),  # 1.2MB of text
+            "temperature": 0.0,
+        }
+        with requests.Session() as session:
+            response = session.post(
+                f"{self.base_url}/generate",
+                json=payload_1_plus_mb,
+                headers={"Content-Type": "application/json"},
+            )
+            self.assertEqual(
+                response.status_code,
+                413,  # Payload Too Large
+                f"1.2MB payload should fail with 413 but got status {response.status_code}",
+            )
 if __name__ == "__main__":
    unittest.main()
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -22,6 +22,7 @@ struct Router {
    balance_rel_threshold: f32,
    eviction_interval_secs: u64,
    max_tree_size: usize,
+    max_payload_size: usize,
    verbose: bool,
 }
@@ -38,6 +39,7 @@ impl Router {
        balance_rel_threshold = 1.0001,
        eviction_interval_secs = 60,
        max_tree_size = 2usize.pow(24),
+        max_payload_size = 4 * 1024 * 1024,
        verbose = false
    ))]
    fn new(
@@ -50,6 +52,7 @@ impl Router {
        balance_rel_threshold: f32,
        eviction_interval_secs: u64,
        max_tree_size: usize,
+        max_payload_size: usize,
        verbose: bool,
    ) -> PyResult<Self> {
        Ok(Router {
@@ -62,6 +65,7 @@ impl Router {
            balance_rel_threshold,
            eviction_interval_secs,
            max_tree_size,
+            max_payload_size,
            verbose,
        })
    }
@@ -86,6 +90,7 @@ impl Router {
                worker_urls: self.worker_urls.clone(),
                policy_config,
                verbose: self.verbose,
+                max_payload_size: self.max_payload_size,
            })
            .await
            .unwrap();

--- a/rust/src/server.rs
+++ b/rust/src/server.rs
@@ -127,6 +127,7 @@ pub struct ServerConfig {
    pub worker_urls: Vec<String>,
    pub policy_config: PolicyConfig,
    pub verbose: bool,
+    pub max_payload_size: usize,
 }
 pub async fn startup(config: ServerConfig) -> std::io::Result<()> {
@@ -164,10 +165,16 @@ pub async fn startup(config: ServerConfig) -> std::io::Result<()> {
    info!("✅ Starting router on {}:{}", config.host, config.port);
    info!("✅ Serving Worker URLs: {:?}", config.worker_urls);
    info!("✅ Policy Config: {:?}", config.policy_config);
+    info!(
+        "✅ Max payload size: {} MB",
+        config.max_payload_size / (1024 * 1024)
+    );
    HttpServer::new(move || {
        App::new()
            .app_data(app_state.clone())
+            .app_data(web::JsonConfig::default().limit(config.max_payload_size))
+            .app_data(web::PayloadConfig::default().limit(config.max_payload_size))
            .service(generate)
            .service(v1_chat_completions)
            .service(v1_completions)