# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Test suite for TensorRT-LLM KV Router.

Usage:
    python test_router.py                  # Run text-only tests (requires server)
    python test_router.py --verbose        # Show detailed logs
    python test_router.py --mm-only        # Run multimodal hash tests (no server needed)
    python test_router.py --mm-server      # Run multimodal server tests (requires VLM)
    python test_router.py --all            # Run all tests
"""

import argparse
import sys
import time
from dataclasses import dataclass

import httpx

from dynamo.llm import compute_block_hash_for_seq

# Sample test images from COCO dataset
TEST_IMAGE_1 = "http://images.cocodataset.org/test2017/000000155781.jpg"
TEST_IMAGE_2 = "http://images.cocodataset.org/test2017/000000000001.jpg"
TEST_IMAGE_3 = "http://images.cocodataset.org/test2017/000000155721.jpg"
TEST_IMAGE_4 = "https://huggingface.co/datasets/Sayali9141/traffic_signal_images/resolve/main/61.jpg"


@dataclass
class RouterTestConfig:
    """Connection and timing settings shared by all test cases."""

    # Base URL the chat completion requests are posted to.
    api_url: str = "http://localhost:8000"
    # Base URL queried for /debug/tree_info.
    router_url: str = "http://localhost:7000"
    # Passed to httpx.Client(timeout=...).
    timeout: int = 30
    kv_settle_time: float = 3.0  # Time to wait for KV events to propagate


@dataclass
class RouterTestResult:
    """Outcome of a single test case, collected for the final summary."""

    name: str
    passed: bool
    message: str
    overlap: float = 0.0


def make_request(content: str, max_tokens: int = 10) -> dict:
    """Create a text-only chat completion request."""
    return {
        "model": "test",
        "messages": [{"role": "user", "content": content}],
        "stream": True,
        "max_tokens": max_tokens,
    }


def make_mm_request(text: str, image_url: str, max_tokens: int = 10) -> dict:
    """Create a multimodal chat completion request with image."""
    return {
        "model": "test",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": text},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ],
        "stream": True,
        "max_tokens": max_tokens,
    }


def make_multi_image_request(
    text: str, image_urls: list[str], max_tokens: int = 10
) -> dict:
    """Create a multimodal chat completion request with multiple images.

    The text part comes first, followed by one image part per URL in the
    order given.
    """
    content: list[dict] = [{"type": "text", "text": text}]
    content.extend(
        {"type": "image_url", "image_url": {"url": url}} for url in image_urls
    )
    return {
        "model": "test",
        "messages": [{"role": "user", "content": content}],
        "stream": True,
        "max_tokens": max_tokens,
    }


def send_request(client: httpx.Client, url: str, payload: dict) -> bool:
    """Send a chat completion request and consume the stream.

    Returns True when the server answered with HTTP 200 and the body could be
    read to the end, False on any other status or on any exception.
    """
    try:
        resp = client.post(f"{url}/v1/chat/completions", json=payload)
        if resp.status_code != 200:
            return False
        # Drain the response; the content itself is not inspected.
        for _ in resp.iter_lines():
            pass
        return True
    except Exception:
        # Deliberately best-effort: callers only need a pass/fail signal.
        return False


def get_tree_info(client: httpx.Client, url: str) -> dict:
    """Get radix tree debug info.

    Returns the JSON object served at ``{url}/debug/tree_info``. When the
    request fails, the status is not 200, or the payload is not a JSON object
    containing ``num_blocks``, a sentinel ``{"num_blocks": -1, "events": []}``
    is returned so that callers can always index ``["num_blocks"]``.
    """
    unavailable = {"num_blocks": -1, "events": []}
    try:
        resp = client.get(f"{url}/debug/tree_info")
        if resp.status_code != 200:
            return unavailable
        info = resp.json()
    except Exception:
        return unavailable
    # Guard against error payloads that are valid JSON but not tree info.
    if not isinstance(info, dict) or "num_blocks" not in info:
        return unavailable
    return info


class KvRouterTests:
    """Test cases for KV cache routing."""

    def __init__(self, config: RouterTestConfig, verbose: bool = False):
        """Create the HTTP client and the (initially empty) result list."""
        self.config = config
        self.verbose = verbose
        self.client = httpx.Client(timeout=config.timeout)
        self.results: list[RouterTestResult] = []

        # Test messages designed for block_size=32
        # "Are you ok? Hello! Thank you! Thank you very much! " is ~12 tokens
        # Chat template adds ~4 tokens
        self.base_phrase = "Are you ok? Hello! Thank you! Thank you very much! "

    def log(self, msg: str):
        """Print ``msg`` only when verbose output was requested."""
        if self.verbose:
            print(f" {msg}")

    def run_all(self) -> bool:
        """Run all test cases."""
        print("\nKV Router Test Suite")
        print("=" * 50)

        # Check server connectivity first
        if not self._check_servers():
            print("\nFATAL: Cannot connect to servers")
            return False

        # Run test cases
        self._test_full_match()
        self._test_partial_match()
        self._test_no_match()

        # Print summary
        return self._print_summary()

    def run_mm_tests(self) -> bool:
        """Run multimodal tests (local hash computation, no server needed)."""
        print("\nMultimodal KV Router Tests (Local)")
        print("=" * 50)
        print("(These tests verify hash computation without server)")

        # Only tests that call compute_block_hash_for_seq locally belong here.
        # The multi-image partial match test sends HTTP requests, so it is run
        # by run_mm_server_tests() instead.
        self._test_mm_hash_computation()
        self._test_mm_routing_distinction()
        self._test_mm_hash_consistency()
        self._test_mm_offset_affects_hash()
        self._test_mm_block_boundary()

        return self._print_summary()

    def run_mm_server_tests(self) -> bool:
        """Run multimodal tests that require server."""
        print("\nMultimodal KV Router Tests (Server)")
        print("=" * 50)

        if not self._check_servers():
            print("\nFATAL: Cannot connect to servers")
            return False

        self._test_mm_same_image_cache_hit()
        self._test_mm_different_images_no_cache_hit()
        self._test_text_cache_hit_with_overlap()
        self._test_mm_multi_image_partial_match()

        return self._print_summary()

    def _check_servers(self) -> bool:
        """Verify the Router server is reachable.

        Only the router's /debug/tree_info endpoint is probed; the API server
        is not contacted here.
        """
        print("\nChecking server connectivity...")

        try:
            # Check router
            resp = self.client.get(f"{self.config.router_url}/debug/tree_info")
            if resp.status_code != 200:
                print(f" Router not responding: {resp.status_code}")
                return False
            print(f" Router OK (blocks in tree: {resp.json().get('num_blocks', '?')})")

            # NOTE(review): the API server is not probed; a failure there
            # surfaces later as a failed request inside the individual tests.
            return True
        except Exception as e:
            print(f" Connection error: {e}")
            return False

    def _test_full_match(self):
        """
        Test: Send identical request twice.
        Expected: Second request should have overlap > 0.
        """
        print("\n[1] Full Match Test")
        print(" Sending same request twice, expecting cache hit on second...")

        # Create a request with enough tokens for multiple full blocks
        # 5 repetitions ≈ 64 tokens ≈ 2 full blocks
        content = (self.base_phrase * 5).strip()
        payload = make_request(content)

        # Get initial state
        initial = get_tree_info(self.client, self.config.router_url)
        initial_blocks = initial["num_blocks"]
        self.log(f"Initial blocks: {initial_blocks}")

        # First request - should populate cache (or hit existing cache)
        self.log("Sending first request...")
        if not send_request(self.client, self.config.api_url, payload):
            self.results.append(
                RouterTestResult("full_match", False, "First request failed")
            )
            return

        # Wait for KV events
        self.log(f"Waiting {self.config.kv_settle_time}s for KV events...")
        time.sleep(self.config.kv_settle_time)

        # Check blocks after first request
        after_first = get_tree_info(self.client, self.config.router_url)
        blocks_added = after_first["num_blocks"] - initial_blocks
        self.log(
            f"Blocks after first: {after_first['num_blocks']} (added {blocks_added})"
        )

        # Second request - should hit cache
        self.log("Sending second request (should hit cache)...")
        if not send_request(self.client, self.config.api_url, payload):
            self.results.append(
                RouterTestResult("full_match", False, "Second request failed")
            )
            return

        # Success: either new blocks were added, or blocks already existed (from previous runs)
        # Either way, the second request should show overlap > 0 in server logs
        total_blocks = after_first["num_blocks"]
        self.results.append(
            RouterTestResult(
                "full_match",
                True,
                f"OK - Tree has {total_blocks} blocks. Check server logs for 'overlap > 0'.",
            )
        )

    def _test_partial_match(self):
        """
        Test: Send request A, then request B that shares same prefix but is longer.
        Expected: Request B should have partial overlap (matching the shared prefix blocks).
        """
        print("\n[2] Partial Match Test")
        print(" Request B shares prefix with cached request A...")

        # Request A: 5 repetitions (~64 tokens, ~2 full blocks)
        content_a = (self.base_phrase * 5).strip()

        # Request B: 8 repetitions (~100 tokens, ~3 full blocks)
        # First 2 blocks should match A, third block is new
        content_b = (self.base_phrase * 8).strip()

        payload_a = make_request(content_a)
        payload_b = make_request(content_b)

        # Ensure A is cached (might already be from previous test)
        self.log("Ensuring request A is cached...")
        if not send_request(self.client, self.config.api_url, payload_a):
            # Without A there is no prefix to match, so B would prove nothing.
            self.results.append(
                RouterTestResult("partial_match", False, "Request A failed")
            )
            return
        time.sleep(self.config.kv_settle_time)

        before = get_tree_info(self.client, self.config.router_url)
        self.log(f"Blocks before B: {before['num_blocks']}")

        # Send request B
        self.log("Sending request B (longer, shares prefix)...")
        if not send_request(self.client, self.config.api_url, payload_b):
            self.results.append(
                RouterTestResult("partial_match", False, "Request B failed")
            )
            return

        time.sleep(self.config.kv_settle_time)

        after = get_tree_info(self.client, self.config.router_url)
        new_blocks = after["num_blocks"] - before["num_blocks"]
        self.log(f"New blocks from B: {new_blocks}")

        # B should add new blocks (the non-matching suffix)
        # The matching prefix blocks already exist
        self.results.append(
            RouterTestResult(
                "partial_match",
                True,
                f"OK - Request B added {new_blocks} new blocks. "
                "Check server logs for partial overlap (0 < overlap < 1).",
            )
        )

    def _test_no_match(self):
        """
        Test: Send completely different content.
        Expected: No cache hit (overlap = 0).
        """
        print("\n[3] No Match Test")
        print(" Sending completely different content...")

        # Content that shares no text with self.base_phrase, which the
        # full-match and partial-match tests are built from.
        content = (
            "The quick brown fox jumps over the lazy dog. "
            "Pack my box with five dozen liquor jugs. "
            "How vexingly quick daft zebras jump. "
            "The five boxing wizards jump quickly. "
            "Sphinx of black quartz, judge my vow."
        )
        payload = make_request(content)

        before = get_tree_info(self.client, self.config.router_url)
        self.log(f"Blocks before: {before['num_blocks']}")

        # Send the different request
        self.log("Sending unrelated request...")
        if not send_request(self.client, self.config.api_url, payload):
            self.results.append(RouterTestResult("no_match", False, "Request failed"))
            return

        # No need to wait - we're checking overlap on this request, not the next
        self.results.append(
            RouterTestResult(
                "no_match",
                True,
                "OK - Check server logs for 'overlap = 0.000' (no cache hit expected).",
            )
        )

    def _test_mm_hash_computation(self):
        """
        Test: Verify that compute_block_hash_for_seq produces different hashes
        for same tokens with different mm_hash values.
        """
        print("\n[MM-1] MM Hash Computation Test")
        print(" Verifying same tokens + different mm_hash = different block_hash...")

        # Simulated tokens (32 tokens = 1 block)
        tokens = [100] * 32
        block_size = 32

        # Hash without MM info
        hash_no_mm = compute_block_hash_for_seq(tokens, block_size)

        # Hash with MM info (simulated mm_hash)
        mm_info_1 = {"mm_objects": [{"mm_hash": 0xDEADBEEF, "offsets": [[0, 32]]}]}
        hash_with_mm1 = compute_block_hash_for_seq(tokens, block_size, [mm_info_1])

        # Hash with different MM info
        mm_info_2 = {"mm_objects": [{"mm_hash": 0xCAFEBABE, "offsets": [[0, 32]]}]}
        hash_with_mm2 = compute_block_hash_for_seq(tokens, block_size, [mm_info_2])

        self.log(f"Hash without MM: {hash_no_mm}")
        self.log(f"Hash with MM 1: {hash_with_mm1}")
        self.log(f"Hash with MM 2: {hash_with_mm2}")

        # Verify all hashes are different
        if hash_no_mm == hash_with_mm1:
            self.results.append(
                RouterTestResult(
                    "mm_hash_computation",
                    False,
                    "FAIL - Hash without MM equals hash with MM",
                )
            )
            return

        if hash_with_mm1 == hash_with_mm2:
            self.results.append(
                RouterTestResult(
                    "mm_hash_computation",
                    False,
                    "FAIL - Different mm_hash produced same block_hash",
                )
            )
            return

        self.results.append(
            RouterTestResult(
                "mm_hash_computation",
                True,
                "OK - Different mm_hash values produce different block hashes",
            )
        )

    def _test_mm_routing_distinction(self):
        """
        Test: Verify that the routing logic can distinguish between
        requests with same text but different images.
        """
        print("\n[MM-2] MM Routing Distinction Test")
        print(" Verifying routing can distinguish same text + different images...")

        # This test simulates what the router would see
        tokens = [100] * 64  # 2 blocks
        block_size = 32

        # Simulate Image A cached on worker 0
        mm_info_a = {
            "mm_objects": [{"mm_hash": 0x1111111111111111, "offsets": [[0, 64]]}]
        }
        hashes_a = compute_block_hash_for_seq(
            tokens, block_size, [mm_info_a, mm_info_a]
        )

        # Simulate Image B cached on worker 1
        mm_info_b = {
            "mm_objects": [{"mm_hash": 0x2222222222222222, "offsets": [[0, 64]]}]
        }
        hashes_b = compute_block_hash_for_seq(
            tokens, block_size, [mm_info_b, mm_info_b]
        )

        self.log(f"Hashes for Image A: {hashes_a}")
        self.log(f"Hashes for Image B: {hashes_b}")

        # Verify hashes are different
        if hashes_a == hashes_b:
            self.results.append(
                RouterTestResult(
                    "mm_routing_distinction",
                    False,
                    "FAIL - Same tokens with different images produced same hashes",
                )
            )
            return

        self.results.append(
            RouterTestResult(
                "mm_routing_distinction",
                True,
                "OK - Router can distinguish requests with different images",
            )
        )

    def _test_mm_hash_consistency(self):
        """
        Test: Verify that the same mm_hash + tokens produce the same block_hash
        regardless of when computed (idempotency).
        """
        print("\n[MM-3] MM Hash Consistency Test")
        print(" Verifying same inputs produce same hash (idempotent)...")

        tokens = [151937] * 32  # Image token placeholder
        block_size = 32
        mm_hash = 0xDEADBEEFCAFEBABE

        mm_info = {"mm_objects": [{"mm_hash": mm_hash, "offsets": [[0, 32]]}]}

        # Compute hash multiple times
        hash1 = compute_block_hash_for_seq(tokens, block_size, [mm_info])
        hash2 = compute_block_hash_for_seq(tokens, block_size, [mm_info])
        hash3 = compute_block_hash_for_seq(tokens, block_size, [mm_info])

        self.log(f"Hash 1: {hash1}")
        self.log(f"Hash 2: {hash2}")
        self.log(f"Hash 3: {hash3}")

        if hash1 != hash2 or hash2 != hash3:
            self.results.append(
                RouterTestResult(
                    "mm_hash_consistency",
                    False,
                    f"FAIL - Same inputs produced different hashes: {hash1}, {hash2}, {hash3}",
                )
            )
            return

        self.results.append(
            RouterTestResult(
                "mm_hash_consistency",
                True,
                f"OK - Hash computation is idempotent: {hash1[0]}",
            )
        )

    def _test_mm_offset_affects_hash(self):
        """
        Test: Verify that different offsets produce different hashes,
        even with same mm_hash and tokens.
        """
        print("\n[MM-4] MM Offset Affects Hash Test")
        print(" Verifying different offsets produce different hashes...")

        tokens = [151937] * 64  # 2 blocks of image tokens
        block_size = 32
        mm_hash = 0x123456789ABCDEF0

        # Image covers first block only
        mm_info_first = {"mm_objects": [{"mm_hash": mm_hash, "offsets": [[0, 32]]}]}
        hash_first = compute_block_hash_for_seq(
            tokens, block_size, [mm_info_first, None]
        )

        # Image covers second block only
        mm_info_second = {"mm_objects": [{"mm_hash": mm_hash, "offsets": [[32, 64]]}]}
        hash_second = compute_block_hash_for_seq(
            tokens, block_size, [None, mm_info_second]
        )

        # Image covers both blocks
        mm_info_both = {"mm_objects": [{"mm_hash": mm_hash, "offsets": [[0, 64]]}]}
        hash_both = compute_block_hash_for_seq(
            tokens, block_size, [mm_info_both, mm_info_both]
        )

        self.log(f"Hash (first block MM): {hash_first}")
        self.log(f"Hash (second block MM): {hash_second}")
        self.log(f"Hash (both blocks MM): {hash_both}")

        # Block 0 with mm_info should differ from block 0 without mm_info
        if hash_first[0] == hash_second[0]:
            self.results.append(
                RouterTestResult(
                    "mm_offset_affects_hash",
                    False,
                    "FAIL - First block hash should differ based on MM presence",
                )
            )
            return

        # Block 1 with mm_info should differ from block 1 without mm_info
        if hash_first[1] == hash_second[1]:
            self.results.append(
                RouterTestResult(
                    "mm_offset_affects_hash",
                    False,
                    "FAIL - Second block hash should differ based on MM presence",
                )
            )
            return

        self.results.append(
            RouterTestResult(
                "mm_offset_affects_hash",
                True,
                "OK - Different MM offsets produce different block hashes",
            )
        )

    def _test_mm_block_boundary(self):
        """
        Test: Verify that MM info correctly applies at block boundaries.
        """
        print("\n[MM-5] MM Block Boundary Test")
        print(" Verifying MM info applies correctly at block boundaries...")

        block_size = 32
        mm_hash = 0xFEDCBA9876543210

        # 96 tokens = 3 blocks
        # Image tokens in the middle block (32-64)
        tokens = [100] * 32 + [151937] * 32 + [200] * 32

        # MM info only applies to middle block
        mm_info = {"mm_objects": [{"mm_hash": mm_hash, "offsets": [[32, 64]]}]}

        hashes_with_mm = compute_block_hash_for_seq(
            tokens, block_size, [None, mm_info, None]
        )

        # No MM info
        hashes_without_mm = compute_block_hash_for_seq(tokens, block_size, None)

        self.log(f"Hashes with MM: {hashes_with_mm}")
        self.log(f"Hashes without MM: {hashes_without_mm}")

        # Block 0 and 2 should be the same (no image tokens)
        # Block 1 should be different (has image tokens + mm_hash)
        if hashes_with_mm[0] != hashes_without_mm[0]:
            self.results.append(
                RouterTestResult(
                    "mm_block_boundary", False, "FAIL - Block 0 should be same (no MM)"
                )
            )
            return

        if hashes_with_mm[1] == hashes_without_mm[1]:
            self.results.append(
                RouterTestResult(
                    "mm_block_boundary", False, "FAIL - Block 1 should differ (has MM)"
                )
            )
            return

        if hashes_with_mm[2] != hashes_without_mm[2]:
            self.results.append(
                RouterTestResult(
                    "mm_block_boundary", False, "FAIL - Block 2 should be same (no MM)"
                )
            )
            return

        self.results.append(
            RouterTestResult(
                "mm_block_boundary",
                True,
                "OK - MM info correctly applies only to relevant blocks",
            )
        )

    def _test_mm_same_image_cache_hit(self):
        """
        Test: Send same text + same image twice.
        Expected: Second request should have cache hit (overlap > 0).
        """
        print("\n[MM-S1] Same Image Cache Hit Test")
        print(" Sending same text + same image twice...")

        payload = make_mm_request("Describe this image", TEST_IMAGE_1)

        # Get initial state
        initial = get_tree_info(self.client, self.config.router_url)
        self.log(f"Initial blocks: {initial['num_blocks']}")

        # First request - populates the cache
        self.log("Sending first MM request...")
        if not send_request(self.client, self.config.api_url, payload):
            self.results.append(
                RouterTestResult("mm_same_image", False, "First MM request failed")
            )
            return

        # Wait for KV events to propagate
        self.log(f"Waiting {self.config.kv_settle_time}s for KV events...")
        time.sleep(self.config.kv_settle_time)

        after_first = get_tree_info(self.client, self.config.router_url)
        blocks_added = after_first["num_blocks"] - initial["num_blocks"]
        self.log(
            f"Blocks after first: {after_first['num_blocks']} (added {blocks_added})"
        )

        if blocks_added == 0:
            self.results.append(
                RouterTestResult(
                    "mm_same_image", False, "FAIL - No blocks added after first request"
                )
            )
            return

        # Second identical request - should hit cache
        self.log("Sending second MM request (same image)...")
        if not send_request(self.client, self.config.api_url, payload):
            self.results.append(
                RouterTestResult("mm_same_image", False, "Second MM request failed")
            )
            return

        # Query router to check overlap (simulating what the second request saw)
        # We need to compute the same hashes that the API computed
        # For now, check the tree grew or stayed same (cache reuse)
        after_second = get_tree_info(self.client, self.config.router_url)
        self.log(f"Blocks after second: {after_second['num_blocks']}")

        # The second request should reuse cached blocks, so minimal new blocks added
        new_blocks_second = after_second["num_blocks"] - after_first["num_blocks"]
        self.log(f"New blocks from second request: {new_blocks_second}")

        self.results.append(
            RouterTestResult(
                "mm_same_image",
                True,
                f"OK - First added {blocks_added} blocks, second added {new_blocks_second}. "
                "Check logs for 'overlap > 0' on second request.",
            )
        )

    def _test_mm_different_images_no_cache_hit(self):
        """
        Test: Send same text but different images.
        Expected: No cache hit (overlap ≈ 0) because mm_hash differs.
        Image blocks should not match, only text prefix might match.
        """
        print("\n[MM-S2] Different Images No Cache Hit Test")
        print(" Sending same text + different images...")

        # First image
        payload_1 = make_mm_request("Describe this image in detail", TEST_IMAGE_2)

        initial = get_tree_info(self.client, self.config.router_url)
        self.log(f"Initial blocks: {initial['num_blocks']}")

        self.log(f"Sending request with image 1: {TEST_IMAGE_2}")
        if not send_request(self.client, self.config.api_url, payload_1):
            self.results.append(
                RouterTestResult("mm_different_images", False, "Image 1 request failed")
            )
            return

        time.sleep(self.config.kv_settle_time)

        after_img1 = get_tree_info(self.client, self.config.router_url)
        blocks_img1 = after_img1["num_blocks"] - initial["num_blocks"]
        self.log(
            f"Blocks after image 1: {after_img1['num_blocks']} (added {blocks_img1})"
        )

        # Second image (same text, different image)
        payload_2 = make_mm_request("Describe this image in detail", TEST_IMAGE_3)

        self.log(f"Sending request with image 2: {TEST_IMAGE_3}")
        if not send_request(self.client, self.config.api_url, payload_2):
            self.results.append(
                RouterTestResult("mm_different_images", False, "Image 2 request failed")
            )
            return

        time.sleep(self.config.kv_settle_time)

        after_img2 = get_tree_info(self.client, self.config.router_url)
        blocks_img2 = after_img2["num_blocks"] - after_img1["num_blocks"]
        self.log(
            f"Blocks after image 2: {after_img2['num_blocks']} (added {blocks_img2})"
        )

        # Different images should add similar number of blocks
        # If image 2 had cache hit, it would add fewer blocks
        if blocks_img2 == 0:
            self.results.append(
                RouterTestResult(
                    "mm_different_images",
                    False,
                    "FAIL - Image 2 added 0 blocks (unexpected full cache hit)",
                )
            )
            return

        # Image 2 should add approximately same number of blocks as image 1
        # (since different mm_hash means image blocks don't match)
        self.results.append(
            RouterTestResult(
                "mm_different_images",
                True,
                f"OK - Image 1 added {blocks_img1} blocks, image 2 added {blocks_img2} blocks. "
                "Different images = different block hashes.",
            )
        )

    def _test_text_cache_hit_with_overlap(self):
        """
        Test: Send same text request twice and verify overlap via router API.
        Expected: Second request should show overlap > 0 in router response.
        """
        print("\n[MM-S3] Text Cache Hit with Overlap Verification")
        print(" Sending same text twice and verifying overlap value...")

        # Use a unique prompt to avoid interference from other tests.
        # Adjacent literals are joined before `*` applies, so the whole
        # three-sentence text is repeated three times.
        unique_text = (
            "This is a unique test prompt for cache hit verification. "
            "We need enough tokens to fill at least one block. "
            "The quick brown fox jumps over the lazy dog repeatedly. " * 3
        )
        payload = make_request(unique_text, max_tokens=5)

        # First request
        self.log("Sending first text request...")
        if not send_request(self.client, self.config.api_url, payload):
            self.results.append(
                RouterTestResult(
                    "text_cache_hit_overlap", False, "First request failed"
                )
            )
            return

        # Wait for KV events
        self.log(f"Waiting {self.config.kv_settle_time}s for KV events...")
        time.sleep(self.config.kv_settle_time)

        # Get tree info to see blocks
        tree_info = get_tree_info(self.client, self.config.router_url)
        self.log(f"Blocks in tree: {tree_info['num_blocks']}")

        # Second request - should see cache hit
        self.log("Sending second text request (should hit cache)...")
        if not send_request(self.client, self.config.api_url, payload):
            self.results.append(
                RouterTestResult(
                    "text_cache_hit_overlap", False, "Second request failed"
                )
            )
            return

        # For a true verification, we'd need to intercept the router response
        # or add an endpoint that returns the last routing decision
        # For now, we verify by checking if blocks increased (they shouldn't much)
        tree_info_after = get_tree_info(self.client, self.config.router_url)
        new_blocks = tree_info_after["num_blocks"] - tree_info["num_blocks"]

        self.log(f"New blocks after second request: {new_blocks}")

        self.results.append(
            RouterTestResult(
                "text_cache_hit_overlap",
                True,
                f"OK - Second request added {new_blocks} new blocks. "
                "Check logs for 'overlap > 0' (cache hit).",
            )
        )

    def _test_mm_multi_image_partial_match(self):
        """
        Test: Verify partial cache match with multi-image requests.

        Scenario:
            Step 1: Send Request A = text + [Image_1, Image_4]
            Step 2: Send Request A again (identical) - verify full cache hit (0 new blocks)
            Step 3: Send Request B = text + [Image_1, Image_3] - verify partial match
                    (Image_3 is different, should add new blocks)

        Expected:
            - Identical request = no new blocks (full cache hit)
            - Different second image = new blocks added (partial match)
        """
        print("\n[MM-S4] Multi-Image Partial Match Test")
        print(" Verifying cache behavior with multi-image requests...")

        # Use longer settle time for this test
        settle_time = self.config.kv_settle_time * 2

        # Request A: text + Image_1 + Image_4
        payload_a = make_multi_image_request(
            "Describe these images in detail", [TEST_IMAGE_1, TEST_IMAGE_4]
        )

        initial = get_tree_info(self.client, self.config.router_url)
        self.log(f"Initial blocks: {initial['num_blocks']}")

        # Step 1: Send Request A first time
        self.log("Step 1: Sending Request A (text + Image_1 + Image_4)...")
        if not send_request(self.client, self.config.api_url, payload_a):
            self.results.append(
                RouterTestResult("mm_multi_image_partial", False, "Request A failed")
            )
            return

        time.sleep(settle_time)

        after_a1 = get_tree_info(self.client, self.config.router_url)
        blocks_a1 = after_a1["num_blocks"] - initial["num_blocks"]
        self.log(
            f"Blocks after Request A: {after_a1['num_blocks']} (added {blocks_a1})"
        )

        if blocks_a1 == 0:
            self.results.append(
                RouterTestResult(
                    "mm_multi_image_partial",
                    False,
                    "FAIL - Request A added 0 blocks (should populate cache)",
                )
            )
            return

        # Step 2: Send Request A again (identical) - should be full cache hit
        self.log(
            "Step 2: Sending Request A again (identical, expect full cache hit)..."
        )
        if not send_request(self.client, self.config.api_url, payload_a):
            self.results.append(
                RouterTestResult(
                    "mm_multi_image_partial", False, "Request A (repeat) failed"
                )
            )
            return

        time.sleep(settle_time)

        after_a2 = get_tree_info(self.client, self.config.router_url)
        blocks_a2 = after_a2["num_blocks"] - after_a1["num_blocks"]
        self.log(
            f"Blocks after Request A repeat: {after_a2['num_blocks']} (added {blocks_a2})"
        )

        # Identical request should add 0 new blocks (full cache hit)
        if blocks_a2 != 0:
            self.log(
                f"WARNING: Identical request added {blocks_a2} blocks (expected 0)"
            )

        # Step 3: Send Request B with different second image
        payload_b = make_multi_image_request(
            "Describe these images in detail", [TEST_IMAGE_1, TEST_IMAGE_3]
        )

        self.log(
            "Step 3: Sending Request B (text + Image_1 + Image_3, different 2nd image)..."
        )
        if not send_request(self.client, self.config.api_url, payload_b):
            self.results.append(
                RouterTestResult("mm_multi_image_partial", False, "Request B failed")
            )
            return

        time.sleep(settle_time)

        after_b = get_tree_info(self.client, self.config.router_url)
        blocks_b = after_b["num_blocks"] - after_a2["num_blocks"]
        self.log(f"Blocks after Request B: {after_b['num_blocks']} (added {blocks_b})")

        # Analysis:
        # - If blocks_b > 0: Image_3 created new blocks (correct - different image)
        # - If blocks_b == 0: Full cache hit (wrong - Image_3 should be different)
        #
        # Note: We can't easily verify partial match vs full cache miss because
        # the tree growth depends on whether routing hit the cached worker.
        # What we CAN verify is that different images should NOT fully cache hit.

        if blocks_b == 0 and blocks_a2 == 0:
            # Both identical and different requests added 0 blocks
            # This suggests Image_3's mm_hash is incorrectly matching Image_4
            self.results.append(
                RouterTestResult(
                    "mm_multi_image_partial",
                    False,
                    "FAIL - Request B (different image) added 0 blocks. "
                    "Image_3 should have different mm_hash than Image_4. "
                    "Check if mm_hash computation is correct.",
                )
            )
            return

        if blocks_b == 0:
            # Different image but 0 new blocks - might be timing or routing issue
            self.results.append(
                RouterTestResult(
                    "mm_multi_image_partial",
                    False,
                    "FAIL - Request B added 0 blocks. "
                    f"Identical request added {blocks_a2}. "
                    "This is unexpected - different images should not fully cache hit.",
                )
            )
            return

        # Success: different image added new blocks
        self.results.append(
            RouterTestResult(
                "mm_multi_image_partial",
                True,
                f"OK - Request A: {blocks_a1} blocks, A repeat: {blocks_a2}, "
                f"Request B (diff image): {blocks_b}. "
                "Different images correctly create distinct cache entries.",
            )
        )

    def _print_summary(self) -> bool:
        """Print test results summary.

        Returns True when every result collected so far passed.
        """
        print("\n" + "=" * 50)
        print("Results")
        print("=" * 50)

        for r in self.results:
            symbol = "[OK]" if r.passed else "[X]"
            print(f" {symbol} {r.name}: {r.message}")

        all_passed = all(r.passed for r in self.results)

        print("\n" + "-" * 50)
        if all_passed:
            print("All tests passed.")
            print("\nTo fully verify, check server logs for:")
            print(" - Full match: overlap > 0.5")
            print(" - Partial match: 0 < overlap < 0.5")
            print(" - No match: overlap = 0.000")
        else:
            print("Some tests failed. Check the messages above.")

        return all_passed

    def cleanup(self):
        """Close the underlying HTTP client."""
        self.client.close()


def main():
    """Parse command-line options, run the selected suite(s) and exit.

    The process exit status is 0 when the selected tests passed and 1
    otherwise. The HTTP client is closed in all cases.
    """
    parser = argparse.ArgumentParser(description="KV Router Test Suite")
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Show detailed logs"
    )
    parser.add_argument(
        "--api-url", default="http://localhost:8000", help="API server URL"
    )
    parser.add_argument(
        "--router-url", default="http://localhost:7000", help="Router URL"
    )
    parser.add_argument(
        "--mm-only",
        action="store_true",
        help="Run only multimodal local tests (no server needed)",
    )
    parser.add_argument(
        "--mm-server",
        action="store_true",
        help="Run multimodal server tests (requires VLM model)",
    )
    parser.add_argument(
        "--all", action="store_true", help="Run all tests including multimodal"
    )
    args = parser.parse_args()

    config = RouterTestConfig(api_url=args.api_url, router_url=args.router_url)

    tests = KvRouterTests(config, verbose=args.verbose)
    try:
        if args.mm_only:
            # Local MM tests only (no server)
            success = tests.run_mm_tests()
        elif args.mm_server:
            # MM server tests (requires VLM)
            success = tests.run_mm_server_tests()
        elif args.all:
            # Run all tests; later suites are skipped once one has failed
            success = tests.run_all()
            if success:
                success = tests.run_mm_tests()
            if success:
                success = tests.run_mm_server_tests()
        else:
            # Default: text-only tests
            success = tests.run_all()

        sys.exit(0 if success else 1)
    finally:
        # Runs even though sys.exit() raises SystemExit above.
        tests.cleanup()


if __name__ == "__main__":
    main()