feat: Add KV event consolidator for KVBM (vllm) and router integration (#3725)

Signed-off-by: krishung5 <krish@nvidia.com>

feat: Add KV event consolidator for KVBM (vllm) and router integration (#3725)
Signed-off-by: krishung5 <krish@nvidia.com>
Commit 18986010 · Author: Kris Hung · Committer: GitHub · Other hash shown on page: 95214e8b
Unverified commit 18986010, authored Oct 30, 2025 by Kris Hung; committed by GitHub on Oct 30, 2025.
3 changed files
--- a/lib/llm/src/block_manager/state/resources.rs
+++ b/lib/llm/src/block_manager/state/resources.rs
@@ -2,10 +2,11 @@
 // SPDX-License-Identifier: Apache-2.0
 use super::*;
+use crate::block_manager::events::DynamoEventManager;
 impl Resources {
    /// Create a new [`Resources`] instance
-    pub fn new(config: KvBlockManagerConfig) -> Result<Self> {
+    pub async fn new(config: KvBlockManagerConfig) -> Result<Self> {
        config
            .runtime
            .validate()
@@ -18,13 +19,34 @@ impl Resources {
        let global_registry = GlobalRegistry::default();
-        let event_manager = config
+        // Create event manager based on configuration:
-            .event_manager
+        // 1. If explicit event_manager provided, use it
-            .clone()
+        // 2. Else if consolidator_config provided, create DynamoEventManager with consolidator
-            .unwrap_or_else(|| NullEventManager::new());
+        // 3. Else use NullEventManager (no event reporting)
+        let event_manager = if let Some(ref event_mgr) = config.event_manager {
-        // Create a NIXL agent if NIXL is enabled and instantiate requested backends
+            tracing::info!("Using explicit event_manager from config");
-        // TODO: Build a map of NIXL backends to block pools/sets
+            event_mgr.clone()
+        } else if let Some(consolidator_config) = config.consolidator_config.clone() {
+            tracing::info!(
+                "Creating DynamoEventManager with kv event consolidator config: vllm={}, output={}",
+                consolidator_config.vllm_event_endpoint,
+                consolidator_config.consolidated_event_endpoint
+            );
+            // Create DynamoEventManager with consolidator config (async)
+            match DynamoEventManager::new_with_config(consolidator_config).await {
+                Ok(manager) => manager as Arc<dyn EventManager>,
+                Err(e) => {
+                    tracing::error!(
+                        "Failed to create DynamoEventManager with consolidator: {}, fallback to NullEventManager",
+                        e
+                    );
+                    NullEventManager::new()
+                }
+            }
+        } else {
+            tracing::info!("Using NullEventManager");
+            NullEventManager::new()
+        };
        let mut nixl_backends: HashMap<String, Arc<nixl_sys::Backend>> = HashMap::new();

--- a/tests/kvbm/common.py
+++ b/tests/kvbm/common.py
@@ -10,6 +10,7 @@ aggregated and disaggregated determinism tests.
 """
 import os
+import re
 import time
 from collections import defaultdict
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -21,21 +22,40 @@ import pytest
 import requests
-class ServerType(str, Enum):
+def check_logs_for_patterns(
-    vllm = "vllm"
+    log_path: Path, patterns: List[str], process_name: str
-    trtllm = "trtllm"
+) -> List[str]:
+    """Check log file for specific patterns (errors, warnings, etc.)."""
+    findings = []
+    if not log_path.exists():
+        return [f"{process_name} log file not found at {log_path}"]
+    try:
+        with open(log_path, "r") as f:
+            content = f.read()
+            for pattern in patterns:
+                matches = re.findall(pattern, content, re.IGNORECASE | re.MULTILINE)
+                if matches:
+                    # Limit to first 3 matches and truncate each to 200 chars
+                    for match in matches[:3]:
+                        match_str = match if isinstance(match, str) else str(match)
+                        findings.append(f"{process_name}: {match_str[:200]}")
+    except Exception as e:
+        findings.append(f"Error reading {process_name} log: {e}")
+    return findings
-class DeterminismTester:
-    """Test class for model determinism validation."""
+class ApiTester:
+    """Base class for making API requests to LLM endpoints."""
    def __init__(
        self,
        base_url: Optional[str] = None,
        model_id: Optional[str] = None,
-        server_type: Optional[str] = ServerType.vllm,
    ):
-        # Allow environment override for flexibility in CI/local runs
        self.base_url = (
            base_url or os.environ.get("DYNAMO_API_BASE_URL") or "http://localhost:8000"
        )
@@ -44,6 +64,87 @@ class DeterminismTester:
            or os.environ.get("KVBM_MODEL_ID")
            or "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
        )
+    def make_request(
+        self,
+        content: str,
+        max_tokens: Optional[int] = None,
+        temperature: float = 0.0,
+        seed: int = 42,
+        **kwargs,
+    ) -> str:
+        """Make API request and return completion text."""
+        payload = {
+            "model": self.model_id,
+            "messages": [
+                {"role": "user", "content": content},
+            ],
+            "stream": False,
+            "temperature": temperature,
+            "seed": seed,
+        }
+        # Add max_tokens with appropriate key based on kwargs or defaults
+        if max_tokens is not None:
+            payload["max_tokens"] = max_tokens
+        elif "max_completion_tokens" in kwargs:
+            payload["max_completion_tokens"] = kwargs.pop("max_completion_tokens")
+        else:
+            payload["max_completion_tokens"] = int(
+                os.environ.get("KVBM_MAX_TOKENS", "48")
+            )
+        # Add any additional kwargs
+        payload.update(kwargs)
+        response = requests.post(
+            f"{self.base_url}/v1/chat/completions",
+            headers={"Content-Type": "application/json"},
+            json=payload,
+            timeout=int(os.environ.get("KVBM_HTTP_TIMEOUT", "30")),
+        )
+        response.raise_for_status()
+        data = response.json()
+        return data["choices"][0]["message"]["content"]
+    def send_chat_request(
+        self,
+        messages: List[dict],
+        max_tokens: int = 50,
+        temperature: float = 0.0,
+        seed: int = 42,
+    ) -> dict:
+        """Send a chat request and return full response JSON."""
+        url = f"{self.base_url}/v1/chat/completions"
+        payload = {
+            "model": self.model_id,
+            "messages": messages,
+            "max_tokens": max_tokens,
+            "temperature": temperature,
+            "seed": seed,
+        }
+        response = requests.post(url, json=payload, timeout=30)
+        response.raise_for_status()
+        return response.json()
+class ServerType(str, Enum):
+    vllm = "vllm"
+    trtllm = "trtllm"
+class DeterminismTester(ApiTester):
+    """Test class for model determinism validation."""
+    def __init__(
+        self,
+        base_url: Optional[str] = None,
+        model_id: Optional[str] = None,
+        server_type: Optional[str] = ServerType.vllm,
+    ):
+        super().__init__(base_url, model_id)
        self.server_type = server_type
        self.shakespeare_file = Path("t8.shakespeare.txt")
@@ -100,30 +201,30 @@ class DeterminismTester:
            with open(self.shakespeare_file, "w", encoding="utf-8") as f:
                f.write(content)
-    def make_request(self, content: str) -> str:
+    # Inherited from ApiTester, but override to add top_p for determinism testing
-        """Make API request and return completion text."""
+    def make_request(
-        payload = {
+        self,
-            "model": self.model_id,
+        content: str,
-            "messages": [
+        max_tokens: Optional[int] = None,
-                {"role": "user", "content": content},
+        temperature: float = 0.0,
-            ],
+        seed: int = 42,
-            "stream": False,
+        **kwargs,
-            "max_completion_tokens": int(os.environ.get("KVBM_MAX_TOKENS", "48")),
+    ) -> str:
-            "temperature": 0,
+        """Make API request and return completion text with determinism settings."""
-            "top_p": 0.0001,
+        # Use determinism-specific defaults
-            "seed": int(os.environ.get("KVBM_SEED", "42")),
+        if max_tokens is None:
-        }
+            max_tokens = int(os.environ.get("KVBM_MAX_TOKENS", "48"))
+        if seed == 42:  # Default seed, use env override
-        response = requests.post(
+            seed = int(os.environ.get("KVBM_SEED", "42"))
-            f"{self.base_url}/v1/chat/completions",
-            headers={"Content-Type": "application/json"},
+        return super().make_request(
-            json=payload,
+            content,
-            timeout=int(os.environ.get("KVBM_HTTP_TIMEOUT", "30")),
+            max_tokens=max_tokens,
+            temperature=temperature,
+            seed=seed,
+            top_p=0.0001,  # For determinism
+            **kwargs,
        )
-        response.raise_for_status()
-        data = response.json()
-        return data["choices"][0]["message"]["content"]
    def warmup_server(self):
        """Perform comprehensive server warmup with all test prompts."""

--- a/tests/kvbm/test_consolidator_router_e2e.py
+++ b/tests/kvbm/test_consolidator_router_e2e.py