Unverified commit 18986010, authored by Kris Hung and committed by GitHub
Browse files

feat: Add KV event consolidator for KVBM (vllm) and router integration (#3725)


Signed-off-by: krishung5 <krish@nvidia.com>
parent 95214e8b
...@@ -2,10 +2,11 @@ ...@@ -2,10 +2,11 @@
// SPDX-License-Identifier: Apache-2.0 // SPDX-License-Identifier: Apache-2.0
use super::*; use super::*;
use crate::block_manager::events::DynamoEventManager;
impl Resources { impl Resources {
/// Create a new [`Resources`] instance /// Create a new [`Resources`] instance
pub fn new(config: KvBlockManagerConfig) -> Result<Self> { pub async fn new(config: KvBlockManagerConfig) -> Result<Self> {
config config
.runtime .runtime
.validate() .validate()
...@@ -18,13 +19,34 @@ impl Resources { ...@@ -18,13 +19,34 @@ impl Resources {
let global_registry = GlobalRegistry::default(); let global_registry = GlobalRegistry::default();
let event_manager = config // Create event manager based on configuration:
.event_manager // 1. If explicit event_manager provided, use it
.clone() // 2. Else if consolidator_config provided, create DynamoEventManager with consolidator
.unwrap_or_else(|| NullEventManager::new()); // 3. Else use NullEventManager (no event reporting)
let event_manager = if let Some(ref event_mgr) = config.event_manager {
// Create a NIXL agent if NIXL is enabled and instantiate requested backends tracing::info!("Using explicit event_manager from config");
// TODO: Build a map of NIXL backends to block pools/sets event_mgr.clone()
} else if let Some(consolidator_config) = config.consolidator_config.clone() {
tracing::info!(
"Creating DynamoEventManager with kv event consolidator config: vllm={}, output={}",
consolidator_config.vllm_event_endpoint,
consolidator_config.consolidated_event_endpoint
);
// Create DynamoEventManager with consolidator config (async)
match DynamoEventManager::new_with_config(consolidator_config).await {
Ok(manager) => manager as Arc<dyn EventManager>,
Err(e) => {
tracing::error!(
"Failed to create DynamoEventManager with consolidator: {}, fallback to NullEventManager",
e
);
NullEventManager::new()
}
}
} else {
tracing::info!("Using NullEventManager");
NullEventManager::new()
};
let mut nixl_backends: HashMap<String, Arc<nixl_sys::Backend>> = HashMap::new(); let mut nixl_backends: HashMap<String, Arc<nixl_sys::Backend>> = HashMap::new();
......
...@@ -10,6 +10,7 @@ aggregated and disaggregated determinism tests. ...@@ -10,6 +10,7 @@ aggregated and disaggregated determinism tests.
""" """
import os import os
import re
import time import time
from collections import defaultdict from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
...@@ -21,21 +22,40 @@ import pytest ...@@ -21,21 +22,40 @@ import pytest
import requests import requests
class ServerType(str, Enum): def check_logs_for_patterns(
vllm = "vllm" log_path: Path, patterns: List[str], process_name: str
trtllm = "trtllm" ) -> List[str]:
"""Check log file for specific patterns (errors, warnings, etc.)."""
findings = []
if not log_path.exists():
return [f"{process_name} log file not found at {log_path}"]
try:
with open(log_path, "r") as f:
content = f.read()
for pattern in patterns:
matches = re.findall(pattern, content, re.IGNORECASE | re.MULTILINE)
if matches:
# Limit to first 3 matches and truncate each to 200 chars
for match in matches[:3]:
match_str = match if isinstance(match, str) else str(match)
findings.append(f"{process_name}: {match_str[:200]}")
except Exception as e:
findings.append(f"Error reading {process_name} log: {e}")
return findings
class DeterminismTester:
"""Test class for model determinism validation.""" class ApiTester:
"""Base class for making API requests to LLM endpoints."""
def __init__( def __init__(
self, self,
base_url: Optional[str] = None, base_url: Optional[str] = None,
model_id: Optional[str] = None, model_id: Optional[str] = None,
server_type: Optional[str] = ServerType.vllm,
): ):
# Allow environment override for flexibility in CI/local runs
self.base_url = ( self.base_url = (
base_url or os.environ.get("DYNAMO_API_BASE_URL") or "http://localhost:8000" base_url or os.environ.get("DYNAMO_API_BASE_URL") or "http://localhost:8000"
) )
...@@ -44,6 +64,87 @@ class DeterminismTester: ...@@ -44,6 +64,87 @@ class DeterminismTester:
or os.environ.get("KVBM_MODEL_ID") or os.environ.get("KVBM_MODEL_ID")
or "deepseek-ai/DeepSeek-R1-Distill-Llama-8B" or "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
) )
def make_request(
    self,
    content: str,
    max_tokens: Optional[int] = None,
    temperature: float = 0.0,
    seed: int = 42,
    **kwargs,
) -> str:
    """Send one non-streaming, single-user-message chat completion and return its text.

    Args:
        content: Text placed in the single ``user`` message.
        max_tokens: When not ``None``, sent as the ``max_tokens`` field.
            Otherwise ``max_completion_tokens`` is sent instead, taken from
            ``kwargs`` if present there, else from the ``KVBM_MAX_TOKENS``
            environment variable (default ``"48"``).
        temperature: Sent as the ``temperature`` field.
        seed: Sent as the ``seed`` field.
        **kwargs: Extra request fields merged into the payload last, so they
            override any field of the same name set by this method.

    Returns:
        The ``content`` of the first choice's message in the JSON response.

    Raises:
        requests.HTTPError: Via ``raise_for_status()`` on an error response.
    """
    body = {
        "model": self.model_id,
        "messages": [
            {"role": "user", "content": content},
        ],
        "stream": False,
        "temperature": temperature,
        "seed": seed,
    }

    # Pick the token-limit field: an explicit max_tokens wins; otherwise fall
    # back to max_completion_tokens from the caller or from the environment.
    if max_tokens is not None:
        body["max_tokens"] = max_tokens
    else:
        if "max_completion_tokens" in kwargs:
            completion_limit = kwargs.pop("max_completion_tokens")
        else:
            completion_limit = int(os.environ.get("KVBM_MAX_TOKENS", "48"))
        body["max_completion_tokens"] = completion_limit

    # Whatever is left in kwargs is passed through to the server unchanged.
    body.update(kwargs)

    endpoint = f"{self.base_url}/v1/chat/completions"
    timeout_seconds = int(os.environ.get("KVBM_HTTP_TIMEOUT", "30"))
    reply = requests.post(
        endpoint,
        headers={"Content-Type": "application/json"},
        json=body,
        timeout=timeout_seconds,
    )
    reply.raise_for_status()

    first_choice = reply.json()["choices"][0]
    return first_choice["message"]["content"]
def send_chat_request(
    self,
    messages: List[dict],
    max_tokens: int = 50,
    temperature: float = 0.0,
    seed: int = 42,
) -> dict:
    """Send a chat completion request and return the full response JSON.

    Args:
        messages: Chat messages, sent as the ``messages`` field unchanged.
        max_tokens: Sent as the ``max_tokens`` field.
        temperature: Sent as the ``temperature`` field.
        seed: Sent as the ``seed`` field.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: Via ``raise_for_status()`` on an error response.
    """
    url = f"{self.base_url}/v1/chat/completions"
    payload = {
        "model": self.model_id,
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "seed": seed,
    }

    # Honour the same KVBM_HTTP_TIMEOUT override that make_request uses, so a
    # single setting controls every request; the default stays 30 seconds.
    timeout_seconds = int(os.environ.get("KVBM_HTTP_TIMEOUT", "30"))
    response = requests.post(url, json=payload, timeout=timeout_seconds)
    response.raise_for_status()
    return response.json()
# Enumerates the server types known to these tests. Because the class also
# derives from `str`, each member is a string equal to its value.
# `ServerType.vllm` is the default `server_type` argument of
# `DeterminismTester.__init__`.
class ServerType(str, Enum):
vllm = "vllm"
trtllm = "trtllm"
class DeterminismTester(ApiTester):
"""Test class for model determinism validation."""
def __init__(
self,
base_url: Optional[str] = None,
model_id: Optional[str] = None,
server_type: Optional[str] = ServerType.vllm,
):
super().__init__(base_url, model_id)
self.server_type = server_type self.server_type = server_type
self.shakespeare_file = Path("t8.shakespeare.txt") self.shakespeare_file = Path("t8.shakespeare.txt")
...@@ -100,30 +201,30 @@ class DeterminismTester: ...@@ -100,30 +201,30 @@ class DeterminismTester:
with open(self.shakespeare_file, "w", encoding="utf-8") as f: with open(self.shakespeare_file, "w", encoding="utf-8") as f:
f.write(content) f.write(content)
def make_request(self, content: str) -> str: # Inherited from ApiTester, but override to add top_p for determinism testing
"""Make API request and return completion text.""" def make_request(
payload = { self,
"model": self.model_id, content: str,
"messages": [ max_tokens: Optional[int] = None,
{"role": "user", "content": content}, temperature: float = 0.0,
], seed: int = 42,
"stream": False, **kwargs,
"max_completion_tokens": int(os.environ.get("KVBM_MAX_TOKENS", "48")), ) -> str:
"temperature": 0, """Make API request and return completion text with determinism settings."""
"top_p": 0.0001, # Use determinism-specific defaults
"seed": int(os.environ.get("KVBM_SEED", "42")), if max_tokens is None:
} max_tokens = int(os.environ.get("KVBM_MAX_TOKENS", "48"))
if seed == 42: # Default seed, use env override
response = requests.post( seed = int(os.environ.get("KVBM_SEED", "42"))
f"{self.base_url}/v1/chat/completions",
headers={"Content-Type": "application/json"}, return super().make_request(
json=payload, content,
timeout=int(os.environ.get("KVBM_HTTP_TIMEOUT", "30")), max_tokens=max_tokens,
temperature=temperature,
seed=seed,
top_p=0.0001, # For determinism
**kwargs,
) )
response.raise_for_status()
data = response.json()
return data["choices"][0]["message"]["content"]
def warmup_server(self): def warmup_server(self):
"""Perform comprehensive server warmup with all test prompts.""" """Perform comprehensive server warmup with all test prompts."""
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment