Unverified Commit e02321f8 authored by Richard Huo's avatar Richard Huo Committed by GitHub
Browse files

test: fix kvbm disagg determinism test on GB200 by setting the kv block size to 32 (#6980)

parent f934744f
...@@ -51,6 +51,16 @@ pytestmark = [ ...@@ -51,6 +51,16 @@ pytestmark = [
SUCCESS_RATE_THRESHOLD = 0.95 SUCCESS_RATE_THRESHOLD = 0.95
# TRT-LLM will crash when loading the deepseek-ai/DeepSeek-R1-Distill-Llama-8B model on GB200 with a KV block size of 16.
# As a workaround, use a block size of 32 on GB200.
def is_gb200() -> bool:
    """Return True when the first GPU reported by ``nvidia-smi`` is a GB200.

    Runs ``nvidia-smi --query-gpu=name --format=csv,noheader`` and checks
    whether the first reported GPU name contains ``GB200`` (case-insensitive).

    Returns:
        True if the first listed GPU name contains "GB200"; False if it does
        not, if ``nvidia-smi`` prints nothing, or if ``nvidia-smi`` cannot be
        run (binary missing, or it exits with a non-zero status).
    """
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
            text=True,
        )
    except (OSError, subprocess.CalledProcessError):
        # No usable nvidia-smi (not installed, no driver, or no visible GPU):
        # this cannot be a GB200 host, so fall back to the non-GB200 defaults
        # instead of crashing the caller.
        return False

    # One GPU name per line; only the first GPU is inspected.
    gpu_names = out.strip().splitlines()
    return bool(gpu_names) and "GB200" in gpu_names[0].upper()
class LLMServerManager: class LLMServerManager:
"""Manages LLM server lifecycle for determinism testing.""" """Manages LLM server lifecycle for determinism testing."""
...@@ -185,7 +195,7 @@ class LLMServerManager: ...@@ -185,7 +195,7 @@ class LLMServerManager:
"/tmp/kvbm_llm_api_decode_config.yaml", "/tmp/kvbm_llm_api_decode_config.yaml",
) )
KV_BLOCK_SIZE = 16 KV_BLOCK_SIZE = 32 if is_gb200() else 16
llm_api_config: Dict[str, Any] = {} llm_api_config: Dict[str, Any] = {}
llm_api_config["kv_cache_config"] = { llm_api_config["kv_cache_config"] = {
...@@ -205,7 +215,7 @@ class LLMServerManager: ...@@ -205,7 +215,7 @@ class LLMServerManager:
prefill_config["disable_overlap_scheduler"] = True prefill_config["disable_overlap_scheduler"] = True
prefill_config["cache_transceiver_config"] = { prefill_config["cache_transceiver_config"] = {
"backend": "DEFAULT", "backend": "DEFAULT",
"max_tokens_in_buffer": 16384, "max_tokens_in_buffer": 32768 if is_gb200() else 16384,
} }
prefill_config["cuda_graph_config"] = None prefill_config["cuda_graph_config"] = None
...@@ -227,7 +237,7 @@ class LLMServerManager: ...@@ -227,7 +237,7 @@ class LLMServerManager:
"--model", "--model",
model, model,
"--kv-block-size", "--kv-block-size",
"16", str(KV_BLOCK_SIZE),
"--max-num-tokens", "--max-num-tokens",
"8000", "8000",
] ]
...@@ -478,7 +488,9 @@ class DisaggDeterminismTester(DeterminismTester): ...@@ -478,7 +488,9 @@ class DisaggDeterminismTester(DeterminismTester):
"""Reset the prefix cache.""" """Reset the prefix cache."""
print("Resetting prefix cache...") print("Resetting prefix cache...")
# 150 shakespeare requests (each request is 200 words, and roughly 17 blocks) could evict 150 * 17 = 2550 blocks # 150 shakespeare requests (each request is 200 words, and roughly 17 blocks) could evict 150 * 17 = 2550 blocks
shakespeare_count = 150 # On GB200, the block size is 32 tokens, and each request uses roughly 6 blocks.
# 300 × 6 = 1800 blocks should be enough to evict the GPU cache (which holds 1000 blocks).
shakespeare_count = 300 if is_gb200() else 150
for seq_idx in range(1, shakespeare_count + 1): for seq_idx in range(1, shakespeare_count + 1):
start_word = (seq_idx - 1) * self.word_count start_word = (seq_idx - 1) * self.word_count
content = self.get_shakespeare_content(start_word) content = self.get_shakespeare_content(start_word)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment