test: Fixing kvbm tests/ adding concurrency test (#5474)

224f63f5 · Olga Andreeva · GitHub · 199d11f5 · 224f63f5 · 224f63f5
Unverified Commit 224f63f5 authored Jan 21, 2026 by Olga Andreeva Committed by GitHub Jan 21, 2026
10 changed files
--- a/lib/bindings/kvbm/src/block_manager/vllm/connector/worker.rs
+++ b/lib/bindings/kvbm/src/block_manager/vllm/connector/worker.rs
@@ -309,7 +309,7 @@ impl Worker for KvConnectorWorker {
        if self.layers_complete == self.kv_cache_layers.len() {
            let offloading_operations = std::mem::take(&mut self.offloading_operations);

-            tracing::info!(
+            tracing::trace!(
                iteration = self.iteration,
                num_operations = offloading_operations.len(),
                "All layers complete, enqueuing {} offload operations",

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -233,6 +233,7 @@ markers = [
    "router: marks tests for router component",
    "planner: marks tests for planner component",
    "kvbm: marks tests for KV behavior and model determinism",
+    "kvbm_concurrency: marks concurrency stress tests for KVBM (runs separately)",
    "model: model id used by a test or parameter",
    "custom_build: marks tests that require custom builds or special setup (e.g., MoE models)",
    "k8s: marks tests as requiring Kubernetes",

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -55,6 +55,7 @@ def pytest_configure(config):
        "planner: marks tests for planner component",
        "kvbm: marks tests for KV behavior and model determinism",
        "kvbm_v2: marks tests using KVBM V2",
+        "kvbm_concurrency: marks concurrency stress tests for KVBM (runs separately)",
        "model: model id used by a test or parameter",
        "custom_build: marks tests that require custom builds or special setup (e.g., MoE models)",
        "k8s: marks tests as requiring Kubernetes",

--- a/tests/dependencies/test_kvbm_imports.py
+++ b/tests/dependencies/test_kvbm_imports.py
@@ -3,11 +3,23 @@

 """Unit tests to verify KVBM package and wheels are properly installed."""

+import importlib.util
 import subprocess

 import pytest


+def _is_sglang_installed() -> bool:
+    """Return True when the ``sglang`` package can be located.
+
+    The lookup goes through ``importlib.util.find_spec``, so ``sglang`` itself
+    is not imported here. The result feeds the module-level ``pytestmark``
+    skip: KVBM is not available in sglang images.
+    """
+    return importlib.util.find_spec("sglang") is not None
+
+
+# Skip every test in this module when sglang is installed (KVBM is not available in sglang images)
+pytestmark = pytest.mark.skipif(
+    _is_sglang_installed(), reason="KVBM is not available in sglang images"
+)
+
+
 # Helper functions for KVBM verification
 def _check_kvbm_wheel_exists():
    """Helper to verify KVBM wheel file exists in expected location."""

--- a/tests/kvbm_integration/common.py
+++ b/tests/kvbm_integration/common.py
--- a/tests/kvbm_integration/es_prompt.txt
+++ b/tests/kvbm_integration/es_prompt.txt
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+Divus es un hombre alto, de cabello dividido entre blanco y negro. Viste un largo abrigo confeccionado con pieles sintéticas y proyecta una presencia elegante y autoritaria. Es un profesor sumamente estricto, exigente y perfeccionista. No tolera equivocaciones en las evaluaciones prácticas y toma con la máxima seriedad la formación de sus alumnos. Aunque su carácter puede resultar intimidante, es un docente comprometido que se involucra activamente en el avance académico de quienes demuestran potencial. Es reconocido por ofrecer tutorías personalizadas a los estudiantes destacados, asegurándose de que alcancen el nivel más alto. Valora por encima de todo la puntualidad, la precisión técnica y la claridad conceptual, y espera que sus alumnos mantengan la misma disciplina tanto dentro como fuera del aula.\n\nHistorial de Divus Crewel:\nRespetado dentro de la academia por su elevado porcentaje de aprobados en exámenes de nivel avanzado, Divus ha formado a numerosos estudiantes que luego brillaron en campos técnicos y estratégicos. Su método se caracteriza por ser riguroso y directo, priorizando la observación minuciosa, la corrección inmediata de fallos y la práctica estructurada hasta alcanzar la excelencia. Ha participado en distintos comités de evaluación y es conocido por diseñar pruebas exigentes que reproducen situaciones reales. Aunque rara vez ofrece halagos explícitos, registra cuidadosamente el progreso individual y adapta su enseñanza según las fortalezas y debilidades que detecta.\n\nEscenario del rol:\nDivus se preparaba para aplicar un examen práctico presencial. Tú y tu hermano mayor no estaban convencidos de abandonar su habitación, pues solían realizar las prácticas remotamente desde su dormitorio en Ignihyde, utilizando sistemas avanzados y simulaciones virtuales. 
Consideraban innecesario el desplazamiento, pero la normativa del curso exigía una evaluación presencial.\n\nMientras Divus distribuía el material del examen —carpetas, instrumentos de medición y hojas de instrucciones detalladas—, nota tu presencia entre los participantes. Se detiene por un instante, revisa la lista y se acerca con un interés profesional.\n\n”Oh… parece que tenemos a una estudiante que normalmente trabaja de forma remota participando en persona.”\n\nSu tono es firme, aunque teñido de curiosidad. Aún no estaba familiarizado contigo en persona, por lo que adopta una actitud objetiva y profesional. Al revisar tu expediente académico, observa tus calificaciones previas, los comentarios de otros profesores y los registros de prácticas exitosas. Su expresión cambia apenas al notar tu constancia y rapidez de aprendizaje.\n\nDivus cierra la carpeta con cuidado y adopta una postura más formal, evaluándote con atención.\n\n”Tus resultados anteriores son consistentes. No obstante, el objetivo de esta prueba es evaluar tu desempeño en un entorno controlado, con recursos limitados y bajo presión de tiempo. Aquí se miden no solo los conocimientos, sino también el criterio, el método y la capacidad de adaptación.”\n\nDa unos pasos hacia la mesa y señala el material dispuesto, asegurándose de que todo esté perfectamente alineado.\n\n”Lee las instrucciones con atención antes de comenzar. Cada procedimiento tiene una razón específica. No se aceptarán improvisaciones fuera de los parámetros establecidos.”\n\nHace una breve pausa, observando a los demás estudiantes, y luego vuelve a dirigir la mirada hacia ti.\n\n”Si completas la prueba con resultados satisfactorios, consideraré ofrecerte tutorías adicionales. No son un privilegio común; exigen constancia y disposición para aceptar correcciones directas.”\n\nEl ambiente se vuelve silencioso mientras Divus da la señal de inicio. 
Permanece atento, recorriendo el aula con pasos medidos, observando técnicas, anotando tiempos y registrando mentalmente las conductas de cada estudiante. En más de una ocasión, dirige una mirada analítica hacia tu estación de trabajo, evaluando tu método sin intervenir.\n\nEjemplo de interacción:\n\n{{char}}:\nObserva tus apuntes y dispositivos con atención.\n”Veo que aplicas un enfoque sistemático. Eso minimiza errores y mejora la eficiencia.”\n\n{{user}}:\n”Gracias, profesor.”\n\n{{char}}:\n”Mantén ese nivel de concentración. La constancia es tan valiosa como el conocimiento.”\n\n\nContinuación del escenario:\nMientras la evaluación progresa, Divus revisa algunos resultados preliminares. Anota observaciones en su cuaderno, marcando discretamente los puntos fuertes y las áreas a perfeccionar de cada alumno. Su mente opera de forma analítica, comparando métodos, tiempos de ejecución y decisiones técnicas. Solo interviene para recordar una regla general o ajustar el ritmo de la clase cuando lo considera pertinente.\n\nAl concluir el tiempo designado, Divus da la orden de finalizar. Recoge el material con precisión, revisando que todo esté completo y correctamente identificado. Luego se coloca al frente, cruzando los brazos mientras examina al grupo.\n\n”La evaluación ha terminado. Los resultados serán analizados con detenimiento. No esperen comentarios inmediatos; prefiero revisar cada procedimiento con calma.”\n\nSu mirada se detiene en ti brevemente, sin mostrar emoción, pero registrando tu desempeño.\n\n”Aquellos que demuestren constancia y criterio recibirán indicaciones adicionales en los próximos días. El aprendizaje no se detiene en una sola prueba.”\n\nDivus abandona el aula con paso firme, dejando un ambiente de reflexión y expectativa. Para él, cada examen es una herramienta para perfeccionar el talento, y ya planea los siguientes pasos del curso.\n\nNo escribas como {{user}} ni asumas su reacción o respuesta. 
Espera la respuesta de {{user}} antes de continuar.\n
\ No newline at end of file
--- a/tests/kvbm_integration/test_consolidator_router_e2e.py
+++ b/tests/kvbm_integration/test_consolidator_router_e2e.py
@@ -13,7 +13,6 @@ This test validates that:
 """

 import concurrent.futures
-import importlib.util
 import logging
 import os
 import re
@@ -27,22 +26,11 @@ import yaml
 from tests.kvbm_integration.common import ApiTester, check_logs_for_patterns
 from tests.utils.managed_process import ManagedProcess

-
 # Check if engines are available and build list of available engines
-# Use find_spec first (fast check), then verify import works (functional check)
-def _check_engine_available(module_name: str) -> bool:
-    """Check if an engine module is available and importable."""
-    if importlib.util.find_spec(module_name) is None:
-        return False
-    try:
-        importlib.import_module(module_name)
-        return True
-    except ImportError:
-        return False
-
+from .common import check_module_available

-HAS_VLLM = _check_engine_available("vllm")
-HAS_TRTLLM = _check_engine_available("tensorrt_llm")
+HAS_VLLM = check_module_available("vllm")
+HAS_TRTLLM = check_module_available("tensorrt_llm")

 # Build list of available engines for parameterization
 AVAILABLE_ENGINES = []

--- a/tests/kvbm_integration/test_cuda_graph.py
+++ b/tests/kvbm_integration/test_cuda_graph.py
@@ -11,6 +11,8 @@ when given the same inputs with fixed seed and temperature=0.
 The test uses comprehensive server warmup (sending all test prompts
 before validation) to avoid server initialization effects that could
 impact determinism measurements.
+
+This is a TensorRT-LLM-only test.
 """

 import logging
@@ -24,8 +26,13 @@ from tests.utils.engine_process import FRONTEND_PORT
 from tests.utils.managed_process import DynamoFrontendProcess, ManagedProcess
 from tests.utils.payloads import check_models_api

+from .common import check_module_available
+
 logger = logging.getLogger(__name__)

+
+HAS_TRTLLM = check_module_available("tensorrt_llm")
+
 # Just need a model to show the config works rather than any stress of the system.
 MODEL_PATH = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 SERVED_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
@@ -151,6 +158,7 @@ def send_completion_request(
 @pytest.mark.nightly
 @pytest.mark.slow
 @pytest.mark.gpu_1
+@pytest.mark.skipif(not HAS_TRTLLM, reason="requires tensorrt_llm")
 def test_kvbm_without_cuda_graph_enabled(request, runtime_services):
    """
    End-to-end test for TRTLLM worker with cuda_graph_config not defined and
@@ -187,6 +195,7 @@ def test_kvbm_without_cuda_graph_enabled(request, runtime_services):
 @pytest.mark.slow
 @pytest.mark.nightly
 @pytest.mark.gpu_1
+@pytest.mark.skipif(not HAS_TRTLLM, reason="requires tensorrt_llm")
 def test_kvbm_with_cuda_graph_enabled(request, runtime_services):
    """
    End-to-end test for TRTLLM worker with cuda_graph_config defined and

--- a/tests/kvbm_integration/test_determinism_agg.py
+++ b/tests/kvbm_integration/test_determinism_agg.py
@@ -14,27 +14,30 @@ The expected results should be 100% match between the two cases. Compared to
 disaggregated mode, aggregated mode has less randomness chances.
 """

-import importlib.util
 import logging
 import os
 import signal
+import socket
 import subprocess
+import sys
+import threading
 import time
-from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Dict, Optional, TextIO
+from typing import Any, Dict, List, Optional, TextIO

 import pytest
 import requests

 from .common import DeterminismTester, ServerType
 from .common import TestDeterminism as BaseTestDeterminism
+from .common import check_module_available
+
+HAS_VLLM_BENCH = check_module_available("vllm")

 # Test markers to align with repository conventions
 # Todo: enable the rest when kvbm is built in the ci
 pytestmark = [
-    pytest.mark.kvbm,
    pytest.mark.e2e,
    pytest.mark.slow,
    pytest.mark.gpu_1,
@@ -42,6 +45,16 @@ pytestmark = [
 ]


+def _find_free_port() -> int:
+    """Find a free port by binding to port 0 and letting the OS assign one.
+
+    Returns:
+        The port number the OS assigned to the temporary socket.
+
+    Note:
+        The socket is closed when the ``with`` block exits, before the port is
+        returned, so nothing keeps the port reserved afterwards; another
+        process could claim it before the caller binds to it.
+    """

+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        # Empty host = all IPv4 interfaces; port 0 asks the OS to pick a port.
+        s.bind(("", 0))
+        s.listen(1)
+        port = s.getsockname()[1]
+    return port
+
+
 class LLMServerManager:
    """Manages LLM server lifecycle for determinism testing."""

@@ -55,7 +68,13 @@ class LLMServerManager:
        server_type: Optional[str] = ServerType.vllm,
    ):
        self.server_type = server_type
-        self.port = port or int(os.environ.get("KVBM_SERVER_PORT", "8000"))
+        # Use provided port, env var, or find a free port to avoid conflicts
+        if port is not None:
+            self.port = port
+        elif os.environ.get("KVBM_SERVER_PORT"):
+            self.port = int(os.environ["KVBM_SERVER_PORT"])
+        else:
+            self.port = _find_free_port()
        self.base_url = base_url or f"http://localhost:{self.port}"
        self.process: Optional[subprocess.Popen] = None
        self.cpu_cache_blocks = cpu_cache_blocks
@@ -72,7 +91,7 @@ class LLMServerManager:
            self.log_dir / f"{self.server_type}_server_{config_str}_{timestamp}.log"
        )
        self.server_stdout_file: Optional[TextIO] = None
-        self.server_stderr_file: Optional[TextIO] = None
+        self._tee_threads: List[threading.Thread] = []

        # Environment for the process
        self.env = os.environ.copy()
@@ -82,6 +101,12 @@ class LLMServerManager:
                # DynamoConnector connection settings
                "NATS_SERVER": "nats://localhost:4222",
                "ETCD_ENDPOINTS": "http://localhost:2379",
+                # Enable KVBM metrics for monitoring offload/onboard
+                "DYN_KVBM_METRICS": "true",
+                "DYN_KVBM_METRICS_PORT": "6880",
+                # Enable vLLM batch invariant for deterministic batching
+                "VLLM_BATCH_INVARIANT": "1",
+                "VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
            }
        )

@@ -164,40 +189,69 @@ class LLMServerManager:
        with open(config_path, "w") as f:
            yaml.dump(llm_api_config, f, default_flow_style=False, sort_keys=False)

+    def _tee_output(self, pipe: Any, log_file: TextIO, prefix: str) -> None:
+        """Read from pipe and write to both log file and stdout (tee).
+
+        Args:
+            pipe: Text stream read line by line until it returns an empty
+                string; it is closed before this method returns.
+            log_file: Open text file that receives every line unmodified and
+                is flushed after each write.
+            prefix: Label printed in square brackets in front of each line
+                echoed to stdout.
+        """
+        try:
+            # iter() with a "" sentinel ends the loop when readline() returns "".
+            for line in iter(pipe.readline, ""):
+                # Defensive only: the sentinel above already stops on an empty read.
+                if not line:
+                    break
+                # Write to log file
+                log_file.write(line)
+                log_file.flush()
+                # Write to stdout with prefix
+                sys.stdout.write(f"[{prefix}] {line}")
+                sys.stdout.flush()
+        # The writes to log_file and stdout are inside the try as well, so an
+        # error from a closed log file is swallowed here too, not only pipe errors.
+        except (ValueError, OSError):
+            pass  # Pipe closed
+        finally:
+            pipe.close()
+
    def start_server(self, timeout: int = 300) -> bool:
        """Start LLM server and wait for readiness."""
        if self.is_server_running():
            self.stop_server()
            time.sleep(2)

-        # Open log files
-        self.server_stdout_file = open(
-            self.server_log_file.with_suffix(".stdout.log"), "w"
-        )
-        self.server_stderr_file = open(
-            self.server_log_file.with_suffix(".stderr.log"), "w"
-        )
-        if self.server_stdout_file is not None:
-            self.server_stdout_file.write(
-                f"=== {self.server_type} Server Started at {datetime.now()} ===\nCommand: {' '.join(self.server_cmd)}\n"
-            )
-            self.server_stdout_file.flush()
+        # Open log file (combined stdout+stderr)
+        self.server_stdout_file = open(self.server_log_file.with_suffix(".log"), "w")
+
+        # Write header
+        header = f"=== {self.server_type} Server Started at {datetime.now()} ===\nCommand: {' '.join(self.server_cmd)}\n"
+        self.server_stdout_file.write(header)
+        self.server_stdout_file.flush()
+        print(f"[{self.server_type}] {header}", end="")

-        # Launch
+        # Launch with pipe, redirect stderr to stdout
        self.process = subprocess.Popen(
            self.server_cmd,
-            stdout=self.server_stdout_file,
-            stderr=self.server_stderr_file,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,  # Redirect stderr to stdout
            env=self.env,
            preexec_fn=os.setsid,
+            text=True,
+            bufsize=1,  # Line buffered
        )

+        # Start tee thread for combined output
+        self._tee_threads = [
+            threading.Thread(
+                target=self._tee_output,
+                args=(self.process.stdout, self.server_stdout_file, self.server_type),
+                daemon=True,
+            ),
+        ]
+        for t in self._tee_threads:
+            t.start()
+
        # Wait for health
        start_time = time.time()
        while time.time() - start_time < timeout:
            if self.is_server_running():
                return True
            if self.process.poll() is not None:
+                # Process exited, wait for tee thread to finish
+                for t in self._tee_threads:
+                    t.join(timeout=2)
                self._close_log_files()
                return False
            time.sleep(5)
@@ -220,6 +274,10 @@ class LLMServerManager:
                pass
            finally:
                self.process = None
+        # Wait for tee threads to finish
+        for t in self._tee_threads:
+            t.join(timeout=2)
+        self._tee_threads = []
        self._close_log_files()

    def _close_log_files(self):
@@ -229,9 +287,6 @@ class LLMServerManager:
            )
            self.server_stdout_file.close()
            self.server_stdout_file = None
-        if self.server_stderr_file:
-            self.server_stderr_file.close()
-            self.server_stderr_file = None

    def is_server_running(self) -> bool:
        try:
@@ -318,9 +373,9 @@ def llm_server(request, runtime_services):
    # Put logs in the per-test directory set up by tests/conftest.py
    log_dir = Path(request.node.name)

-    if importlib.util.find_spec("vllm") is not None:
+    if check_module_available("vllm"):
        server_type = ServerType.vllm
-    elif importlib.util.find_spec("tensorrt_llm") is not None:
+    elif check_module_available("tensorrt_llm"):
        server_type = ServerType.trtllm
    else:
        raise Exception(
@@ -363,10 +418,14 @@ class TestDeterminismAgg(BaseTestDeterminism):
    @pytest.mark.parametrize(
        "llm_server",
        [
-            {"cpu_blocks": int(os.environ.get("KVBM_CPU_BLOCKS", "10000"))},
+            {
+                "cpu_blocks": int(os.environ.get("KVBM_CPU_BLOCKS", "10000")),
+                "gpu_blocks": int(os.environ.get("KVBM_GPU_BLOCKS", "2048")),
+            },
        ],
        indirect=True,
    )
+    @pytest.mark.kvbm
    def test_determinism_agg_with_cache_reset(
        self, tester, llm_server, runtime_services
    ):
@@ -379,197 +438,38 @@ class TestDeterminismAgg(BaseTestDeterminism):
    @pytest.mark.parametrize(
        "llm_server",
        [
-            {"cpu_blocks": int(os.environ.get("KVBM_CPU_BLOCKS", "20000"))},
+            {
+                "cpu_blocks": int(os.environ.get("KVBM_CPU_BLOCKS", "30000")),
+                "gpu_blocks": int(os.environ.get("KVBM_GPU_BLOCKS", "2048")),
+            },
        ],
        indirect=True,
    )
-    @pytest.mark.parametrize(
-        "num_concurrent",
-        [int(x) for x in os.environ.get("KVBM_CONCURRENT_REQUESTS", "3").split(",")],
+    @pytest.mark.kvbm_concurrency
+    @pytest.mark.skipif(
+        not HAS_VLLM_BENCH, reason="requires vllm bench (vllm module not found)"
    )
-    @pytest.mark.parametrize(
-        "max_tokens",
-        [int(os.environ.get("KVBM_MAX_TOKENS", "48"))],
+    @pytest.mark.xfail(
+        reason="Known issue, fixed in PR: https://github.com/ai-dynamo/dynamo/pull/5475",
+        run=True,
    )
-    @pytest.mark.parametrize(
-        "num_prompts",
-        [int(x) for x in os.environ.get("KVBM_IFEVAL_PROMPTS", "120").split(",")],
-    )
-    @pytest.mark.skip(reason="Flaky test: DIS-665")
-    def test_concurrent_determinism_with_ifeval(
-        self,
-        tester,
-        llm_server,
-        runtime_services,
-        num_concurrent,
-        max_tokens,
-        num_prompts,
+    def test_concurrent_determinism_under_load(
+        self, tester, llm_server, runtime_services
    ):
-        """Simple concurrent determinism test: send IFEval prompts concurrently, with cache reset."""
-        print("\n" + "=" * 70)
-        print("CONCURRENT DETERMINISM TEST WITH IFEVAL")
-        print("=" * 70)
+        """Test Spanish prompt determinism under high concurrency load.

-        print(f"Using max_tokens={max_tokens} (from KVBM_MAX_TOKENS)")
+        Reproduces the bug where Spanish responses become English or corrupted.
+        """
+        # Get the Spanish prompt path relative to this test file
+        spanish_prompt_path = Path(
+            os.path.join(os.path.dirname(__file__), "es_prompt.txt")
+        ).absolute()

-        # Configuration comes from parametrize
-        print(
-            f"Configuration: {num_concurrent} concurrent requests, {max_tokens} max tokens"
-        )
-
-        # Load IFEval prompts
-        ifeval_prompts = tester.download_ifeval_dataset()
-        if not ifeval_prompts:
-            pytest.skip("IFEval dataset not available")
-
-        # Use parametrized number of IFEval prompts
-        test_prompts = ifeval_prompts[:num_prompts]
-        print(
-            f"Using {len(test_prompts)} IFEval prompts for concurrent testing (parametrized: {num_prompts})"
-        )
-        print(f"Concurrency level: {num_concurrent} simultaneous requests")
-
-        # Show sample prompts
-        print("\nSample prompts:")
-        for i, prompt in enumerate(test_prompts[:3]):
-            print(f"  {i+1}. {prompt[:80]}{'...' if len(prompt) > 80 else ''}")
-        if len(test_prompts) > 3:
-            print(f"  ... and {len(test_prompts) - 3} more")
-
-        def run_concurrent_test(phase_name, do_warmup=False):
-            """Run one phase of concurrent testing."""
-            print(f"\n=== {phase_name} ===")
-
-            if do_warmup:
-                # KV Cache warmup - send ALL test prompts to compute KV caches
-                print(
-                    f"Warming up KV caches with all {len(test_prompts)} test prompts..."
-                )
-                warmup_failed = 0
-
-                for i, prompt in enumerate(test_prompts):
-                    if (
-                        i % 5 == 0 or i == len(test_prompts) - 1
-                    ):  # Progress every 5 prompts
-                        print(f"  Warmup progress: {i+1}/{len(test_prompts)}")
-
-                    try:
-                        tester.make_request(prompt)
-                    except Exception as e:
-                        warmup_failed += 1
-                        if warmup_failed <= 3:  # Show first few failures
-                            print(f"    Warmup failed for prompt {i}: {e}")
-
-                if warmup_failed > 0:
-                    print(
-                        f"Warmup completed with {warmup_failed} failures out of {len(test_prompts)} prompts"
-                    )
-                else:
-                    print(
-                        f"Warmup completed successfully - all {len(test_prompts)} KV caches computed"
-                    )
-
-                # Wait for 10 seconds to make sure all transfers are complete
-                time.sleep(10)
-            else:
-                print("Skipping warmup (already done in previous phase)")
-
-            # Run concurrent requests
-            print(
-                f"Sending {len(test_prompts)} requests with {num_concurrent} max concurrent..."
-            )
-            start_time = time.time()
-
-            def make_request_wrapper(prompt_and_idx):
-                idx, prompt = prompt_and_idx
-                try:
-                    response = tester.make_request(prompt)
-                    return {
-                        "idx": idx,
-                        "prompt": prompt,
-                        "response": response,
-                        "success": True,
-                    }
-                except Exception as e:
-                    return {
-                        "idx": idx,
-                        "prompt": prompt,
-                        "error": str(e),
-                        "success": False,
-                    }
-
-            # Execute all requests concurrently
-            with ThreadPoolExecutor(max_workers=num_concurrent) as executor:
-                results = list(
-                    executor.map(make_request_wrapper, enumerate(test_prompts))
-                )
-
-            elapsed = time.time() - start_time
-            successful = [r for r in results if r["success"]]
-            failed = [r for r in results if not r["success"]]
-
-            print(
-                f"Completed in {elapsed:.2f}s - Success: {len(successful)}, Failed: {len(failed)}"
-            )
-
-            if failed:
-                for fail in failed[:3]:  # Show first few failures
-                    print(f"  Failed: {fail['error']}")
-
-            return successful
-
-        # Phase 1: Before cache reset
-        results_before = run_concurrent_test(
-            "PHASE 1: BEFORE CACHE RESET", do_warmup=True
+        # Call the base class implementation
+        super().base_test_spanish_prompt_determinism_under_load(
+            tester, llm_server, runtime_services, spanish_prompt_path
        )

-        # Reset cache
-        print("\n" + "=" * 50)
-        print("RESETTING CACHE")
-        print("=" * 50)
-        tester.reset_prefix_cache()
-
-        # Phase 2: After cache reset
-        results_after = run_concurrent_test("PHASE 2: AFTER CACHE RESET")
-
-        # Compare results between phases
-        print("\n" + "=" * 70)
-        print("DETERMINISM ANALYSIS")
-        print("=" * 70)
-
-        # Create lookup for before results
-        before_responses = {r["idx"]: r["response"] for r in results_before}
-        after_responses = {r["idx"]: r["response"] for r in results_after}
-
-        deterministic_count = 0
-        total_compared = 0
-
-        for idx in before_responses:
-            if idx in after_responses:
-                total_compared += 1
-                before_resp = before_responses[idx]
-                after_resp = after_responses[idx]
-
-                if before_resp == after_resp:
-                    deterministic_count += 1
-                    print(f"   Prompt {idx}: DETERMINISTIC")
-                else:
-                    print(f"   Prompt {idx}: NON-DETERMINISTIC")
-                    print(f"     Before: {before_resp}")
-                    print(f"     After:  {after_resp}")
-
-        # Final assessment
-        success_rate = deterministic_count / total_compared if total_compared > 0 else 0
-        print("\n=== FINAL RESULT ===")
-        print(f"Prompts compared: {total_compared}")
-        print(f"Deterministic: {deterministic_count}")
-        print(f"Success rate: {success_rate:.1%}")
-        print(f"Concurrent requests: {num_concurrent}")
-
-        assert (
-            success_rate == 1.0
-        ), f"Determinism failed: {deterministic_count}/{total_compared} prompts deterministic"
-

 if __name__ == "__main__":
    # Allow running as script

--- a/tests/kvbm_integration/test_determinism_disagg.py
+++ b/tests/kvbm_integration/test_determinism_disagg.py
@@ -15,7 +15,6 @@ Compared to aggregated mode, disaggregated mode has some known randomness.
 Example reference: https://github.com/vllm-project/vllm/issues/7779#issuecomment-2304967870
 """

-import importlib.util
 import logging
 import os
 import signal
@@ -32,6 +31,7 @@ import yaml

 from .common import DeterminismTester, ServerType
 from .common import TestDeterminism as BaseTestDeterminism
+from .common import check_module_available

 # Test markers to align with repository conventions
 # Todo: enable the rest when kvbm is built in the ci
@@ -507,9 +507,9 @@ def llm_server(request, runtime_services):
    # Put logs in the per-test directory set up by tests/conftest.py
    log_dir = Path(request.node.name)

-    if importlib.util.find_spec("vllm") is not None:
+    if check_module_available("vllm"):
        server_type = ServerType.vllm
-    elif importlib.util.find_spec("tensorrt_llm") is not None:
+    elif check_module_available("tensorrt_llm"):
        server_type = ServerType.trtllm
    else:
        pytest.skip("vllm module is not available in the current environment.")