Unverified Commit 224f63f5 authored by Olga Andreeva's avatar Olga Andreeva Committed by GitHub
Browse files

test: Fixing kvbm tests/ adding concurrency test (#5474)

parent 199d11f5
......@@ -309,7 +309,7 @@ impl Worker for KvConnectorWorker {
if self.layers_complete == self.kv_cache_layers.len() {
let offloading_operations = std::mem::take(&mut self.offloading_operations);
tracing::info!(
tracing::trace!(
iteration = self.iteration,
num_operations = offloading_operations.len(),
"All layers complete, enqueuing {} offload operations",
......
......@@ -233,6 +233,7 @@ markers = [
"router: marks tests for router component",
"planner: marks tests for planner component",
"kvbm: marks tests for KV behavior and model determinism",
"kvbm_concurrency: marks concurrency stress tests for KVBM (runs separately)",
"model: model id used by a test or parameter",
"custom_build: marks tests that require custom builds or special setup (e.g., MoE models)",
"k8s: marks tests as requiring Kubernetes",
......
......@@ -55,6 +55,7 @@ def pytest_configure(config):
"planner: marks tests for planner component",
"kvbm: marks tests for KV behavior and model determinism",
"kvbm_v2: marks tests using KVBM V2",
"kvbm_concurrency: marks concurrency stress tests for KVBM (runs separately)",
"model: model id used by a test or parameter",
"custom_build: marks tests that require custom builds or special setup (e.g., MoE models)",
"k8s: marks tests as requiring Kubernetes",
......
......@@ -3,11 +3,23 @@
"""Unit tests to verify KVBM package and wheels are properly installed."""
import importlib.util
import subprocess
import pytest
def _is_sglang_installed() -> bool:
    """Report whether the ``sglang`` package can be located.

    Only the import system's spec lookup is consulted; ``sglang`` itself is
    not imported. The result drives the module-level skip, since KVBM is not
    available in sglang images.
    """
    sglang_spec = importlib.util.find_spec("sglang")
    return sglang_spec is not None
# Skip all KVBM tests if running in sglang environment (sglang doesn't have KVBM)
pytestmark = pytest.mark.skipif(
_is_sglang_installed(), reason="KVBM is not available in sglang images"
)
# Helper functions for KVBM verification
def _check_kvbm_wheel_exists():
"""Helper to verify KVBM wheel file exists in expected location."""
......
This diff is collapsed.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
Divus es un hombre alto, de cabello dividido entre blanco y negro. Viste un largo abrigo confeccionado con pieles sintéticas y proyecta una presencia elegante y autoritaria. Es un profesor sumamente estricto, exigente y perfeccionista. No tolera equivocaciones en las evaluaciones prácticas y toma con la máxima seriedad la formación de sus alumnos. Aunque su carácter puede resultar intimidante, es un docente comprometido que se involucra activamente en el avance académico de quienes demuestran potencial. Es reconocido por ofrecer tutorías personalizadas a los estudiantes destacados, asegurándose de que alcancen el nivel más alto. Valora por encima de todo la puntualidad, la precisión técnica y la claridad conceptual, y espera que sus alumnos mantengan la misma disciplina tanto dentro como fuera del aula.\n\nHistorial de Divus Crewel:\nRespetado dentro de la academia por su elevado porcentaje de aprobados en exámenes de nivel avanzado, Divus ha formado a numerosos estudiantes que luego brillaron en campos técnicos y estratégicos. Su método se caracteriza por ser riguroso y directo, priorizando la observación minuciosa, la corrección inmediata de fallos y la práctica estructurada hasta alcanzar la excelencia. Ha participado en distintos comités de evaluación y es conocido por diseñar pruebas exigentes que reproducen situaciones reales. Aunque rara vez ofrece halagos explícitos, registra cuidadosamente el progreso individual y adapta su enseñanza según las fortalezas y debilidades que detecta.\n\nEscenario del rol:\nDivus se preparaba para aplicar un examen práctico presencial. Tú y tu hermano mayor no estaban convencidos de abandonar su habitación, pues solían realizar las prácticas remotamente desde su dormitorio en Ignihyde, utilizando sistemas avanzados y simulaciones virtuales. 
Consideraban innecesario el desplazamiento, pero la normativa del curso exigía una evaluación presencial.\n\nMientras Divus distribuía el material del examen —carpetas, instrumentos de medición y hojas de instrucciones detalladas—, nota tu presencia entre los participantes. Se detiene por un instante, revisa la lista y se acerca con un interés profesional.\n\n”Oh… parece que tenemos a una estudiante que normalmente trabaja de forma remota participando en persona.”\n\nSu tono es firme, aunque teñido de curiosidad. Aún no estaba familiarizado contigo en persona, por lo que adopta una actitud objetiva y profesional. Al revisar tu expediente académico, observa tus calificaciones previas, los comentarios de otros profesores y los registros de prácticas exitosas. Su expresión cambia apenas al notar tu constancia y rapidez de aprendizaje.\n\nDivus cierra la carpeta con cuidado y adopta una postura más formal, evaluándote con atención.\n\n”Tus resultados anteriores son consistentes. No obstante, el objetivo de esta prueba es evaluar tu desempeño en un entorno controlado, con recursos limitados y bajo presión de tiempo. Aquí se miden no solo los conocimientos, sino también el criterio, el método y la capacidad de adaptación.”\n\nDa unos pasos hacia la mesa y señala el material dispuesto, asegurándose de que todo esté perfectamente alineado.\n\n”Lee las instrucciones con atención antes de comenzar. Cada procedimiento tiene una razón específica. No se aceptarán improvisaciones fuera de los parámetros establecidos.”\n\nHace una breve pausa, observando a los demás estudiantes, y luego vuelve a dirigir la mirada hacia ti.\n\n”Si completas la prueba con resultados satisfactorios, consideraré ofrecerte tutorías adicionales. No son un privilegio común; exigen constancia y disposición para aceptar correcciones directas.”\n\nEl ambiente se vuelve silencioso mientras Divus da la señal de inicio. 
Permanece atento, recorriendo el aula con pasos medidos, observando técnicas, anotando tiempos y registrando mentalmente las conductas de cada estudiante. En más de una ocasión, dirige una mirada analítica hacia tu estación de trabajo, evaluando tu método sin intervenir.\n\nEjemplo de interacción:\n\n{{char}}:\nObserva tus apuntes y dispositivos con atención.\n”Veo que aplicas un enfoque sistemático. Eso minimiza errores y mejora la eficiencia.”\n\n{{user}}:\n”Gracias, profesor.”\n\n{{char}}:\n”Mantén ese nivel de concentración. La constancia es tan valiosa como el conocimiento.”\n\n\nContinuación del escenario:\nMientras la evaluación progresa, Divus revisa algunos resultados preliminares. Anota observaciones en su cuaderno, marcando discretamente los puntos fuertes y las áreas a perfeccionar de cada alumno. Su mente opera de forma analítica, comparando métodos, tiempos de ejecución y decisiones técnicas. Solo interviene para recordar una regla general o ajustar el ritmo de la clase cuando lo considera pertinente.\n\nAl concluir el tiempo designado, Divus da la orden de finalizar. Recoge el material con precisión, revisando que todo esté completo y correctamente identificado. Luego se coloca al frente, cruzando los brazos mientras examina al grupo.\n\n”La evaluación ha terminado. Los resultados serán analizados con detenimiento. No esperen comentarios inmediatos; prefiero revisar cada procedimiento con calma.”\n\nSu mirada se detiene en ti brevemente, sin mostrar emoción, pero registrando tu desempeño.\n\n”Aquellos que demuestren constancia y criterio recibirán indicaciones adicionales en los próximos días. El aprendizaje no se detiene en una sola prueba.”\n\nDivus abandona el aula con paso firme, dejando un ambiente de reflexión y expectativa. Para él, cada examen es una herramienta para perfeccionar el talento, y ya planea los siguientes pasos del curso.\n\nNo escribas como {{user}} ni asumas su reacción o respuesta. 
Espera la respuesta de {{user}} antes de continuar.\n
\ No newline at end of file
......@@ -13,7 +13,6 @@ This test validates that:
"""
import concurrent.futures
import importlib.util
import logging
import os
import re
......@@ -27,22 +26,11 @@ import yaml
from tests.kvbm_integration.common import ApiTester, check_logs_for_patterns
from tests.utils.managed_process import ManagedProcess
# Check if engines are available and build list of available engines
# Use find_spec first (fast check), then verify import works (functional check)
def _check_engine_available(module_name: str) -> bool:
    """Return True only if ``module_name`` is both discoverable and importable.

    The spec lookup is a fast existence check; the real import that follows
    confirms the module actually loads. An ``ImportError`` raised during that
    import is reported as "not available" rather than propagated.
    """
    located = importlib.util.find_spec(module_name)
    if located is None:
        return False

    try:
        importlib.import_module(module_name)
    except ImportError:
        return False
    else:
        return True
from .common import check_module_available
HAS_VLLM = _check_engine_available("vllm")
HAS_TRTLLM = _check_engine_available("tensorrt_llm")
HAS_VLLM = check_module_available("vllm")
HAS_TRTLLM = check_module_available("tensorrt_llm")
# Build list of available engines for parameterization
AVAILABLE_ENGINES = []
......
......@@ -11,6 +11,8 @@ when given the same inputs with fixed seed and temperature=0.
The test uses comprehensive server warmup (sending all test prompts
before validation) to avoid server initialization effects that could
impact determinism measurements.
This test applies to TensorRT-LLM only.
"""
import logging
......@@ -24,8 +26,13 @@ from tests.utils.engine_process import FRONTEND_PORT
from tests.utils.managed_process import DynamoFrontendProcess, ManagedProcess
from tests.utils.payloads import check_models_api
from .common import check_module_available
logger = logging.getLogger(__name__)
HAS_TRTLLM = check_module_available("tensorrt_llm")
# Just need a model to show the config works rather than any stress of the system.
MODEL_PATH = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
SERVED_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
......@@ -151,6 +158,7 @@ def send_completion_request(
@pytest.mark.nightly
@pytest.mark.slow
@pytest.mark.gpu_1
@pytest.mark.skipif(not HAS_TRTLLM, reason="requires tensorrt_llm")
def test_kvbm_without_cuda_graph_enabled(request, runtime_services):
"""
End-to-end test for TRTLLM worker with cuda_graph_config not defined and
......@@ -187,6 +195,7 @@ def test_kvbm_without_cuda_graph_enabled(request, runtime_services):
@pytest.mark.slow
@pytest.mark.nightly
@pytest.mark.gpu_1
@pytest.mark.skipif(not HAS_TRTLLM, reason="requires tensorrt_llm")
def test_kvbm_with_cuda_graph_enabled(request, runtime_services):
"""
End-to-end test for TRTLLM worker with cuda_graph_config defined and
......
......@@ -14,27 +14,30 @@ The expected results should be 100% match between the two cases. Compared to
disaggregated mode, aggregated mode has less randomness chances.
"""
import importlib.util
import logging
import os
import signal
import socket
import subprocess
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional, TextIO
from typing import Any, Dict, List, Optional, TextIO
import pytest
import requests
from .common import DeterminismTester, ServerType
from .common import TestDeterminism as BaseTestDeterminism
from .common import check_module_available
HAS_VLLM_BENCH = check_module_available("vllm")
# Test markers to align with repository conventions
# TODO: enable the rest once KVBM is built in CI.
pytestmark = [
pytest.mark.kvbm,
pytest.mark.e2e,
pytest.mark.slow,
pytest.mark.gpu_1,
......@@ -42,6 +45,16 @@ pytestmark = [
]
def _find_free_port() -> int:
    """Ask the OS for an unused TCP port and return its number.

    A temporary IPv4 stream socket is bound to port 0 so the OS picks the
    port; the socket is closed before returning, so the port is only known to
    have been free at the moment of the call.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        probe.bind(("", 0))
        probe.listen(1)
        # getsockname() yields (host, port) for AF_INET sockets.
        assigned_port = probe.getsockname()[1]
    finally:
        probe.close()
    return assigned_port
class LLMServerManager:
"""Manages LLM server lifecycle for determinism testing."""
......@@ -55,7 +68,13 @@ class LLMServerManager:
server_type: Optional[str] = ServerType.vllm,
):
self.server_type = server_type
self.port = port or int(os.environ.get("KVBM_SERVER_PORT", "8000"))
# Use provided port, env var, or find a free port to avoid conflicts
if port is not None:
self.port = port
elif os.environ.get("KVBM_SERVER_PORT"):
self.port = int(os.environ["KVBM_SERVER_PORT"])
else:
self.port = _find_free_port()
self.base_url = base_url or f"http://localhost:{self.port}"
self.process: Optional[subprocess.Popen] = None
self.cpu_cache_blocks = cpu_cache_blocks
......@@ -72,7 +91,7 @@ class LLMServerManager:
self.log_dir / f"{self.server_type}_server_{config_str}_{timestamp}.log"
)
self.server_stdout_file: Optional[TextIO] = None
self.server_stderr_file: Optional[TextIO] = None
self._tee_threads: List[threading.Thread] = []
# Environment for the process
self.env = os.environ.copy()
......@@ -82,6 +101,12 @@ class LLMServerManager:
# DynamoConnector connection settings
"NATS_SERVER": "nats://localhost:4222",
"ETCD_ENDPOINTS": "http://localhost:2379",
# Enable KVBM metrics for monitoring offload/onboard
"DYN_KVBM_METRICS": "true",
"DYN_KVBM_METRICS_PORT": "6880",
# Enable vLLM batch invariant for deterministic batching
"VLLM_BATCH_INVARIANT": "1",
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN",
}
)
......@@ -164,40 +189,69 @@ class LLMServerManager:
with open(config_path, "w") as f:
yaml.dump(llm_api_config, f, default_flow_style=False, sort_keys=False)
def _tee_output(self, pipe: Any, log_file: TextIO, prefix: str) -> None:
    """Mirror ``pipe`` line by line into ``log_file`` and onto stdout.

    Each line is written unchanged to ``log_file`` and echoed to
    ``sys.stdout`` as ``[prefix] line``; both sinks are flushed after every
    line. Reading stops at the first empty read. ``ValueError`` and
    ``OSError`` end the loop silently, and ``pipe`` is always closed on the
    way out.
    """
    try:
        while True:
            text = pipe.readline()
            if not text:
                # An empty read means end of stream.
                break
            # Log file gets the raw line.
            log_file.write(text)
            log_file.flush()
            # Console copy is tagged with the prefix.
            sys.stdout.write(f"[{prefix}] {text}")
            sys.stdout.flush()
    except (ValueError, OSError):
        # Pipe closed
        pass
    finally:
        pipe.close()
def start_server(self, timeout: int = 300) -> bool:
"""Start LLM server and wait for readiness."""
if self.is_server_running():
self.stop_server()
time.sleep(2)
# Open log files
self.server_stdout_file = open(
self.server_log_file.with_suffix(".stdout.log"), "w"
)
self.server_stderr_file = open(
self.server_log_file.with_suffix(".stderr.log"), "w"
)
if self.server_stdout_file is not None:
self.server_stdout_file.write(
f"=== {self.server_type} Server Started at {datetime.now()} ===\nCommand: {' '.join(self.server_cmd)}\n"
)
self.server_stdout_file.flush()
# Open log file (combined stdout+stderr)
self.server_stdout_file = open(self.server_log_file.with_suffix(".log"), "w")
# Write header
header = f"=== {self.server_type} Server Started at {datetime.now()} ===\nCommand: {' '.join(self.server_cmd)}\n"
self.server_stdout_file.write(header)
self.server_stdout_file.flush()
print(f"[{self.server_type}] {header}", end="")
# Launch
# Launch with pipe, redirect stderr to stdout
self.process = subprocess.Popen(
self.server_cmd,
stdout=self.server_stdout_file,
stderr=self.server_stderr_file,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT, # Redirect stderr to stdout
env=self.env,
preexec_fn=os.setsid,
text=True,
bufsize=1, # Line buffered
)
# Start tee thread for combined output
self._tee_threads = [
threading.Thread(
target=self._tee_output,
args=(self.process.stdout, self.server_stdout_file, self.server_type),
daemon=True,
),
]
for t in self._tee_threads:
t.start()
# Wait for health
start_time = time.time()
while time.time() - start_time < timeout:
if self.is_server_running():
return True
if self.process.poll() is not None:
# Process exited, wait for tee thread to finish
for t in self._tee_threads:
t.join(timeout=2)
self._close_log_files()
return False
time.sleep(5)
......@@ -220,6 +274,10 @@ class LLMServerManager:
pass
finally:
self.process = None
# Wait for tee threads to finish
for t in self._tee_threads:
t.join(timeout=2)
self._tee_threads = []
self._close_log_files()
def _close_log_files(self):
......@@ -229,9 +287,6 @@ class LLMServerManager:
)
self.server_stdout_file.close()
self.server_stdout_file = None
if self.server_stderr_file:
self.server_stderr_file.close()
self.server_stderr_file = None
def is_server_running(self) -> bool:
try:
......@@ -318,9 +373,9 @@ def llm_server(request, runtime_services):
# Put logs in the per-test directory set up by tests/conftest.py
log_dir = Path(request.node.name)
if importlib.util.find_spec("vllm") is not None:
if check_module_available("vllm"):
server_type = ServerType.vllm
elif importlib.util.find_spec("tensorrt_llm") is not None:
elif check_module_available("tensorrt_llm"):
server_type = ServerType.trtllm
else:
raise Exception(
......@@ -363,10 +418,14 @@ class TestDeterminismAgg(BaseTestDeterminism):
@pytest.mark.parametrize(
"llm_server",
[
{"cpu_blocks": int(os.environ.get("KVBM_CPU_BLOCKS", "10000"))},
{
"cpu_blocks": int(os.environ.get("KVBM_CPU_BLOCKS", "10000")),
"gpu_blocks": int(os.environ.get("KVBM_GPU_BLOCKS", "2048")),
},
],
indirect=True,
)
@pytest.mark.kvbm
def test_determinism_agg_with_cache_reset(
self, tester, llm_server, runtime_services
):
......@@ -379,197 +438,38 @@ class TestDeterminismAgg(BaseTestDeterminism):
@pytest.mark.parametrize(
"llm_server",
[
{"cpu_blocks": int(os.environ.get("KVBM_CPU_BLOCKS", "20000"))},
{
"cpu_blocks": int(os.environ.get("KVBM_CPU_BLOCKS", "30000")),
"gpu_blocks": int(os.environ.get("KVBM_GPU_BLOCKS", "2048")),
},
],
indirect=True,
)
@pytest.mark.parametrize(
"num_concurrent",
[int(x) for x in os.environ.get("KVBM_CONCURRENT_REQUESTS", "3").split(",")],
@pytest.mark.kvbm_concurrency
@pytest.mark.skipif(
not HAS_VLLM_BENCH, reason="requires vllm bench (vllm module not found)"
)
@pytest.mark.parametrize(
"max_tokens",
[int(os.environ.get("KVBM_MAX_TOKENS", "48"))],
@pytest.mark.xfail(
reason="Known issue, fixed in PR: https://github.com/ai-dynamo/dynamo/pull/5475",
run=True,
)
@pytest.mark.parametrize(
"num_prompts",
[int(x) for x in os.environ.get("KVBM_IFEVAL_PROMPTS", "120").split(",")],
)
@pytest.mark.skip(reason="Flaky test: DIS-665")
def test_concurrent_determinism_with_ifeval(
self,
tester,
llm_server,
runtime_services,
num_concurrent,
max_tokens,
num_prompts,
def test_concurrent_determinism_under_load(
self, tester, llm_server, runtime_services
):
"""Simple concurrent determinism test: send IFEval prompts concurrently, with cache reset."""
print("\n" + "=" * 70)
print("CONCURRENT DETERMINISM TEST WITH IFEVAL")
print("=" * 70)
"""Test Spanish prompt determinism under high concurrency load.
print(f"Using max_tokens={max_tokens} (from KVBM_MAX_TOKENS)")
Reproduces the bug where Spanish responses become English or corrupted.
"""
# Get the Spanish prompt path relative to this test file
spanish_prompt_path = Path(
os.path.join(os.path.dirname(__file__), "es_prompt.txt")
).absolute()
# Configuration comes from parametrize
print(
f"Configuration: {num_concurrent} concurrent requests, {max_tokens} max tokens"
)
# Load IFEval prompts
ifeval_prompts = tester.download_ifeval_dataset()
if not ifeval_prompts:
pytest.skip("IFEval dataset not available")
# Use parametrized number of IFEval prompts
test_prompts = ifeval_prompts[:num_prompts]
print(
f"Using {len(test_prompts)} IFEval prompts for concurrent testing (parametrized: {num_prompts})"
)
print(f"Concurrency level: {num_concurrent} simultaneous requests")
# Show sample prompts
print("\nSample prompts:")
for i, prompt in enumerate(test_prompts[:3]):
print(f" {i+1}. {prompt[:80]}{'...' if len(prompt) > 80 else ''}")
if len(test_prompts) > 3:
print(f" ... and {len(test_prompts) - 3} more")
def run_concurrent_test(phase_name, do_warmup=False):
"""Run one phase of concurrent testing."""
print(f"\n=== {phase_name} ===")
if do_warmup:
# KV Cache warmup - send ALL test prompts to compute KV caches
print(
f"Warming up KV caches with all {len(test_prompts)} test prompts..."
)
warmup_failed = 0
for i, prompt in enumerate(test_prompts):
if (
i % 5 == 0 or i == len(test_prompts) - 1
): # Progress every 5 prompts
print(f" Warmup progress: {i+1}/{len(test_prompts)}")
try:
tester.make_request(prompt)
except Exception as e:
warmup_failed += 1
if warmup_failed <= 3: # Show first few failures
print(f" Warmup failed for prompt {i}: {e}")
if warmup_failed > 0:
print(
f"Warmup completed with {warmup_failed} failures out of {len(test_prompts)} prompts"
)
else:
print(
f"Warmup completed successfully - all {len(test_prompts)} KV caches computed"
)
# Wait for 10 seconds to make sure all transfers are complete
time.sleep(10)
else:
print("Skipping warmup (already done in previous phase)")
# Run concurrent requests
print(
f"Sending {len(test_prompts)} requests with {num_concurrent} max concurrent..."
)
start_time = time.time()
def make_request_wrapper(prompt_and_idx):
idx, prompt = prompt_and_idx
try:
response = tester.make_request(prompt)
return {
"idx": idx,
"prompt": prompt,
"response": response,
"success": True,
}
except Exception as e:
return {
"idx": idx,
"prompt": prompt,
"error": str(e),
"success": False,
}
# Execute all requests concurrently
with ThreadPoolExecutor(max_workers=num_concurrent) as executor:
results = list(
executor.map(make_request_wrapper, enumerate(test_prompts))
)
elapsed = time.time() - start_time
successful = [r for r in results if r["success"]]
failed = [r for r in results if not r["success"]]
print(
f"Completed in {elapsed:.2f}s - Success: {len(successful)}, Failed: {len(failed)}"
)
if failed:
for fail in failed[:3]: # Show first few failures
print(f" Failed: {fail['error']}")
return successful
# Phase 1: Before cache reset
results_before = run_concurrent_test(
"PHASE 1: BEFORE CACHE RESET", do_warmup=True
# Call the base class implementation
super().base_test_spanish_prompt_determinism_under_load(
tester, llm_server, runtime_services, spanish_prompt_path
)
# Reset cache
print("\n" + "=" * 50)
print("RESETTING CACHE")
print("=" * 50)
tester.reset_prefix_cache()
# Phase 2: After cache reset
results_after = run_concurrent_test("PHASE 2: AFTER CACHE RESET")
# Compare results between phases
print("\n" + "=" * 70)
print("DETERMINISM ANALYSIS")
print("=" * 70)
# Create lookup for before results
before_responses = {r["idx"]: r["response"] for r in results_before}
after_responses = {r["idx"]: r["response"] for r in results_after}
deterministic_count = 0
total_compared = 0
for idx in before_responses:
if idx in after_responses:
total_compared += 1
before_resp = before_responses[idx]
after_resp = after_responses[idx]
if before_resp == after_resp:
deterministic_count += 1
print(f" Prompt {idx}: DETERMINISTIC")
else:
print(f" Prompt {idx}: NON-DETERMINISTIC")
print(f" Before: {before_resp}")
print(f" After: {after_resp}")
# Final assessment
success_rate = deterministic_count / total_compared if total_compared > 0 else 0
print("\n=== FINAL RESULT ===")
print(f"Prompts compared: {total_compared}")
print(f"Deterministic: {deterministic_count}")
print(f"Success rate: {success_rate:.1%}")
print(f"Concurrent requests: {num_concurrent}")
assert (
success_rate == 1.0
), f"Determinism failed: {deterministic_count}/{total_compared} prompts deterministic"
if __name__ == "__main__":
# Allow running as script
......
......@@ -15,7 +15,6 @@ Compared to aggregated mode, disaggregated mode has some known randomness.
Example reference: https://github.com/vllm-project/vllm/issues/7779#issuecomment-2304967870
"""
import importlib.util
import logging
import os
import signal
......@@ -32,6 +31,7 @@ import yaml
from .common import DeterminismTester, ServerType
from .common import TestDeterminism as BaseTestDeterminism
from .common import check_module_available
# Test markers to align with repository conventions
# TODO: enable the rest once KVBM is built in CI.
......@@ -507,9 +507,9 @@ def llm_server(request, runtime_services):
# Put logs in the per-test directory set up by tests/conftest.py
log_dir = Path(request.node.name)
if importlib.util.find_spec("vllm") is not None:
if check_module_available("vllm"):
server_type = ServerType.vllm
elif importlib.util.find_spec("tensorrt_llm") is not None:
elif check_module_available("tensorrt_llm"):
server_type = ServerType.trtllm
else:
pytest.skip("vllm module is not available in the current environment.")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment