test: pytests able to run locally now (#7219)

Signed-off-by: Graham King <grahamk@nvidia.com>

test: pytests able to run locally now (#7219)
Signed-off-by: Graham King <grahamk@nvidia.com>
4bd6299b · Graham King · GitHub · 5d5fd243 · 4bd6299b · 4bd6299b
Unverified Commit 4bd6299b authored Mar 11, 2026 by Graham King Committed by GitHub Mar 11, 2026
12 changed files
--- a/container/deps/requirements.test.txt
+++ b/container/deps/requirements.test.txt
@@ -38,4 +38,5 @@ tabulate==0.9.0
 types-aiofiles>=24.1.0
 types-PyYAML==6.0.12.20250915
 types-requests==2.32.4.20250913
+types-tabulate>=0.9.0
 websocket-client==1.9.0
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -318,6 +318,12 @@ module = ["vllm.*"]
 follow_imports = "skip"
 ignore_missing_imports = true

+[[tool.mypy.overrides]]
+# Workaround (WAR) for a mypy 1.18.x crash with numpy 1.26.x stubs:
+# "Should never get here in normal mode, got TypeAlias:numpy.float64 instead of TypeInfo"
+module = ["numpy", "numpy.*"]
+follow_imports = "skip"
+
 [tool.sphinx]

 # extra-content-head

--- a/tests/basic/test_wheel_contents.py
+++ b/tests/basic/test_wheel_contents.py
@@ -26,6 +26,7 @@ def test_no_bundled_shared_libraries():
    except PackageNotFoundError:
        pytest.fail("ai-dynamo-runtime is not installed")

+    assert installed_files is not None, "ai-dynamo-runtime has no recorded files"
    bundled_libs = [
        str(f) for f in installed_files if ".libs/" in str(f) and ".so" in str(f)
    ]

--- a/tests/conftest.py
+++ b/tests/conftest.py
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

+import importlib.util
 import logging
 import os
 import shutil
@@ -50,6 +51,7 @@ def pytest_configure(config):
        "vllm: marks tests as requiring vllm",
        "trtllm: marks tests as requiring trtllm",
        "sglang: marks tests as requiring sglang",
+        "lmcache: mark tests as requiring lmcache",
        "multimodal: marks tests as multimodal (image/video) tests",
        "slow: marks tests as known to be slow",
        "h100: marks tests to run on H100",
@@ -282,11 +284,40 @@ def logger(request):
    logger.removeHandler(handler)


+def _item_has_marker(item, marker_name):
+    """Check if a test item has a marker, including module-level pytestmark."""
+    if item.get_closest_marker(marker_name):
+        return True
+    module = getattr(item, "module", None)
+    if module is not None:
+        marks = getattr(module, "pytestmark", [])
+        if not isinstance(marks, list):
+            marks = [marks]
+        if any(getattr(m, "name", "") == marker_name for m in marks):
+            return True
+    return False
+
+
 @pytest.hookimpl(trylast=True)
 def pytest_collection_modifyitems(config, items):
    """
    This function is called to modify the list of tests to run.
    """
+    # Auto-skip tests marked with a framework marker when the framework is not installed
+    framework_markers = {
+        "trtllm": "tensorrt_llm",
+        "vllm": "vllm",
+        "sglang": "sglang",
+        "kvbm": "kvbm",
+        "lmcache": "lmcache",
+    }
+    for marker_name, module_name in framework_markers.items():
+        if importlib.util.find_spec(module_name) is None:
+            skip = pytest.mark.skip(reason=f"{module_name} is not installed")
+            for item in items:
+                if _item_has_marker(item, marker_name):
+                    item.add_marker(skip)
+
    # Collect models via explicit pytest mark from final filtered items only
    models_to_download = set()
    for item in items:

--- a/tests/dependencies/test_kvbm_imports.py
+++ b/tests/dependencies/test_kvbm_imports.py
@@ -54,7 +54,7 @@ def _check_kvbm_imports():


 # Base tests (no framework markers) - run in main job with --framework none --enable-kvbm
-@pytest.mark.pre_merge
+@pytest.mark.post_merge
 @pytest.mark.gpu_0
 @pytest.mark.unit
 def test_kvbm_wheel_exists():
@@ -62,7 +62,7 @@ def test_kvbm_wheel_exists():
    _check_kvbm_wheel_exists()


-@pytest.mark.pre_merge
+@pytest.mark.post_merge
 @pytest.mark.gpu_0
 @pytest.mark.unit
 def test_kvbm_imports():
@@ -71,7 +71,7 @@ def test_kvbm_imports():


 # vLLM-specific tests - run in vLLM job (vLLM auto-enables KVBM)
-@pytest.mark.pre_merge
+@pytest.mark.post_merge
 @pytest.mark.vllm
 @pytest.mark.unit
 @pytest.mark.gpu_0
@@ -80,7 +80,7 @@ def test_kvbm_wheel_exists_vllm():
    _check_kvbm_wheel_exists()


-@pytest.mark.pre_merge
+@pytest.mark.post_merge
 @pytest.mark.vllm
 @pytest.mark.unit
 @pytest.mark.gpu_0
@@ -90,7 +90,7 @@ def test_kvbm_imports_vllm():


 # TRT-LLM-specific tests - run in TRT-LLM job (TRT-LLM auto-enables KVBM)
-@pytest.mark.pre_merge
+@pytest.mark.post_merge
 @pytest.mark.trtllm
 @pytest.mark.unit
 @pytest.mark.gpu_0
@@ -99,7 +99,7 @@ def test_kvbm_wheel_exists_trtllm():
    _check_kvbm_wheel_exists()


-@pytest.mark.pre_merge
+@pytest.mark.post_merge
 @pytest.mark.trtllm
 @pytest.mark.unit
 @pytest.mark.gpu_0

--- a/tests/dependencies/test_vllm_imports.py
+++ b/tests/dependencies/test_vllm_imports.py
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-"""Unit tests to sanity check that required dependencies can be imported."""
-
-import pytest
-
-
-@pytest.mark.vllm
-@pytest.mark.unit
-@pytest.mark.gpu_1
-@pytest.mark.pre_merge
-def test_import_deep_ep():
-    """Test that deep_ep module can be imported."""
-    try:
-        import deep_ep
-
-        assert deep_ep is not None
-    except ImportError as e:
-        pytest.fail(f"Failed to import deep_ep: {e}")
-
-
-@pytest.mark.vllm
-@pytest.mark.unit
-@pytest.mark.gpu_1
-@pytest.mark.pre_merge
-def test_import_pplx_kernels():
-    """Test that pplx_kernels module can be imported."""
-    try:
-        import pplx_kernels
-
-        assert pplx_kernels is not None
-    except ImportError as e:
-        pytest.fail(f"Failed to import pplx_kernels: {e}")
--- a/tests/fault_tolerance/deploy/scenarios.py
+++ b/tests/fault_tolerance/deploy/scenarios.py
@@ -20,7 +20,7 @@ import time
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
 from enum import Enum, auto
-from typing import TYPE_CHECKING, Dict, List, Optional, Pattern
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Pattern

 from typing_extensions import Required, TypedDict

@@ -568,7 +568,7 @@ class TerminateProcessFailure(Failure):
            f"Checking Frontend service health (after {service_name} pod restart)..."
        )

-        pod_ports = {}  # Temporary dict for port forward tracking
+        pod_ports: dict[str, Any] = {}  # Temporary dict for port forward tracking
        try:
            logger.info("Getting frontend pod and setting up port forward...")
            frontend_pod_name, local_port, frontend_pod = get_frontend_port(

--- a/tests/fault_tolerance/deploy/test_deployment.py
+++ b/tests/fault_tolerance/deploy/test_deployment.py
@@ -9,7 +9,7 @@ import re
 import signal
 from contextlib import contextmanager
 from multiprocessing.context import SpawnProcess
-from typing import Any
+from typing import Any, Optional

 import pytest

@@ -31,8 +31,8 @@ from tests.utils.test_output import resolve_test_output_path

 def get_model_from_deployment(
    deployment_spec: DeploymentSpec,
-    scenario: Scenario = None,
-    service_name: str = None,
+    scenario: Optional[Scenario] = None,
+    service_name: Optional[str] = None,
 ) -> str:
    """Get model name from deployment spec.

@@ -60,19 +60,22 @@ def get_model_from_deployment(
    # Get model from backend-specific worker (if scenario provided)
    if scenario:
        try:
+            model: Optional[str] = None
            if scenario.backend == "vllm":
-                return deployment_spec["VllmDecodeWorker"].model
+                model = deployment_spec["VllmDecodeWorker"].model
            elif scenario.backend == "sglang":
-                return deployment_spec["decode"].model
+                model = deployment_spec["decode"].model
            elif scenario.backend == "trtllm":
                # Determine deployment type from scenario deployment name
                if (
                    "agg" in deployment_spec.name
                    and "disagg" not in deployment_spec.name
                ):
-                    return deployment_spec["TRTLLMWorker"].model
+                    model = deployment_spec["TRTLLMWorker"].model
                else:
-                    return deployment_spec["TRTLLMDecodeWorker"].model
+                    model = deployment_spec["TRTLLMDecodeWorker"].model
+            if model:
+                return model
        except (KeyError, AttributeError) as e:
            logging.warning(
                f"Could not get model from backend-specific worker "
@@ -290,6 +293,8 @@ async def _inject_failures(
    return affected_pods


+# FIXME: The module-level globals below might not work when tests run in parallel.
+
 global_result_list = []
 # Global storage for test results (used by validation fixture)
 test_results_cache = {}
@@ -489,6 +494,7 @@ def results_summary():
 @pytest.mark.post_merge
 @pytest.mark.e2e
 @pytest.mark.slow
+@pytest.mark.gpu_0
 @pytest.mark.filterwarnings("ignore::DeprecationWarning")
 async def test_fault_scenario(
    scenario: Scenario,  # noqa: F811

--- a/tests/frontend/grpc/echo_tensor_worker.py
+++ b/tests/frontend/grpc/echo_tensor_worker.py
@@ -46,6 +46,7 @@ async def echo_tensor_worker(runtime: DistributedRuntime):

    # Internally the bytes string will be converted to List of int
    retrieved_model_config = runtime_config.get_tensor_model_config()
+    assert retrieved_model_config is not None
    retrieved_model_config["triton_model_config"] = bytes(
        retrieved_model_config["triton_model_config"]
    )
@@ -63,7 +64,7 @@ async def echo_tensor_worker(runtime: DistributedRuntime):
    await endpoint.serve_endpoint(generate)


-async def generate(request, context):
+async def generate(request):
    """Echo tensors and parameters back to the client."""
    # [NOTE] gluo: currently there is no frontend side
    # validation between model config and actual request,

--- a/tests/frontend/grpc/triton_echo_client.py
+++ b/tests/frontend/grpc/triton_echo_client.py
@@ -86,7 +86,7 @@ class TritonEchoClient:

        class UserData:
            def __init__(self):
-                self._completed_requests = queue.Queue()
+                self._completed_requests: queue.Queue = queue.Queue()

        # Define the callback function. Note the last two parameters should be
        # result and error. InferenceServerClient would provide the results of an

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -114,13 +114,13 @@ vllm_configs = {
        directory=vllm_dir,
        script_name="agg_lmcache.sh",
        marks=[
+            pytest.mark.lmcache,
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.timeout(360),  # 3x estimated time (70s) + download time (150s)
            pytest.mark.skipif(
                _is_cuda13(),
                reason="lmcache does not support CUDA 13 as of v0.3.11",
-                strict=False,
            ),
        ],
        model="Qwen/Qwen3-0.6B",
@@ -136,13 +136,13 @@ vllm_configs = {
        directory=vllm_dir,
        script_name="agg_lmcache_multiproc.sh",
        marks=[
+            pytest.mark.lmcache,
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.timeout(360),  # 3x estimated time (70s) + download time (150s)
            pytest.mark.skipif(
                _is_cuda13(),
                reason="lmcache does not support CUDA 13 as of v0.3.11",
-                strict=False,
            ),
        ],
        model="Qwen/Qwen3-0.6B",
@@ -317,7 +317,8 @@ vllm_configs = {
        name="multimodal_agg_frontend_decoding",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
+        # post_merge because this test needs real NIXL, not the stub
+        marks=[pytest.mark.gpu_1, pytest.mark.post_merge],
        model="Qwen/Qwen2-VL-2B-Instruct",
        # Pass --frontend-decoding to enable Rust frontend image decoding + NIXL RDMA transfer
        script_args=[
@@ -351,7 +352,7 @@ vllm_configs = {
        script_name="disagg_multimodal_epd.sh",
        marks=[
            pytest.mark.gpu_1,
-            pytest.mark.pre_merge,
+            pytest.mark.post_merge,
            pytest.mark.skip(reason="DYN-2265"),
        ],
        model="Qwen/Qwen3-VL-2B-Instruct",
@@ -388,7 +389,7 @@ vllm_configs = {
        name="multimodal_agg_qwen",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
+        marks=[pytest.mark.gpu_1, pytest.mark.post_merge],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
        delayed_start=0,

--- a/tests/utils/payload_builder.py
+++ b/tests/utils/payload_builder.py
@@ -198,7 +198,7 @@ def metric_payload_default(
    Returns:
        Backend-specific MetricsPayload subclass based on backend parameter
    """
-    common_args = {
+    common_args: dict[str, Any] = {
        "body": {},
        "repeat_count": repeat_count,
        "expected_log": expected_log or [],