feat: exclude k8 FT tests that need custom build from default run (#4017)

Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>

feat: exclude k8 FT tests that need custom build from default run (#4017)
Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>
20d1eb2e · Tzu-Ling Kan · GitHub · 8bd37c96 · 20d1eb2e · 20d1eb2e
Unverified Commit 20d1eb2e authored Oct 31, 2025 by Tzu-Ling Kan Committed by GitHub Oct 31, 2025
5 changed files
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -198,7 +198,8 @@ markers = [
    "slow: marks tests as known to be slow",
    "h100: marks tests to run on H100",
    "kvbm: marks tests for KV behavior and model determinism",
-    "model: model id used by a test or parameter"
+    "model: model id used by a test or parameter",
+    "custom_build: marks tests that require custom builds or special setup (e.g., MoE models)"
 ]
 # Linting/formatting

--- a/tests/fault_tolerance/deploy/README.md
+++ b/tests/fault_tolerance/deploy/README.md
@@ -141,12 +141,18 @@ The combined results of these two phases demonstrate both the system's ability t
 #### Example Scenario Execution:
-Run all deployments and failure scenarios
+Run standard deployments and failure scenarios (excludes custom builds by default):
 ```bash
 pytest tests/fault_tolerance/deploy/test_deployment.py -s -v --namespace ${NAMESPACE}
 ```
+To run all scenarios, including those that require custom builds (e.g., MoE models):
+```bash
+pytest tests/fault_tolerance/deploy/test_deployment.py -s -v --namespace ${NAMESPACE} --include-custom-build
+```
 ### Test Results Directory
 For each test scenario a directory of log files is created and post-processed to summarize the test. The directory structure differs based on which client type is used.
@@ -490,10 +496,54 @@ Then run the development container mounting the workspace and your kube config.
 ### Run the tests
+#### Default: Run Standard Tests Only
+By default, tests requiring custom builds (e.g., MoE models) are **automatically excluded**:
 ```bash
-pytest tests/fault_tolerance/deploy/test_deployment.py -s -v --namespace ${NAMESPACE} --image ${IMAGE}
+# Standard tests only
+pytest tests/fault_tolerance/deploy/test_deployment.py -s -v \
+  --namespace ${NAMESPACE} \
+  --image ${IMAGE}
 ```
+#### Include Custom Build Tests
+To run ALL tests including those requiring custom builds (e.g., MoE models):
+```bash
+pytest tests/fault_tolerance/deploy/test_deployment.py -s -v \
+  --namespace ${NAMESPACE} \
+  --image ${IMAGE} \
+  --include-custom-build
+```
+#### Run Only Custom Build Tests
+To run ONLY tests that require custom builds:
+```bash
+pytest tests/fault_tolerance/deploy/test_deployment.py -s -v \
+  --namespace ${NAMESPACE} \
+  --image ${IMAGE} \
+  -m "custom_build"
+```
+#### List Available Tests
+```bash
+# See which tests will run by default (excludes custom_build)
+pytest tests/fault_tolerance/deploy/test_deployment.py --collect-only -q
+# See which tests are excluded
+pytest tests/fault_tolerance/deploy/test_deployment.py --collect-only -m "custom_build" -q
+```
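+> **Note:** Tests requiring custom builds are marked with `@pytest.mark.custom_build` and include:
+> - MoE (Mixture-of-Experts) models like DeepSeek-V2-Lite
+> - Tests requiring special Docker image configurations
+> - Any scenario with `requires_custom_build=True` in scenarios.py
+>
+> The automatic exclusion is skipped whenever a `-m` marker expression is passed, so add `not custom_build` to the expression (e.g., `-m "slow and not custom_build"`) to keep these tests out of a marker-filtered run.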
 ### Note on Running with Additional Credentials

--- a/tests/fault_tolerance/deploy/conftest.py
+++ b/tests/fault_tolerance/deploy/conftest.py
@@ -15,6 +15,8 @@
 import pytest
+from tests.fault_tolerance.deploy.scenarios import scenarios
 def pytest_addoption(parser):
    parser.addoption("--image", type=str, default=None)
@@ -26,6 +28,71 @@ def pytest_addoption(parser):
        choices=["aiperf", "legacy"],
        help="Client type for load generation: 'aiperf' (default) or 'legacy'",
    )
+    parser.addoption(
+        "--include-custom-build",
+        action="store_true",
+        default=False,
+        help="Include tests that require custom builds (e.g., MoE models). "
+        "By default, these tests are excluded.",
+    )
+def pytest_generate_tests(metafunc):
+    """Dynamically parametrize tests and apply markers based on scenario properties.
+    This hook applies markers to individual test instances based on their scenario:
+    - @pytest.mark.custom_build: For MoE models and other tests requiring custom builds
+    """
+    if "scenario" in metafunc.fixturenames:
+        scenario_names = list(scenarios.keys())
+        argvalues = []
+        ids = []
+        for scenario_name in scenario_names:
+            scenario_obj = scenarios[scenario_name]
+            marks = []
+            if getattr(scenario_obj, "requires_custom_build", False):
+                marks.append(pytest.mark.custom_build)
+            # Always use pytest.param for type consistency (even with empty marks)
+            argvalues.append(pytest.param(scenario_name, marks=marks))
+            ids.append(scenario_name)
+        metafunc.parametrize("scenario_name", argvalues, ids=ids)
+def pytest_collection_modifyitems(config, items):
+    """Automatically deselect custom_build tests unless --include-custom-build is specified.
+    This allows users to run tests without any special flags and automatically excludes
+    tests that require custom builds. To include them, use --include-custom-build.
+    Note: If user explicitly uses -m marker filtering, we respect that and don't
+    auto-deselect, allowing them to run custom_build tests with -m "custom_build".
+    """
+    # If --include-custom-build flag is set, include all tests
+    if config.getoption("--include-custom-build"):
+        return
+    # If user explicitly used -m marker filtering, let pytest handle it
+    # Don't auto-deselect in this case
+    if config.option.markexpr:
+        return
+    # Default case: auto-deselect custom_build tests
+    deselected = []
+    selected = []
+    for item in items:
+        if "custom_build" in item.keywords:
+            deselected.append(item)
+        else:
+            selected.append(item)
+    if deselected:
+        config.hook.pytest_deselected(items=deselected)
+        items[:] = selected
 @pytest.fixture

--- a/tests/fault_tolerance/deploy/scenarios.py
+++ b/tests/fault_tolerance/deploy/scenarios.py
@@ -184,6 +184,9 @@ class Scenario:
    failures: list[Failure]
    model: Optional[str] = None
    backend: str = "vllm"  # Backend type for tracking
+    # When set to True, the test will be automatically marked with @pytest.mark.custom_build
+    # and excluded from default test runs unless --include-custom-build flag is used
+    requires_custom_build: bool = False  # Flag for tests needing custom builds/setup
 # Helper functions to create deployment specs
@@ -572,6 +575,7 @@ for deployment_name, deployment_info in DEPLOYMENT_SPECS.items():
            failures=failure,
            model=scenario_model,
            backend=backend,
+            requires_custom_build=is_moe,  # MoE models require custom builds
        )

--- a/tests/fault_tolerance/deploy/test_deployment.py
+++ b/tests/fault_tolerance/deploy/test_deployment.py
@@ -22,13 +22,13 @@ from tests.fault_tolerance.deploy.scenarios import (
 from tests.utils.managed_deployment import ManagedDeployment
-@pytest.fixture(params=scenarios.keys())
+@pytest.fixture
-def scenario(request, client_type):
+def scenario(scenario_name, client_type):
    """Get scenario and optionally override client type from command line.
    If --client-type is specified, it overrides the scenario's default client type.
    """
-    scenario_obj = scenarios[request.param]
+    scenario_obj = scenarios[scenario_name]
    # Override client type if specified on command line
    if client_type is not None: