test: bring back the framework 1 gpu pre-merge tests + clean up pytest markers (#4698)

Signed-off-by: PeaBrane <yanrpei@gmail.com>

Commit 7e499b5c (signature unverified): authored by Yan Ru Pei on Dec 02, 2025; committed by GitHub on Dec 02, 2025.
Other commit hash shown in the page header: 3cad926e.
18 changed files
--- a/.github/workflows/container-validation-backends.yml
+++ b/.github/workflows/container-validation-backends.yml
@@ -179,23 +179,14 @@ jobs:
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

-      - name: Run unit tests
+      - name: Run tests
        if: ${{ matrix.platform.arch != 'arm64' }}
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ steps.build-image.outputs.image_tag }}
-          pytest_marks: "unit and vllm and gpu_1"
+          pytest_marks: "pre_merge and vllm"
          framework: "vllm"
-          test_type: "unit"
-          platform_arch: ${{ matrix.platform.arch }}
-      - name: Run e2e tests
-        if: ${{ matrix.platform.arch != 'arm64' }}
-        uses: ./.github/actions/pytest
-        with:
-          image_tag: ${{ steps.build-image.outputs.image_tag }}
-          pytest_marks: "e2e and vllm and gpu_1 and not slow"
-          framework: "vllm"
-          test_type: "e2e, gpu_1"
+          test_type: "pre_merge"
          platform_arch: ${{ matrix.platform.arch }}

  sglang:
@@ -246,23 +237,14 @@ jobs:
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

-      - name: Run unit tests
-        if: ${{ matrix.platform.arch != 'arm64' }}
-        uses: ./.github/actions/pytest
-        with:
-          image_tag: ${{ steps.build-image.outputs.image_tag }}
-          pytest_marks: "unit and sglang and gpu_1"
-          framework: "sglang"
-          test_type: "unit"
-          platform_arch: ${{ matrix.platform.arch }}
-      - name: Run e2e tests
+      - name: Run tests
        if: ${{ matrix.platform.arch != 'arm64' }}
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ steps.build-image.outputs.image_tag }}
-          pytest_marks: "e2e and sglang and gpu_1"
+          pytest_marks: "pre_merge and sglang"
          framework: "sglang"
-          test_type: "e2e, gpu_1"
+          test_type: "pre_merge"
          platform_arch: ${{ matrix.platform.arch }}

  trtllm:
@@ -313,23 +295,14 @@ jobs:
          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

-      - name: Run unit tests
-        if: ${{ matrix.platform.arch != 'arm64' }}
-        uses: ./.github/actions/pytest
-        with:
-          image_tag: ${{ steps.build-image.outputs.image_tag }}
-          pytest_marks: "unit and trtllm and gpu_1"
-          framework: "trtllm"
-          test_type: "unit"
-          platform_arch: ${{ matrix.platform.arch }}
-      - name: Run e2e tests
+      - name: Run tests
        if: ${{ matrix.platform.arch != 'arm64' }}
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ steps.build-image.outputs.image_tag }}
-          pytest_marks: "e2e and trtllm and gpu_1 and not slow"
+          pytest_marks: "pre_merge and trtllm"
          framework: "trtllm"
-          test_type: "e2e, gpu_1"
+          test_type: "pre_merge"
          platform_arch: ${{ matrix.platform.arch }}

  deploy-test-fault-tolerance:

--- a/.github/workflows/container-validation-dynamo.yml
+++ b/.github/workflows/container-validation-dynamo.yml
@@ -65,7 +65,7 @@ jobs:
          docker compose down
      - name: Run pytest (parallel tests with xdist)
        env:
-          PYTEST_MARKS: "pre_merge and parallel"
+          PYTEST_MARKS: "pre_merge and parallel and not (vllm or sglang or trtllm)"
        run: |
          docker run -w /workspace \
            --name ${{ env.CONTAINER_ID }}_pytest_parallel \
@@ -77,7 +77,7 @@ jobs:
          docker cp ${{ env.CONTAINER_ID }}_pytest_parallel:/workspace/${{ env.PYTEST_PARALLEL_XML_FILE }} . || echo "No parallel test report found"
      - name: Run pytest (sequential tests)
        env:
-          PYTEST_MARKS: "(pre_merge and not parallel) or mypy"
+          PYTEST_MARKS: "((pre_merge and not parallel) or mypy) and not (vllm or sglang or trtllm)"
        run: |
          docker run -w /workspace \
            --name ${{ env.CONTAINER_ID }}_pytest \

--- a/components/src/dynamo/common/utils/prometheus.py
+++ b/components/src/dynamo/common/utils/prometheus.py
@@ -55,7 +55,7 @@ def register_engine_metrics_callback(

        # Include multiple metric prefixes
        register_engine_metrics_callback(
-            generate_endpoint, REGISTRY, metric_prefix_filter=["vllm:", "lmcache:"]
+            generate_endpoint, REGISTRY, metric_prefix_filters=["vllm:", "lmcache:"]
        )

        # With filtering and prefixing for TensorRT-LLM

--- a/components/src/dynamo/sglang/tests/test_sglang_prometheus_utils.py
+++ b/components/src/dynamo/sglang/tests/test_sglang_prometheus_utils.py
@@ -13,6 +13,7 @@ pytestmark = [
    pytest.mark.unit,
    pytest.mark.sglang,
    pytest.mark.gpu_0,
+    pytest.mark.pre_merge,
    pytest.mark.post_merge,
 ]

@@ -58,7 +59,7 @@ sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0075
        """Test SGLang use case: filter to sglang: metrics and exclude python_/process_."""
        result = get_prometheus_expfmt(
            sglang_registry,
-            metric_prefix_filter="sglang:",
+            metric_prefix_filters=["sglang:"],
            exclude_prefixes=["python_", "process_"],
        )


--- a/components/src/dynamo/trtllm/tests/test_trtllm_autodeploy.py
+++ b/components/src/dynamo/trtllm/tests/test_trtllm_autodeploy.py
@@ -19,6 +19,7 @@ pytestmark = [
    # `.github/workflows/container-validation-backends.yml` does not make use of
    # the `gpu_0` marker.
    pytest.mark.gpu_1,
+    pytest.mark.pre_merge,
 ]
 _PYTORCH_LLM_CLS_NAME = "dynamo.trtllm.engine.LLM"
 _AUTODEPLOY_LLM_CLS_NAME = "tensorrt_llm._torch.auto_deploy.LLM"

--- a/components/src/dynamo/trtllm/tests/test_trtllm_prometheus_utils.py
+++ b/components/src/dynamo/trtllm/tests/test_trtllm_prometheus_utils.py
@@ -13,6 +13,7 @@ pytestmark = [
    pytest.mark.unit,
    pytest.mark.trtllm,
    pytest.mark.gpu_0,
+    pytest.mark.pre_merge,
    pytest.mark.post_merge,
 ]


--- a/components/src/dynamo/trtllm/tests/test_trtllm_unit.py
+++ b/components/src/dynamo/trtllm/tests/test_trtllm_unit.py
@@ -23,6 +23,7 @@ pytestmark = [
    pytest.mark.unit,
    pytest.mark.trtllm,
    pytest.mark.gpu_1,
+    pytest.mark.pre_merge,
 ]



--- a/components/src/dynamo/vllm/tests/test_vllm_prometheus_utils.py
+++ b/components/src/dynamo/vllm/tests/test_vllm_prometheus_utils.py
@@ -13,6 +13,7 @@ pytestmark = [
    pytest.mark.unit,
    pytest.mark.vllm,
    pytest.mark.gpu_0,
+    pytest.mark.pre_merge,
    pytest.mark.post_merge,
 ]

@@ -56,7 +57,7 @@ vllm:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B"} 165
        """Test vLLM use case: filter to vllm: metrics and exclude python_/process_."""
        result = get_prometheus_expfmt(
            vllm_registry,
-            metric_prefix_filter="vllm:",
+            metric_prefix_filters=["vllm:"],
            exclude_prefixes=["python_", "process_"],
        )


--- a/tests/README.md
+++ b/tests/README.md
@@ -71,7 +71,8 @@ Markers are required for all tests. They are used for test selection in CI and l
 | Test Type [required]    | unit, integration, e2e, benchmark, stress, multimodal   | Nature of the test                 |
 | Hardware [required]     | gpu_0, gpu_1, gpu_2,  gpu_4, gpu_8, h100      | Number/type of GPUs required       |
 | Component/Framework     | vllm, trtllm, sglang, kvbm, planner, router    | Backend or component specificity   |
-| Other                   | slow, skip, xfail        | Special handling                   |
+| Execution               | parallel                 | Test can run in parallel with pytest-xdist |
+| Other                   | slow, skip, xfail, mypy, custom_build        | Special handling                   |

 ### Example
 ```python

--- a/tests/fault_tolerance/cancellation/test_trtllm.py
+++ b/tests/fault_tolerance/cancellation/test_trtllm.py
@@ -21,6 +21,13 @@ from tests.utils.payloads import check_health_generate, check_models_api

 logger = logging.getLogger(__name__)

+pytestmark = [
+    pytest.mark.trtllm,
+    pytest.mark.gpu_1,
+    pytest.mark.e2e,
+    pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
+]
+

 class DynamoWorkerProcess(ManagedProcess):
    """Process manager for Dynamo worker with TensorRT-LLM backend"""
@@ -127,10 +134,6 @@ class DynamoWorkerProcess(ManagedProcess):
        return False


-@pytest.mark.trtllm
-@pytest.mark.gpu_1
-@pytest.mark.e2e
-@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
 @pytest.mark.nightly
 def test_request_cancellation_trtllm_aggregated(
    request, runtime_services, predownload_models
@@ -205,10 +208,6 @@ def test_request_cancellation_trtllm_aggregated(
                logger.info(f"{description} detected successfully")


-@pytest.mark.trtllm
-@pytest.mark.gpu_1
-@pytest.mark.e2e
-@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
 @pytest.mark.nightly
 def test_request_cancellation_trtllm_decode_cancel(
    request, runtime_services, predownload_models
@@ -282,11 +281,7 @@ def test_request_cancellation_trtllm_decode_cancel(
                )


-@pytest.mark.trtllm
-@pytest.mark.gpu_1
-@pytest.mark.e2e
 @pytest.mark.nightly
-@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
 def test_request_cancellation_trtllm_prefill_cancel(
    request, runtime_services, predownload_models
 ):
@@ -369,10 +364,6 @@ def test_request_cancellation_trtllm_prefill_cancel(
                )


-@pytest.mark.trtllm
-@pytest.mark.gpu_1
-@pytest.mark.e2e
-@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
 @pytest.mark.xfail(
    reason="May fail due to unknown reason with TRT-LLM or backend implementation",
    strict=False,

--- a/tests/fault_tolerance/migration/test_vllm.py
+++ b/tests/fault_tolerance/migration/test_vllm.py
@@ -23,6 +23,14 @@ from .utils import (

 logger = logging.getLogger(__name__)

+pytestmark = [
+    pytest.mark.vllm,
+    pytest.mark.gpu_1,
+    pytest.mark.e2e,
+    pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME),
+    pytest.mark.nightly,
+]
+

 class DynamoWorkerProcess(ManagedProcess):
    """Process manager for Dynamo worker with vLLM backend"""
@@ -100,11 +108,6 @@ class DynamoWorkerProcess(ManagedProcess):
        return False


-@pytest.mark.vllm
-@pytest.mark.gpu_1
-@pytest.mark.e2e
-@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
-@pytest.mark.nightly
 def test_request_migration_vllm_worker_failure(
    request, runtime_services, predownload_models, set_ucx_tls_no_mm
 ):
@@ -151,11 +154,6 @@ def test_request_migration_vllm_worker_failure(
                verify_migration_occurred(frontend)


-@pytest.mark.vllm
-@pytest.mark.gpu_1
-@pytest.mark.e2e
-@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
-@pytest.mark.nightly
 def test_request_migration_vllm_graceful_shutdown(
    request, runtime_services, predownload_models, set_ucx_tls_no_mm
 ):
@@ -203,11 +201,6 @@ def test_request_migration_vllm_graceful_shutdown(
                verify_migration_occurred(frontend)


-@pytest.mark.vllm
-@pytest.mark.gpu_1
-@pytest.mark.e2e
-@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
-@pytest.mark.nightly
 def test_no_request_migration_vllm_worker_failure(
    request, runtime_services, predownload_models, set_ucx_tls_no_mm
 ):
@@ -268,11 +261,6 @@ def test_no_request_migration_vllm_worker_failure(
                    ), f"Unexpected migration message: {e}"


-@pytest.mark.vllm
-@pytest.mark.gpu_1
-@pytest.mark.e2e
-@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
-@pytest.mark.nightly
 def test_no_request_migration_vllm_graceful_shutdown(
    request, runtime_services, predownload_models, set_ucx_tls_no_mm
 ):

--- a/tests/frontend/test_completion_mocker_engine.py
+++ b/tests/frontend/test_completion_mocker_engine.py
@@ -22,6 +22,13 @@ logger = logging.getLogger(__name__)

 TEST_MODEL = QWEN

+pytestmark = [
+    pytest.mark.e2e,
+    pytest.mark.gpu_1,
+    pytest.mark.post_merge,
+    pytest.mark.model(TEST_MODEL),
+]
+

 class DynamoFrontendProcess(ManagedProcess):
    """Process manager for Dynamo frontend"""
@@ -145,10 +152,6 @@ def start_services(request, runtime_services):


 @pytest.mark.usefixtures("start_services")
-@pytest.mark.e2e
-@pytest.mark.gpu_1
-@pytest.mark.post_merge
-@pytest.mark.model(TEST_MODEL)
 def test_completion_string_prompt() -> None:
    payload: Dict[str, Any] = {
        "model": TEST_MODEL,
@@ -165,10 +168,6 @@ def test_completion_string_prompt() -> None:


 @pytest.mark.usefixtures("start_services")
-@pytest.mark.e2e
-@pytest.mark.gpu_1
-@pytest.mark.post_merge
-@pytest.mark.model(TEST_MODEL)
 def test_completion_empty_array_prompt() -> None:
    payload: Dict[str, Any] = {
        "model": TEST_MODEL,
@@ -185,10 +184,6 @@ def test_completion_empty_array_prompt() -> None:


 @pytest.mark.usefixtures("start_services")
-@pytest.mark.e2e
-@pytest.mark.gpu_1
-@pytest.mark.post_merge
-@pytest.mark.model(TEST_MODEL)
 def test_completion_single_element_array_prompt() -> None:
    payload: Dict[str, Any] = {
        "model": TEST_MODEL,
@@ -205,10 +200,6 @@ def test_completion_single_element_array_prompt() -> None:


 @pytest.mark.usefixtures("start_services")
-@pytest.mark.e2e
-@pytest.mark.gpu_1
-@pytest.mark.post_merge
-@pytest.mark.model(TEST_MODEL)
 def test_completion_multi_element_array_prompt() -> None:
    payload: Dict[str, Any] = {
        "model": TEST_MODEL,

--- a/tests/frontend/test_vllm.py
+++ b/tests/frontend/test_vllm.py
@@ -22,6 +22,13 @@ logger = logging.getLogger(__name__)

 TEST_MODEL = GPT_OSS

+pytestmark = [
+    pytest.mark.vllm,
+    pytest.mark.gpu_1,
+    pytest.mark.e2e,
+    pytest.mark.model(TEST_MODEL),
+]
+
 WEATHER_TOOL = {
    "type": "function",
    "function": {
@@ -211,11 +218,7 @@ def _validate_chat_response(response: requests.Response) -> Dict[str, Any]:


 @pytest.mark.usefixtures("start_services")
-@pytest.mark.vllm
-@pytest.mark.gpu_1
-@pytest.mark.e2e
 @pytest.mark.post_merge
-@pytest.mark.model(TEST_MODEL)
 def test_reasoning_effort(request, runtime_services, predownload_models) -> None:
    """High reasoning effort should yield more detailed reasoning than low effort."""

@@ -278,11 +281,7 @@ def test_reasoning_effort(request, runtime_services, predownload_models) -> None


 @pytest.mark.usefixtures("start_services")
-@pytest.mark.vllm
-@pytest.mark.gpu_1
-@pytest.mark.e2e
 @pytest.mark.post_merge
-@pytest.mark.model(TEST_MODEL)
 def test_tool_calling(request, runtime_services, predownload_models) -> None:
    """Test tool calling functionality with weather and system health tools."""

@@ -321,11 +320,7 @@ def test_tool_calling(request, runtime_services, predownload_models) -> None:


 @pytest.mark.usefixtures("start_services")
-@pytest.mark.vllm
-@pytest.mark.gpu_1
-@pytest.mark.e2e
 @pytest.mark.nightly
-@pytest.mark.model(TEST_MODEL)
 def test_tool_calling_second_round(
    request, runtime_services, predownload_models
 ) -> None:
@@ -388,11 +383,7 @@ def test_tool_calling_second_round(


 @pytest.mark.usefixtures("start_services")
-@pytest.mark.vllm
-@pytest.mark.gpu_1
-@pytest.mark.e2e
 @pytest.mark.nightly
-@pytest.mark.model(TEST_MODEL)
 def test_reasoning(request, runtime_services, predownload_models) -> None:
    """Test reasoning functionality with a mathematical problem."""


--- a/tests/router/test_router_e2e_with_mockers.py
+++ b/tests/router/test_router_e2e_with_mockers.py
@@ -22,17 +22,17 @@ from tests.router.common import (  # utilities
 from tests.utils.constants import ROUTER_MODEL_NAME
 from tests.utils.managed_process import ManagedProcess

+logger = logging.getLogger(__name__)
+
+MODEL_NAME = ROUTER_MODEL_NAME
+
 pytestmark = [
    pytest.mark.pre_merge,
    pytest.mark.gpu_0,
    pytest.mark.integration,
+    pytest.mark.parallel,
+    pytest.mark.model(MODEL_NAME),
 ]
-
-
-logger = logging.getLogger(__name__)
-
-
-MODEL_NAME = ROUTER_MODEL_NAME
 NUM_MOCKERS = 2
 SPEEDUP_RATIO = 10.0
 BASE_PORT = 9100  # Base port for all tests (high port to avoid conflicts)
@@ -287,11 +287,6 @@ class DisaggMockerProcess:
        self._process.__exit__(exc_type, exc_val, exc_tb)


-@pytest.mark.pre_merge
-@pytest.mark.gpu_0
-@pytest.mark.integration
-@pytest.mark.parallel
-@pytest.mark.model(MODEL_NAME)
 def test_mocker_kv_router(request, runtime_services_session, predownload_tokenizers):
    """
    Test KV router with multiple mocker engine instances.
@@ -331,11 +326,6 @@ def test_mocker_kv_router(request, runtime_services_session, predownload_tokeniz
            mockers.__exit__(None, None, None)


-@pytest.mark.pre_merge
-@pytest.mark.gpu_0
-@pytest.mark.integration
-@pytest.mark.parallel
-@pytest.mark.model(MODEL_NAME)
 @pytest.mark.parametrize("store_backend", ["etcd", "file"])
 def test_mocker_two_kv_router(
    request,
@@ -391,11 +381,6 @@ def test_mocker_two_kv_router(
            mockers.__exit__(None, None, None)


-@pytest.mark.pre_merge
-@pytest.mark.gpu_0
-@pytest.mark.integration
-@pytest.mark.parallel
-@pytest.mark.model(MODEL_NAME)
 @pytest.mark.skip(reason="Flaky, temporarily disabled")
 def test_mocker_kv_router_overload_503(
    request, runtime_services_session, predownload_tokenizers
@@ -434,11 +419,6 @@ def test_mocker_kv_router_overload_503(
            mockers.__exit__(None, None, None)


-@pytest.mark.pre_merge
-@pytest.mark.gpu_0
-@pytest.mark.integration
-@pytest.mark.parallel
-@pytest.mark.model(MODEL_NAME)
 def test_kv_push_router_bindings(
    request, runtime_services_session, predownload_tokenizers
 ):
@@ -475,11 +455,6 @@ def test_kv_push_router_bindings(
            mockers.__exit__(None, None, None)


-@pytest.mark.pre_merge
-@pytest.mark.gpu_0
-@pytest.mark.integration
-@pytest.mark.parallel
-@pytest.mark.model(MODEL_NAME)
 @pytest.mark.parametrize("store_backend", ["etcd", "file"])
 def test_indexers_sync(
    request,
@@ -529,11 +504,6 @@ def test_indexers_sync(
            mockers.__exit__(None, None, None)


-@pytest.mark.pre_merge
-@pytest.mark.gpu_0
-@pytest.mark.integration
-@pytest.mark.parallel
-@pytest.mark.model(MODEL_NAME)
 def test_query_instance_id_returns_worker_and_tokens(
    request, runtime_services_session, predownload_tokenizers
 ):
@@ -568,11 +538,6 @@ def test_query_instance_id_returns_worker_and_tokens(
            mockers.__exit__(None, None, None)


-@pytest.mark.pre_merge
-@pytest.mark.gpu_0
-@pytest.mark.integration
-@pytest.mark.parallel
-@pytest.mark.model(MODEL_NAME)
 def test_router_decisions(request, runtime_services_session, predownload_tokenizers):
    """Validate KV cache prefix reuse and dp_rank routing by sending progressive requests with overlapping prefixes."""

@@ -612,9 +577,6 @@ def test_router_decisions(request, runtime_services_session, predownload_tokeniz
            mockers.__exit__(None, None, None)


-@pytest.mark.pre_merge
-@pytest.mark.parallel
-@pytest.mark.model(MODEL_NAME)
 def test_router_disagg_decisions(
    request, runtime_services_session, predownload_tokenizers
 ):
@@ -680,11 +642,6 @@ def test_router_disagg_decisions(
            prefill_workers.__exit__(None, None, None)


-@pytest.mark.pre_merge
-@pytest.mark.gpu_0
-@pytest.mark.integration
-@pytest.mark.parallel
-@pytest.mark.model(MODEL_NAME)
 def test_busy_threshold_endpoint(
    request, runtime_services_session, predownload_tokenizers
 ):

--- a/tests/router/test_router_e2e_with_vllm.py
+++ b/tests/router/test_router_e2e_with_vllm.py
@@ -18,6 +18,13 @@ from tests.utils.managed_process import ManagedProcess
 logger = logging.getLogger(__name__)

 MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+
+pytestmark = [
+    pytest.mark.pre_merge,
+    pytest.mark.e2e,
+    pytest.mark.vllm,
+    pytest.mark.model(MODEL_NAME),
+]
 SPEEDUP_RATIO = 10.0
 PORTS = [
    8011,
@@ -269,11 +276,8 @@ class VLLMProcess:
        time.sleep(2)


-@pytest.mark.e2e
 @pytest.mark.gpu_1
-@pytest.mark.vllm
 @pytest.mark.skip(reason="All vLLM tests disabled for now")
-@pytest.mark.model(MODEL_NAME)
 def test_vllm_kv_router_basic(request, runtime_services, predownload_tokenizers):
    """
    Quick e2e sanity test for KV router with vLLM engine instances.
@@ -319,11 +323,8 @@ def test_vllm_kv_router_basic(request, runtime_services, predownload_tokenizers)
            vllm_workers.__exit__(None, None, None)


-@pytest.mark.e2e
-@pytest.mark.vllm
 @pytest.mark.gpu_1
 @pytest.mark.skip(reason="All vLLM tests disabled for now")
-@pytest.mark.model(MODEL_NAME)
 def test_router_decisions_vllm_multiple_workers(
    request, runtime_services, predownload_tokenizers
 ):
@@ -371,11 +372,8 @@ def test_router_decisions_vllm_multiple_workers(
            vllm_workers.__exit__(None, None, None)


-@pytest.mark.e2e
-@pytest.mark.vllm
 @pytest.mark.gpu_2
 @pytest.mark.skip(reason="All vLLM tests disabled for now")
-@pytest.mark.model(MODEL_NAME)
 def test_router_decisions_vllm_dp(request, runtime_services, predownload_tokenizers):
    """Validate KV cache prefix reuse with vLLM by sending progressive requests with overlapping prefixes.
    Same flow as test_router_decisions_vllm_multiple_workers; force first request to (worker_id, dp_rank=1).

--- a/tests/serve/test_sglang.py
+++ b/tests/serve/test_sglang.py
@@ -44,7 +44,7 @@ sglang_configs = {
        name="aggregated",
        directory=sglang_dir,
        script_name="agg.sh",
-        marks=[pytest.mark.gpu_1],
+        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
        model="Qwen/Qwen3-0.6B",
        env={},
        models_port=8000,
@@ -73,7 +73,11 @@ sglang_configs = {
        name="disaggregated_same_gpu",
        directory=sglang_dir,
        script_name="disagg_same_gpu.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.skip(reason="unstable")],
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.pre_merge,
+            pytest.mark.skip(reason="unstable"),
+        ],
        model="Qwen/Qwen3-0.6B",
        env={},
        models_port=8000,
@@ -116,7 +120,7 @@ sglang_configs = {
        name="template_verification",
        directory=SERVE_TEST_DIR,  # special directory for test-specific scripts
        script_name="template_verifier.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.nightly],
+        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.nightly],
        model="Qwen/Qwen3-0.6B",
        env={},
        models_port=8000,
@@ -159,7 +163,7 @@ sglang_configs = {
        name="embedding_agg",
        directory=sglang_dir,
        script_name="agg_embed.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.nightly],
+        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.nightly],
        model="Qwen/Qwen3-Embedding-4B",
        delayed_start=0,
        timeout=180,

--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -40,7 +40,7 @@ trtllm_configs = {
        name="aggregated",
        directory=trtllm_dir,
        script_name="agg_metrics.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.trtllm],
+        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
        model="Qwen/Qwen3-0.6B",
        models_port=8000,
        request_payloads=[
@@ -65,7 +65,7 @@ trtllm_configs = {
        name="disaggregated_same_gpu",
        directory=trtllm_dir,
        script_name="disagg_same_gpu.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.trtllm],
+        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
        model="Qwen/Qwen3-0.6B",
        models_port=8000,
        request_payloads=[
@@ -79,7 +79,7 @@ trtllm_configs = {
        name="aggregated_router",
        directory=trtllm_dir,
        script_name="agg_router.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.trtllm, pytest.mark.post_merge],
+        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge, pytest.mark.trtllm],
        model="Qwen/Qwen3-0.6B",
        models_port=8000,
        request_payloads=[

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -43,7 +43,7 @@ vllm_configs = {
        name="aggregated",
        directory=vllm_dir,
        script_name="agg.sh",
-        marks=[pytest.mark.gpu_1],
+        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
@@ -55,7 +55,7 @@ vllm_configs = {
        name="aggregated_lmcache",
        directory=vllm_dir,
        script_name="agg_lmcache.sh",
-        marks=[pytest.mark.gpu_1],
+        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
@@ -68,7 +68,7 @@ vllm_configs = {
        name="agg-request-plane-tcp",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
-        marks=[pytest.mark.gpu_1],
+        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
        model="Qwen/Qwen3-0.6B",
        script_args=["--tcp"],
        request_payloads=[
@@ -80,7 +80,7 @@ vllm_configs = {
        name="agg-request-plane-http",
        directory=vllm_dir,
        script_name="agg_request_planes.sh",
-        marks=[pytest.mark.gpu_1],
+        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
        model="Qwen/Qwen3-0.6B",
        script_args=["--http"],
        request_payloads=[