test: Add pytest markers (#4111)

Signed-off-by: pvijayakrish <pvijayakrish@nvidia.com>
Signed-off-by: Pavithra Vijayakrishnan <160681768+pvijayakrish@users.noreply.github.com>

test: Add pytest markers (#4111)
Signed-off-by: pvijayakrish <pvijayakrish@nvidia.com>
Signed-off-by: Pavithra Vijayakrishnan <160681768+pvijayakrish@users.noreply.github.com>
0f6dca6e · Pavithra Vijayakrishnan · GitHub · 5d11f75e · 0f6dca6e · 0f6dca6e
Unverified commit 0f6dca6e, authored Dec 01, 2025 by Pavithra Vijayakrishnan; committed by GitHub on Dec 01, 2025.
16 changed files
--- a/tests/frontend/grpc/test_tensor_parameters.py
+++ b/tests/frontend/grpc/test_tensor_parameters.py
@@ -92,6 +92,7 @@ def extract_params(param_map) -> dict:
 @pytest.mark.e2e
 @pytest.mark.pre_merge
+@pytest.mark.gpu_1
 @pytest.mark.parametrize(
    "request_params",
    [

--- a/tests/frontend/test_completion_mocker_engine.py
+++ b/tests/frontend/test_completion_mocker_engine.py
@@ -146,6 +146,8 @@ def start_services(request, runtime_services):
 @pytest.mark.usefixtures("start_services")
 @pytest.mark.e2e
+@pytest.mark.gpu_1
+@pytest.mark.post_merge
 @pytest.mark.model(TEST_MODEL)
 def test_completion_string_prompt() -> None:
    payload: Dict[str, Any] = {
@@ -164,6 +166,8 @@ def test_completion_string_prompt() -> None:
 @pytest.mark.usefixtures("start_services")
 @pytest.mark.e2e
+@pytest.mark.gpu_1
+@pytest.mark.post_merge
 @pytest.mark.model(TEST_MODEL)
 def test_completion_empty_array_prompt() -> None:
    payload: Dict[str, Any] = {
@@ -182,6 +186,8 @@ def test_completion_empty_array_prompt() -> None:
 @pytest.mark.usefixtures("start_services")
 @pytest.mark.e2e
+@pytest.mark.gpu_1
+@pytest.mark.post_merge
 @pytest.mark.model(TEST_MODEL)
 def test_completion_single_element_array_prompt() -> None:
    payload: Dict[str, Any] = {
@@ -200,6 +206,8 @@ def test_completion_single_element_array_prompt() -> None:
 @pytest.mark.usefixtures("start_services")
 @pytest.mark.e2e
+@pytest.mark.gpu_1
+@pytest.mark.post_merge
 @pytest.mark.model(TEST_MODEL)
 def test_completion_multi_element_array_prompt() -> None:
    payload: Dict[str, Any] = {

--- a/tests/frontend/test_vllm.py
+++ b/tests/frontend/test_vllm.py
@@ -214,6 +214,7 @@ def _validate_chat_response(response: requests.Response) -> Dict[str, Any]:
 @pytest.mark.vllm
 @pytest.mark.gpu_1
 @pytest.mark.e2e
+@pytest.mark.post_merge
 @pytest.mark.model(TEST_MODEL)
 def test_reasoning_effort(request, runtime_services, predownload_models) -> None:
    """High reasoning effort should yield more detailed reasoning than low effort."""
@@ -280,6 +281,7 @@ def test_reasoning_effort(request, runtime_services, predownload_models) -> None
 @pytest.mark.vllm
 @pytest.mark.gpu_1
 @pytest.mark.e2e
+@pytest.mark.post_merge
 @pytest.mark.model(TEST_MODEL)
 def test_tool_calling(request, runtime_services, predownload_models) -> None:
    """Test tool calling functionality with weather and system health tools."""
@@ -322,6 +324,7 @@ def test_tool_calling(request, runtime_services, predownload_models) -> None:
 @pytest.mark.vllm
 @pytest.mark.gpu_1
 @pytest.mark.e2e
+@pytest.mark.nightly
 @pytest.mark.model(TEST_MODEL)
 def test_tool_calling_second_round(
    request, runtime_services, predownload_models
@@ -388,6 +391,7 @@ def test_tool_calling_second_round(
 @pytest.mark.vllm
 @pytest.mark.gpu_1
 @pytest.mark.e2e
+@pytest.mark.nightly
 @pytest.mark.model(TEST_MODEL)
 def test_reasoning(request, runtime_services, predownload_models) -> None:
    """Test reasoning functionality with a mathematical problem."""

--- a/tests/kvbm_integration/test_consolidator_router_e2e.py
+++ b/tests/kvbm_integration/test_consolidator_router_e2e.py
@@ -35,6 +35,7 @@ pytestmark = [
    pytest.mark.e2e,
    pytest.mark.slow,
    pytest.mark.gpu_1,
+    pytest.mark.nightly,
    pytest.mark.skipif(not HAS_VLLM, reason="requires vllm"),
 ]

--- a/tests/kvbm_integration/test_cuda_graph.py
+++ b/tests/kvbm_integration/test_cuda_graph.py
@@ -143,9 +143,12 @@ def send_completion_request(
 # Test markers to align with repository conventions
 # Todo: enable the rest when kvbm is built in the ci
 @pytest.mark.kvbm
-@pytest.mark.trtllm_marker
+@pytest.mark.trtllm
 @pytest.mark.e2e
+@pytest.mark.nightly
 @pytest.mark.slow
 @pytest.mark.gpu_1
 @pytest.mark.skip(
@@ -182,9 +185,10 @@ def test_kvbm_without_cuda_graph_enabled(request, runtime_services):
 @pytest.mark.kvbm
-@pytest.mark.trtllm_marker
+@pytest.mark.trtllm
 @pytest.mark.e2e
 @pytest.mark.slow
+@pytest.mark.nightly
 @pytest.mark.gpu_1
 @pytest.mark.skip(
    reason="Enable these tests once dynamo `main` upgrades to TRTLLM 1.2+"

--- a/tests/kvbm_integration/test_determinism_agg.py
+++ b/tests/kvbm_integration/test_determinism_agg.py
@@ -38,6 +38,7 @@ pytestmark = [
    pytest.mark.e2e,
    pytest.mark.slow,
    pytest.mark.gpu_1,
+    pytest.mark.nightly,
 ]

--- a/tests/kvbm_integration/test_determinism_disagg.py
+++ b/tests/kvbm_integration/test_determinism_disagg.py
@@ -38,6 +38,7 @@ pytestmark = [
    pytest.mark.e2e,
    pytest.mark.slow,
    pytest.mark.gpu_2,
+    pytest.mark.nightly,
 ]

--- a/tests/planner/test_replica_calculation.py
+++ b/tests/planner/test_replica_calculation.py
@@ -104,6 +104,9 @@ def planner():
 class TestReplicaCalculation:
    """Test replica calculation formulas in isolation."""
+    @pytest.mark.nightly
+    @pytest.mark.gpu_2
+    @pytest.mark.performance
    def test_prefill_replica_calculation_basic(self, planner):
        """Test basic prefill replica calculation."""
        # Setup test data
@@ -173,6 +176,9 @@ class TestReplicaCalculation:
                == calculated_prefill_replicas
            )
+    @pytest.mark.nightly
+    @pytest.mark.gpu_2
+    @pytest.mark.performance
    def test_decode_replica_calculation_basic(self, planner):
        """Test basic decode replica calculation."""
        # Setup test data
@@ -242,6 +248,9 @@ class TestReplicaCalculation:
            (500, 1000, 1, 2),  # high_load_500_req_per_second (lower decode throughput)
        ],
    )
+    @pytest.mark.nightly
+    @pytest.mark.gpu_2
+    @pytest.mark.performance
    def test_scaling_scenario_low_to_high_load(
        self, planner, num_req, decode_thpt, expected_p, expected_d
    ):
@@ -307,6 +316,9 @@ class TestReplicaCalculation:
                decode_replicas == expected_d
            ), f"Decode replicas mismatch: expected {expected_d}, got {decode_replicas}"
+    @pytest.mark.nightly
+    @pytest.mark.gpu_2
+    @pytest.mark.performance
    def test_gpu_budget_constraint(self, planner):
        """Test that GPU budget constraints are properly applied."""
        # Set a low GPU budget
@@ -363,6 +375,9 @@ class TestReplicaCalculation:
                total_gpus <= planner.args.max_gpu_budget
            ), "Total GPU usage exceeds budget"
+    @pytest.mark.nightly
+    @pytest.mark.gpu_2
+    @pytest.mark.performance
    def test_min_endpoint_constraint(self, planner):
        """Test that minimum endpoint constraints are respected."""
        planner.args.min_endpoint = 2
@@ -414,6 +429,9 @@ class TestReplicaCalculation:
                decode_replicas >= planner.args.min_endpoint
            ), "Decode replicas below minimum"
+    @pytest.mark.nightly
+    @pytest.mark.gpu_2
+    @pytest.mark.performance
    def test_prefill_correction_factor_clamping(self, planner):
        """Test that prefill correction factor > 1 is clamped to 1."""
        # Set a high correction factor > 1
@@ -473,6 +491,9 @@ class TestReplicaCalculation:
                expected_prefill_replicas, planner.args.min_endpoint
            ), "Prefill correction factor should be clamped to 1"
+    @pytest.mark.nightly
+    @pytest.mark.gpu_2
+    @pytest.mark.performance
    def test_decode_correction_factor_zero_handling(self, planner):
        """Test handling of d_correction_factor <= 0."""
        # Test both 0 and negative values
@@ -534,6 +555,9 @@ class TestReplicaCalculation:
                        decode_replicas >= 1
                    ), f"Should handle correction factor {correction_factor} gracefully"
+    @pytest.mark.nightly
+    @pytest.mark.gpu_2
+    @pytest.mark.performance
    def test_multi_gpu_engines(self, planner):
        """Test replica calculation with multi-GPU engines."""
        # Set multi-GPU configuration
@@ -599,6 +623,9 @@ class TestReplicaCalculation:
                expected_decode_replicas, planner.args.min_endpoint
            )
+    @pytest.mark.weekly
+    @pytest.mark.gpu_2
+    @pytest.mark.performance
    def test_complex_gpu_budget_scaling(self, planner):
        """Test complex GPU budget scaling with proportional reduction and decode adjustment."""
        # Set tight GPU budget that will trigger complex scaling

--- a/tests/planner/unit/test_prometheus.py
+++ b/tests/planner/unit/test_prometheus.py
@@ -24,6 +24,13 @@ from dynamo.planner.utils.prometheus import (
    PrometheusAPIClient,
 )
+pytestmark = [
+    pytest.mark.gpu_0,
+    pytest.mark.pre_merge,
+    pytest.mark.unit,
+    pytest.mark.planner,
+]
 @pytest.fixture
 def mock_prometheus_result():

--- a/tests/planner/unit/test_virtual_connector.py
+++ b/tests/planner/unit/test_virtual_connector.py
@@ -13,7 +13,12 @@ import pytest
 from dynamo._core import DistributedRuntime, VirtualConnectorClient
 from dynamo.planner import SubComponentType, TargetReplica, VirtualConnector
-pytestmark = pytest.mark.pre_merge
+pytestmark = [
+    pytest.mark.gpu_0,
+    pytest.mark.pre_merge,
+    pytest.mark.unit,
+    pytest.mark.planner,
+]
 logger = logging.getLogger(__name__)
 NAMESPACE = "test_virtual_connector"

--- a/tests/profiler/test_profile_sla_aiconfigurator.py
+++ b/tests/profiler/test_profile_sla_aiconfigurator.py
@@ -77,6 +77,8 @@ class TestProfileSlaAiconfigurator:
        return Args()
    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    @pytest.mark.performance
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.parametrize("missing_arg", ["aic_system", "aic_hf_id"])
@@ -88,6 +90,8 @@ class TestProfileSlaAiconfigurator:
            await run_profile(trtllm_args)
    @pytest.mark.pre_merge
+    @pytest.mark.gpu_0
+    @pytest.mark.performance
    @pytest.mark.parallel
    @pytest.mark.asyncio
    @pytest.mark.parametrize(
@@ -108,12 +112,17 @@ class TestProfileSlaAiconfigurator:
    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
+    @pytest.mark.gpu_1
+    @pytest.mark.performance
    async def test_trtllm_aiconfigurator_single_model(self, trtllm_args):
        # Test that profile_sla works with the model & backend in the trtllm_args fixture.
        await run_profile(trtllm_args)
    @pytest.mark.parallel
    @pytest.mark.asyncio
+    @pytest.mark.gpu_1
+    @pytest.mark.nightly
+    @pytest.mark.performance
    @pytest.mark.parametrize(
        "backend, aic_backend_version",
        [

--- a/tests/profiler/test_profile_sla_dryrun.py
+++ b/tests/profiler/test_profile_sla_dryrun.py
@@ -128,6 +128,9 @@ class TestProfileSLADryRun:
    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
+    @pytest.mark.gpu_0
+    @pytest.mark.integration
+    @pytest.mark.vllm
    async def test_vllm_dryrun(self, vllm_args):
        """Test that profile_sla dry-run works for vllm backend with disagg.yaml config."""
        # Run the profile in dry-run mode - should complete without errors
@@ -136,6 +139,9 @@ class TestProfileSLADryRun:
    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
+    @pytest.mark.gpu_0
+    @pytest.mark.integration
+    @pytest.mark.sglang
    async def test_sglang_dryrun(self, sglang_args):
        """Test that profile_sla dry-run works for sglang backend with disagg.yaml config."""
        # Run the profile in dry-run mode - should complete without errors
@@ -186,6 +192,9 @@ class TestProfileSLADryRun:
    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
+    @pytest.mark.gpu_0
+    @pytest.mark.integration
+    @pytest.mark.trtllm
    async def test_trtllm_dryrun(self, trtllm_args):
        """Test that profile_sla dry-run works for trtllm backend with disagg.yaml config."""
        # Run the profile in dry-run mode - should complete without errors
@@ -237,6 +246,9 @@ class TestProfileSLADryRun:
    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
+    @pytest.mark.gpu_0
+    @pytest.mark.integration
+    @pytest.mark.sglang
    async def test_sglang_moe_dryrun(self, sglang_moe_args):
        """Test that profile_sla dry-run works for sglang backend with MoE config."""
        # Run the profile in dry-run mode - should complete without errors
@@ -304,6 +316,9 @@ class TestProfileSLADryRun:
    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
+    @pytest.mark.integration
+    @pytest.mark.gpu_0
+    @pytest.mark.vllm
    @patch("benchmarks.profiler.utils.search_space_autogen.get_gpu_summary")
    @patch("benchmarks.profiler.utils.search_space_autogen.get_model_info")
    async def test_profile_with_autogen_search_space_h100(
@@ -368,6 +383,9 @@ class TestProfileSLADryRun:
    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
+    @pytest.mark.gpu_0
+    @pytest.mark.integration
+    @pytest.mark.sglang
    @patch("benchmarks.profiler.utils.search_space_autogen.get_gpu_summary")
    @patch("benchmarks.profiler.utils.search_space_autogen.get_model_info")
    async def test_sglang_profile_with_autogen_search_space_h100(
@@ -432,6 +450,9 @@ class TestProfileSLADryRun:
    @pytest.mark.pre_merge
    @pytest.mark.parallel
    @pytest.mark.asyncio
+    @pytest.mark.gpu_0
+    @pytest.mark.integration
+    @pytest.mark.trtllm
    @patch("benchmarks.profiler.utils.search_space_autogen.get_gpu_summary")
    @patch("benchmarks.profiler.utils.search_space_autogen.get_model_info")
    async def test_trtllm_profile_with_autogen_search_space_h100(

--- a/tests/router/test_router_e2e_with_mockers.py
+++ b/tests/router/test_router_e2e_with_mockers.py
@@ -21,7 +21,11 @@ from tests.router.common import (  # utilities
 from tests.utils.constants import ROUTER_MODEL_NAME
 from tests.utils.managed_process import ManagedProcess
-pytestmark = pytest.mark.pre_merge
+pytestmark = [
+    pytest.mark.pre_merge,
+    pytest.mark.gpu_0,
+    pytest.mark.integration,
+]
 logger = logging.getLogger(__name__)
@@ -282,6 +286,8 @@ class DisaggMockerProcess:
 @pytest.mark.pre_merge
+@pytest.mark.gpu_0
+@pytest.mark.integration
 @pytest.mark.parallel
 @pytest.mark.model(MODEL_NAME)
 def test_mocker_kv_router(request, runtime_services_session, predownload_tokenizers):
@@ -324,6 +330,8 @@ def test_mocker_kv_router(request, runtime_services_session, predownload_tokeniz
 @pytest.mark.pre_merge
+@pytest.mark.gpu_0
+@pytest.mark.integration
 @pytest.mark.parallel
 @pytest.mark.model(MODEL_NAME)
 @pytest.mark.parametrize("store_backend", ["etcd", "file"])
@@ -382,6 +390,8 @@ def test_mocker_two_kv_router(
 @pytest.mark.pre_merge
+@pytest.mark.gpu_0
+@pytest.mark.integration
 @pytest.mark.parallel
 @pytest.mark.model(MODEL_NAME)
 @pytest.mark.skip(reason="Flaky, temporarily disabled")
@@ -423,6 +433,8 @@ def test_mocker_kv_router_overload_503(
 @pytest.mark.pre_merge
+@pytest.mark.gpu_0
+@pytest.mark.integration
 @pytest.mark.parallel
 @pytest.mark.model(MODEL_NAME)
 def test_kv_push_router_bindings(
@@ -462,6 +474,8 @@ def test_kv_push_router_bindings(
 @pytest.mark.pre_merge
+@pytest.mark.gpu_0
+@pytest.mark.integration
 @pytest.mark.parallel
 @pytest.mark.model(MODEL_NAME)
 @pytest.mark.parametrize("store_backend", ["etcd", "file"])
@@ -514,6 +528,8 @@ def test_indexers_sync(
 @pytest.mark.pre_merge
+@pytest.mark.gpu_0
+@pytest.mark.integration
 @pytest.mark.parallel
 @pytest.mark.model(MODEL_NAME)
 def test_query_instance_id_returns_worker_and_tokens(
@@ -551,6 +567,8 @@ def test_query_instance_id_returns_worker_and_tokens(
 @pytest.mark.pre_merge
+@pytest.mark.gpu_0
+@pytest.mark.integration
 @pytest.mark.parallel
 @pytest.mark.model(MODEL_NAME)
 def test_router_decisions(request, runtime_services_session, predownload_tokenizers):

--- a/tests/serve/test_sglang.py
+++ b/tests/serve/test_sglang.py
@@ -44,7 +44,7 @@ sglang_configs = {
        name="aggregated",
        directory=sglang_dir,
        script_name="agg.sh",
-        marks=[pytest.mark.gpu_1],
+        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
        model="Qwen/Qwen3-0.6B",
        env={},
        models_port=8000,
@@ -58,7 +58,7 @@ sglang_configs = {
        name="disaggregated",
        directory=sglang_dir,
        script_name="disagg.sh",
-        marks=[pytest.mark.gpu_2],
+        marks=[pytest.mark.gpu_2, pytest.mark.post_merge],
        model="Qwen/Qwen3-0.6B",
        env={},
        models_port=8000,
@@ -116,7 +116,7 @@ sglang_configs = {
        name="template_verification",
        directory=SERVE_TEST_DIR,  # special directory for test-specific scripts
        script_name="template_verifier.sh",
-        marks=[pytest.mark.gpu_1],
+        marks=[pytest.mark.gpu_1, pytest.mark.nightly],
        model="Qwen/Qwen3-0.6B",
        env={},
        models_port=8000,
@@ -130,7 +130,7 @@ sglang_configs = {
        name="multimodal_agg_qwen",
        directory=sglang_dir,
        script_name="multimodal_agg.sh",
-        marks=[pytest.mark.gpu_2],
+        marks=[pytest.mark.gpu_2, pytest.mark.nightly],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        delayed_start=0,
        timeout=360,
@@ -159,7 +159,7 @@ sglang_configs = {
        name="embedding_agg",
        directory=sglang_dir,
        script_name="agg_embed.sh",
-        marks=[pytest.mark.gpu_1],
+        marks=[pytest.mark.gpu_1, pytest.mark.nightly],
        model="Qwen/Qwen3-Embedding-4B",
        delayed_start=0,
        timeout=180,
@@ -207,6 +207,10 @@ def test_sglang_deployment(
    run_serve_deployment(config, request)
+@pytest.mark.e2e
+@pytest.mark.sglang
+@pytest.mark.gpu_1
+@pytest.mark.nightly
 @pytest.mark.skip(
    reason="Requires 4 GPUs - enable when hardware is consistently available"
 )

--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -40,7 +40,7 @@ trtllm_configs = {
        name="aggregated",
        directory=trtllm_dir,
        script_name="agg_metrics.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.trtllm_marker],
+        marks=[pytest.mark.gpu_1, pytest.mark.trtllm, pytest.mark.pre_merge],
        model="Qwen/Qwen3-0.6B",
        models_port=8000,
        request_payloads=[
@@ -53,7 +53,7 @@ trtllm_configs = {
        name="disaggregated",
        directory=trtllm_dir,
        script_name="disagg.sh",
-        marks=[pytest.mark.gpu_2, pytest.mark.trtllm_marker],
+        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.post_merge],
        model="Qwen/Qwen3-0.6B",
        models_port=8000,
        request_payloads=[
@@ -65,7 +65,7 @@ trtllm_configs = {
        name="disaggregated_same_gpu",
        directory=trtllm_dir,
        script_name="disagg_same_gpu.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.trtllm_marker],
+        marks=[pytest.mark.gpu_1, pytest.mark.trtllm],
        model="Qwen/Qwen3-0.6B",
        models_port=8000,
        request_payloads=[
@@ -79,7 +79,7 @@ trtllm_configs = {
        name="aggregated_router",
        directory=trtllm_dir,
        script_name="agg_router.sh",
-        marks=[pytest.mark.gpu_1, pytest.mark.trtllm_marker],
+        marks=[pytest.mark.gpu_1, pytest.mark.trtllm, pytest.mark.post_merge],
        model="Qwen/Qwen3-0.6B",
        models_port=8000,
        request_payloads=[
@@ -98,7 +98,7 @@ trtllm_configs = {
        name="disaggregated_router",
        directory=trtllm_dir,
        script_name="disagg_router.sh",
-        marks=[pytest.mark.gpu_2, pytest.mark.trtllm_marker],
+        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.nightly],
        model="Qwen/Qwen3-0.6B",
        models_port=8000,
        request_payloads=[
@@ -110,7 +110,7 @@ trtllm_configs = {
        name="disaggregated_multimodal",
        directory=trtllm_dir,
        script_name="disagg_multimodal.sh",
-        marks=[pytest.mark.gpu_2, pytest.mark.trtllm_marker, pytest.mark.multimodal],
+        marks=[pytest.mark.gpu_2, pytest.mark.trtllm, pytest.mark.multimodal],
        model="Qwen/Qwen2-VL-7B-Instruct",
        models_port=8000,
        timeout=900,
@@ -123,10 +123,10 @@ trtllm_configs = {
 @pytest.fixture(params=params_with_model_mark(trtllm_configs))
 def trtllm_config_test(request):
    """Fixture that provides different trtllm test configurations"""
-    return trtllm_configs[request.param]
+    return request.param
-@pytest.mark.trtllm_marker
+@pytest.mark.trtllm
 @pytest.mark.e2e
 def test_deployment(trtllm_config_test, request, runtime_services, predownload_models):
    """
@@ -140,7 +140,8 @@ def test_deployment(trtllm_config_test, request, runtime_services, predownload_m
 # TODO make this a normal guy
 @pytest.mark.e2e
 @pytest.mark.gpu_1
-@pytest.mark.trtllm_marker
+@pytest.mark.pre_merge
+@pytest.mark.trtllm
 def test_chat_only_aggregated_with_test_logits_processor(
    request, runtime_services, predownload_models, monkeypatch
 ):

--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -43,7 +43,7 @@ vllm_configs = {
        name="aggregated",
        directory=vllm_dir,
        script_name="agg.sh",
-        marks=[pytest.mark.gpu_1],
+        marks=[pytest.mark.gpu_1, pytest.mark.pre_merge],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
@@ -92,7 +92,7 @@ vllm_configs = {
        name="agg-router",
        directory=vllm_dir,
        script_name="agg_router.sh",
-        marks=[pytest.mark.gpu_2],
+        marks=[pytest.mark.gpu_2, pytest.mark.post_merge],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(
@@ -111,7 +111,7 @@ vllm_configs = {
        name="disaggregated",
        directory=vllm_dir,
        script_name="disagg.sh",
-        marks=[pytest.mark.gpu_2],
+        marks=[pytest.mark.gpu_2, pytest.mark.post_merge],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
            chat_payload_default(),
@@ -126,6 +126,7 @@ vllm_configs = {
            pytest.mark.gpu_2,
            pytest.mark.vllm,
            pytest.mark.h100,
+            pytest.mark.nightly,
        ],
        model="deepseek-ai/DeepSeek-V2-Lite",
        script_args=[
@@ -148,7 +149,7 @@ vllm_configs = {
        name="multimodal_agg_llava_epd",
        directory=vllm_dir,
        script_name="agg_multimodal_epd.sh",
-        marks=[pytest.mark.gpu_2],
+        marks=[pytest.mark.gpu_2, pytest.mark.nightly],
        model="llava-hf/llava-1.5-7b-hf",
        script_args=["--model", "llava-hf/llava-1.5-7b-hf"],
        request_payloads=[
@@ -174,7 +175,7 @@ vllm_configs = {
        name="multimodal_agg_qwen_epd",
        directory=vllm_dir,
        script_name="agg_multimodal_epd.sh",
-        marks=[pytest.mark.gpu_2],
+        marks=[pytest.mark.gpu_2, pytest.mark.nightly],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        delayed_start=0,
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
@@ -201,7 +202,7 @@ vllm_configs = {
        name="multimodal_agg_qwen",
        directory=vllm_dir,
        script_name="agg_multimodal.sh",
-        marks=[pytest.mark.gpu_2],
+        marks=[pytest.mark.gpu_2, pytest.mark.nightly],
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        script_args=["--model", "Qwen/Qwen2.5-VL-7B-Instruct"],
        delayed_start=0,
@@ -265,7 +266,7 @@ vllm_configs = {
        name="multimodal_video_agg",
        directory=os.path.join(WORKSPACE_DIR, "examples/multimodal"),
        script_name="video_agg.sh",
-        marks=[pytest.mark.gpu_2],
+        marks=[pytest.mark.gpu_2, pytest.mark.nightly],
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        delayed_start=0,
        script_args=["--model", "llava-hf/LLaVA-NeXT-Video-7B-hf"],
@@ -336,6 +337,8 @@ def vllm_config_test(request):
 @pytest.mark.vllm
 @pytest.mark.e2e
+@pytest.mark.gpu_1
+@pytest.mark.nightly
 def test_serve_deployment(
    vllm_config_test, request, runtime_services, predownload_models, image_server
 ):