feat(mocker): add multi-worker replay and router startup fixes (#7553)

Signed-off-by: PeaBrane <yanrpei@gmail.com>

feat(mocker): add multi-worker replay and router startup fixes (#7553)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
b7fe46b1 · Yan Ru Pei · GitHub · 82794761 · b7fe46b1 · b7fe46b1
Unverified Commit b7fe46b1 authored Mar 23, 2026 by Yan Ru Pei Committed by GitHub Mar 23, 2026
Showing with 114 additions and 513 deletions

tests/router/test_router_e2e_with_trtllm.py tests/router/test_router_e2e_with_trtllm.py +58 -261

tests/router/test_router_e2e_with_vllm.py tests/router/test_router_e2e_with_vllm.py +56 -252

No files found.
--- a/tests/router/test_router_e2e_with_trtllm.py
+++ b/tests/router/test_router_e2e_with_trtllm.py
@@ -7,21 +7,20 @@
 # so we set explicit pytest timeouts to fail fast on hangs (see per-test markers below).
 import logging
 import os
-import time
 from typing import Any, Dict, Optional

 import pytest

-from tests.router.common import (
-    _test_router_basic,
-    _test_router_decisions,
-    _test_router_indexers_sync,
+from tests.router.e2e_harness import (
+    ManagedEngineProcessMixin,
+    run_basic_router_test,
+    run_indexers_sync_test,
+    run_router_decisions_test,
 )
-from tests.router.helper import generate_random_suffix, get_runtime
+from tests.router.helper import generate_random_suffix
 from tests.utils.constants import DefaultPort
 from tests.utils.managed_process import ManagedProcess
 from tests.utils.port_utils import allocate_ports, deallocate_ports
-from tests.utils.test_output import resolve_test_output_path

 logger = logging.getLogger(__name__)

@@ -34,28 +33,6 @@ pytestmark = [
    pytest.mark.trtllm,
    pytest.mark.model(MODEL_NAME),
 ]
-NUM_REQUESTS = 10
-
-
-def allocate_frontend_ports(request, count: int) -> list[int]:
-    """Allocate random free frontend ports for xdist-safe execution."""
-    ports = allocate_ports(count, DefaultPort.FRONTEND.value)
-    request.addfinalizer(lambda: deallocate_ports(ports))
-    return ports
-
-
-# Shared test payload for all tests
-TEST_PAYLOAD: Dict[str, Any] = {
-    "model": MODEL_NAME,
-    "messages": [
-        {
-            "role": "user",
-            "content": "In a quiet meadow tucked between rolling hills, a plump gray rabbit nibbled on clover beneath the shade of a gnarled oak tree. Its ears twitched at the faint rustle of leaves, but it remained calm, confident in the safety of its burrow just a few hops away. The late afternoon sun warmed its fur, and tiny dust motes danced in the golden light as bees hummed lazily nearby. Though the rabbit lived a simple life, every day was an adventure of scents, shadows, and snacks—an endless search for the tastiest patch of greens and the softest spot to nap.",
-        }
-    ],
-    "stream": True,
-    "max_tokens": 10,
-}

 # Shared TRT-LLM configuration for all tests
 # free_gpu_memory_fraction limits actual VRAM allocation (required for multi-worker on same GPU)
@@ -67,7 +44,7 @@ TRTLLM_ARGS: Dict[str, Any] = {
 }


-class TRTLLMProcess:
+class TRTLLMProcess(ManagedEngineProcessMixin):
    """Manages TRT-LLM workers using dynamo.trtllm (HTTP API + KV events).

    This is a drop-in replacement for MockerProcess that uses real TRT-LLM workers.
@@ -223,97 +200,8 @@ class TRTLLMProcess:
                f"with endpoint: {self.endpoint}"
            )

-    def __enter__(self):
-        """Start all TRT-LLM worker processes with sequential initialization.
-
-        Workers are started sequentially with a delay between each to avoid
-        resource contention during initialization. This prevents
-        MPI initialization conflicts when multiple workers
-        try to initialize simultaneously on the same GPU.
-        """
-        logger.info(
-            f"[TRTLLMProcess] Starting {len(self.worker_processes)} worker processes sequentially..."
-        )
-
-        # Start each process sequentially, waiting for initialization before next
-        for i, process in enumerate(self.worker_processes):
-            logger.info(f"[TRTLLMProcess] Starting TRT-LLM worker {i}...")
-            try:
-                # Manually initialize the process without blocking on health checks
-                process._logger = logging.getLogger(process.__class__.__name__)
-                process._command_name = process.command[0]
-                process.log_dir = resolve_test_output_path(process.log_dir)
-                os.makedirs(process.log_dir, exist_ok=True)
-                log_name = f"{process._command_name}.log.txt"
-                process._log_path = os.path.join(process.log_dir, log_name)
-
-                if process.data_dir:
-                    process._remove_directory(process.data_dir)
-
-                process._terminate_all_matching_process_names()
-                logger.info(
-                    f"[TRTLLMProcess] Launching process {i} (pid will be assigned)..."
-                )
-                process._start_process()  # Start the process but don't wait
-                logger.info(
-                    f"[TRTLLMProcess] Worker {i} launched with PID: {process.proc.pid if process.proc else 'unknown'}"
-                )
-                time.sleep(process.delayed_start)
-
-                # Wait for initialization before starting next worker
-                # This prevents MPI initialization conflicts
-                if i < len(self.worker_processes) - 1:
-                    init_delay = 5  # seconds
-                    logger.info(
-                        f"[TRTLLMProcess] Waiting {init_delay}s for worker {i} to initialize before starting next worker..."
-                    )
-                    time.sleep(init_delay)
-
-            except Exception:
-                logger.exception(f"[TRTLLMProcess] Failed to start worker {i}")
-                # Clean up on failure
-                try:
-                    process.__exit__(None, None, None)
-                except Exception as cleanup_err:
-                    logger.warning(
-                        f"[TRTLLMProcess] Error during cleanup: {cleanup_err}"
-                    )
-                raise
-
-        logger.info(
-            f"[TRTLLMProcess] All {len(self.worker_processes)} workers launched with sequential initialization."
-        )
-        logger.info("[TRTLLMProcess] Waiting for health checks to complete...")
-
-        # Now wait for health checks for all processes
-        for i, process in enumerate(self.worker_processes):
-            logger.info(f"[TRTLLMProcess] Checking health for worker {i}...")
-            try:
-                elapsed = process._check_ports(process.timeout)
-                process._check_urls(process.timeout - elapsed)
-                process._check_funcs(process.timeout - elapsed)
-                logger.info(f"[TRTLLMProcess] Worker {i} health checks passed")
-            except Exception:
-                logger.error(f"[TRTLLMProcess] Worker {i} health check failed")
-                # Clean up all processes on failure
-                self.__exit__(None, None, None)
-                raise
-
-        logger.info(
-            "[TRTLLMProcess] All workers started successfully and passed health checks!"
-        )
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        """Stop all TRT-LLM worker processes gracefully."""
-        for i, process in enumerate(self.worker_processes):
-            logger.info(f"Stopping TRT-LLM worker {i}")
-            process.__exit__(exc_type, exc_val, exc_tb)
-
-        # Add delay to ensure full cleanup of NATS/ETCD/MPI resources
-        # This prevents test isolation issues when running multiple tests
-        logger.info("Waiting for TRT-LLM worker resources to fully clean up...")
-        time.sleep(2)
+    process_name = "TRT-LLM worker"
+    cleanup_name = "TRT-LLM worker resources"


 @pytest.mark.gpu_1
@@ -327,40 +215,16 @@ def test_trtllm_kv_router_basic(
    set_ucx_tls_no_mm,
    request_plane,
 ):
-    """
-    Quick e2e sanity test for KV router with TRT-LLM engine instances.
-    Tests both NATS and TCP request planes.
-    """
-
-    # runtime_services starts etcd and nats
-    N_TRTLLM_WORKERS = 2
-    logger.info(
-        f"Starting TRT-LLM KV router test with {N_TRTLLM_WORKERS} workers using request_plane={request_plane}"
-    )
-
-    with TRTLLMProcess(
-        request,
-        trtllm_args=TRTLLM_ARGS,
-        num_workers=N_TRTLLM_WORKERS,
-        single_gpu=True,  # fit workers into one GPU
-        request_plane=request_plane,
-    ) as trtllm_workers:
-        # Start TRT-LLM workers
-        logger.info(f"Starting {N_TRTLLM_WORKERS} TRT-LLM workers")
-        logger.info(f"All TRT-LLM workers using namespace: {trtllm_workers.namespace}")
-
-        # Run basic router test (starts router internally and waits for workers to be ready)
-        frontend_port = allocate_frontend_ports(request, 1)[0]
-        _test_router_basic(
-            engine_workers=trtllm_workers,
-            block_size=TRTLLM_BLOCK_SIZE,
+    run_basic_router_test(
+        engine_process_cls=TRTLLMProcess,
+        engine_args_name="trtllm_args",
+        engine_args=TRTLLM_ARGS,
+        num_workers=2,
+        single_gpu=True,
        request=request,
-            frontend_port=frontend_port,
-            test_payload=TEST_PAYLOAD,
-            num_requests=NUM_REQUESTS,
-            frontend_timeout=180,  # 3 minutes should be plenty for TinyLlama
-            store_backend="etcd",  # Explicit for clarity
        request_plane=request_plane,
+        block_size=TRTLLM_BLOCK_SIZE,
+        model_name=MODEL_NAME,
    )


@@ -382,40 +246,22 @@ def test_router_decisions_trtllm_attention_dp(
        * The (worker_id, dp_rank) with events should have exactly 4 events (one per request)
        * All events should be on the forced (worker_id, dp_rank=1) (verifying forced routing and prefix reuse)
    """
-    N_TRTLLM_WORKERS = 1
-    N_ATTENTION_DP_RANKS = 2
-
-    # Create trtllm_args with attention DP enabled
-    TRTLLM_ADP_ARGS = {
+    run_router_decisions_test(
+        engine_process_cls=TRTLLMProcess,
+        engine_args_name="trtllm_args",
+        engine_args={
            **TRTLLM_ARGS,
            "enable_attention_dp": True,
-        "tensor_parallel_size": N_ATTENTION_DP_RANKS,
-    }
-
-    with TRTLLMProcess(
-        request,
-        trtllm_args=TRTLLM_ADP_ARGS,
-        num_workers=N_TRTLLM_WORKERS,
-        single_gpu=False,
+            "tensor_parallel_size": 2,
+        },
+        request=request,
        request_plane=request_plane,
-    ) as trtllm_workers:
-        logger.info(
-            f"Starting 1 TRT-LLM worker with attention DP enabled (attention_dp_size={N_ATTENTION_DP_RANKS})"
-        )
-        logger.info(f"All TRT-LLM workers using namespace: {trtllm_workers.namespace}")
-
-        # Get runtime and create endpoint
-        runtime = get_runtime(request_plane=request_plane)
-        # Use the namespace from the TRT-LLM workers
-        endpoint = runtime.endpoint(f"{trtllm_workers.namespace}.tensorrt_llm.generate")
-
-        _test_router_decisions(
-            trtllm_workers,
-            endpoint,
-            MODEL_NAME,
-            request,
-            test_dp_rank=True,
+        model_name=MODEL_NAME,
        block_size=TRTLLM_BLOCK_SIZE,
+        component_name="tensorrt_llm",
+        num_workers=1,
+        single_gpu=False,
+        test_dp_rank=True,
    )


@@ -430,33 +276,18 @@ def test_router_decisions_trtllm_multiple_workers(
    set_ucx_tls_no_mm,
    request_plane,
 ):
-    # runtime_services starts etcd and nats
-    logger.info("Starting TRT-LLM router prefix reuse test with two workers")
-    N_WORKERS = 2
-
-    with TRTLLMProcess(
-        request,
-        trtllm_args=TRTLLM_ARGS,
-        num_workers=N_WORKERS,
-        single_gpu=True,  # Worker uses GPU 0
+    run_router_decisions_test(
+        engine_process_cls=TRTLLMProcess,
+        engine_args_name="trtllm_args",
+        engine_args=TRTLLM_ARGS,
+        request=request,
        request_plane=request_plane,
-    ) as trtllm_workers:
-        # Start 2 worker processes on the same GPU
-        logger.info(
-            "Starting 2 TRT-LLM worker processes on single GPU (gpu_mem_frac=0.4)"
-        )
-        logger.info(f"All TRT-LLM workers using namespace: {trtllm_workers.namespace}")
-
-        runtime = get_runtime(request_plane=request_plane)
-        endpoint = runtime.endpoint(f"{trtllm_workers.namespace}.tensorrt_llm.generate")
-
-        _test_router_decisions(
-            trtllm_workers,
-            endpoint,
-            MODEL_NAME,
-            request,
-            test_dp_rank=False,
+        model_name=MODEL_NAME,
        block_size=TRTLLM_BLOCK_SIZE,
+        component_name="tensorrt_llm",
+        num_workers=2,
+        single_gpu=True,
+        test_dp_rank=False,
    )


@@ -481,50 +312,16 @@ def test_trtllm_indexers_sync(
    durable_kv_events,
    request_plane,
 ):
-    """
-    Test that two KV routers have synchronized indexer states after processing requests
-    with TRT-LLM workers. This test verifies that both routers converge to the same internal state.
-
-    Tests with configuration:
-    - nats_core: etcd backend, local indexer with NATS Core, TCP request plane
-                 (includes NATS interruption/recovery testing)
-    """
-    # runtime_services_dynamic_ports handles NATS and etcd startup
-    nats_process, _etcd_process = runtime_services_dynamic_ports
-
-    logger.info(
-        f"Starting TRT-LLM indexers sync test: store_backend={store_backend}, "
-        f"durable_kv_events={durable_kv_events}, request_plane={request_plane}"
-    )
-
-    N_TRTLLM_WORKERS = 2
-
-    with TRTLLMProcess(
-        request,
-        trtllm_args=TRTLLM_ARGS,
-        num_workers=N_TRTLLM_WORKERS,
-        single_gpu=True,  # fit workers into one GPU
-        request_plane=request_plane,
+    run_indexers_sync_test(
+        engine_process_cls=TRTLLMProcess,
+        engine_args_name="trtllm_args",
+        engine_args=TRTLLM_ARGS,
+        request=request,
+        runtime_services_dynamic_ports=runtime_services_dynamic_ports,
        store_backend=store_backend,
        durable_kv_events=durable_kv_events,
-    ) as trtllm_workers:
-        # Start TRT-LLM workers
-        logger.info(f"Starting {N_TRTLLM_WORKERS} TRT-LLM workers")
-        logger.info(f"All TRT-LLM workers using namespace: {trtllm_workers.namespace}")
-
-        # Use the common test implementation (creates its own runtimes for each router)
-        # Note: Consumer verification is done inside _test_router_indexers_sync while routers are alive
-        # When using durable_kv_events=True, use JetStream mode for the router
-        _test_router_indexers_sync(
-            engine_workers=trtllm_workers,
+        request_plane=request_plane,
        block_size=TRTLLM_BLOCK_SIZE,
        model_name=MODEL_NAME,
-            num_workers=N_TRTLLM_WORKERS,
-            store_backend=store_backend,
-            request_plane=request_plane,
-            test_nats_interruption=not durable_kv_events,
-            nats_server=nats_process if not durable_kv_events else None,
-            durable_kv_events=durable_kv_events,
+        num_workers=2,
    )
-
-        logger.info("TRT-LLM indexers sync test completed successfully")
--- a/tests/router/test_router_e2e_with_vllm.py
+++ b/tests/router/test_router_e2e_with_vllm.py
@@ -8,21 +8,20 @@
 import json
 import logging
 import os
-import time
 from typing import Any, Dict, Optional

 import pytest

-from tests.router.common import (
-    _test_router_basic,
-    _test_router_decisions,
-    _test_router_indexers_sync,
+from tests.router.e2e_harness import (
+    ManagedEngineProcessMixin,
+    run_basic_router_test,
+    run_indexers_sync_test,
+    run_router_decisions_test,
 )
-from tests.router.helper import generate_random_suffix, get_runtime
+from tests.router.helper import generate_random_suffix
 from tests.utils.constants import DefaultPort
 from tests.utils.managed_process import ManagedProcess
 from tests.utils.port_utils import allocate_ports, deallocate_ports
-from tests.utils.test_output import resolve_test_output_path

 logger = logging.getLogger(__name__)

@@ -35,30 +34,8 @@ pytestmark = [
    pytest.mark.model(MODEL_NAME),
 ]
 SPEEDUP_RATIO = 10.0
-NUM_REQUESTS = 10
 BLOCK_SIZE = 16

-
-def allocate_frontend_ports(request, count: int) -> list[int]:
-    """Allocate random free frontend ports for xdist-safe execution."""
-    ports = allocate_ports(count, DefaultPort.FRONTEND.value)
-    request.addfinalizer(lambda: deallocate_ports(ports))
-    return ports
-
-
-# Shared test payload for all tests
-TEST_PAYLOAD: Dict[str, Any] = {
-    "model": MODEL_NAME,
-    "messages": [
-        {
-            "role": "user",
-            "content": "In a quiet meadow tucked between rolling hills, a plump gray rabbit nibbled on clover beneath the shade of a gnarled oak tree. Its ears twitched at the faint rustle of leaves, but it remained calm, confident in the safety of its burrow just a few hops away. The late afternoon sun warmed its fur, and tiny dust motes danced in the golden light as bees hummed lazily nearby. Though the rabbit lived a simple life, every day was an adventure of scents, shadows, and snacks—an endless search for the tastiest patch of greens and the softest spot to nap.",
-        }
-    ],
-    "stream": True,
-    "max_tokens": 10,
-}
-
 # Shared vLLM configuration for all tests
 # gpu_memory_utilization limits actual VRAM allocation (required for multi-worker on same GPU)
 VLLM_ARGS: Dict[str, Any] = {
@@ -70,7 +47,7 @@ VLLM_ARGS: Dict[str, Any] = {
 }


-class VLLMProcess:
+class VLLMProcess(ManagedEngineProcessMixin):
    """Manages vLLM workers using dynamo.vllm (HTTP API + KV events).

    This is a drop-in replacement for MockerProcess that uses real vLLM workers.
@@ -271,95 +248,9 @@ class VLLMProcess:
                    f"with endpoint: {self.endpoint}"
                )

-    def __enter__(self):
-        """Start all vLLM worker processes with sequential initialization.
-
-        Workers are started sequentially with a delay between each to avoid
-        NIXL/UCX resource contention during initialization. This prevents
-        UCX shared memory handle allocation failures when multiple workers
-        try to initialize simultaneously on the same GPU.
-        """
-        logger.info(
-            f"[VLLMProcess] Starting {len(self.worker_processes)} worker processes sequentially..."
-        )
-
-        # Start each process sequentially, waiting for NIXL initialization before next
-        for i, process in enumerate(self.worker_processes):
-            logger.info(f"[VLLMProcess] Starting vLLM worker {i}...")
-            try:
-                # Manually initialize the process without blocking on health checks
-                process._logger = logging.getLogger(process.__class__.__name__)
-                process._command_name = process.command[0]
-                process.log_dir = resolve_test_output_path(process.log_dir)
-                os.makedirs(process.log_dir, exist_ok=True)
-                log_name = f"{process._command_name}.log.txt"
-                process._log_path = os.path.join(process.log_dir, log_name)
-
-                if process.data_dir:
-                    process._remove_directory(process.data_dir)
-
-                process._terminate_all_matching_process_names()
-                logger.info(
-                    f"[VLLMProcess] Launching process {i} (pid will be assigned)..."
-                )
-                process._start_process()  # Start the process but don't wait
-                logger.info(
-                    f"[VLLMProcess] Worker {i} launched with PID: {process.proc.pid if process.proc else 'unknown'}"
-                )
-                time.sleep(process.delayed_start)
-
-                # Wait for NIXL initialization before starting next worker
-                # This prevents UCX shared memory contention
-                if i < len(self.worker_processes) - 1:
-                    nixl_init_delay = 5  # seconds
-                    logger.info(
-                        f"[VLLMProcess] Waiting {nixl_init_delay}s for worker {i} to initialize NIXL before starting next worker..."
-                    )
-                    time.sleep(nixl_init_delay)
-
-            except Exception:
-                logger.exception(f"[VLLMProcess] Failed to start worker {i}")
-                # Clean up on failure
-                try:
-                    process.__exit__(None, None, None)
-                except Exception as cleanup_err:
-                    logger.warning(f"[VLLMProcess] Error during cleanup: {cleanup_err}")
-                raise
-
-        logger.info(
-            f"[VLLMProcess] All {len(self.worker_processes)} workers launched with sequential initialization."
-        )
-        logger.info("[VLLMProcess] Waiting for health checks to complete...")
-
-        # Now wait for health checks for all processes
-        for i, process in enumerate(self.worker_processes):
-            logger.info(f"[VLLMProcess] Checking health for worker {i}...")
-            try:
-                elapsed = process._check_ports(process.timeout)
-                process._check_urls(process.timeout - elapsed)
-                process._check_funcs(process.timeout - elapsed)
-                logger.info(f"[VLLMProcess] Worker {i} health checks passed")
-            except Exception:
-                logger.error(f"[VLLMProcess] Worker {i} health check failed")
-                # Clean up all processes on failure
-                self.__exit__(None, None, None)
-                raise
-
-        logger.info(
-            "[VLLMProcess] All workers started successfully and passed health checks!"
-        )
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        """Stop all vLLM worker processes gracefully."""
-        for i, process in enumerate(self.worker_processes):
-            logger.info(f"Stopping vLLM worker {i}")
-            process.__exit__(exc_type, exc_val, exc_tb)
-
-        # Add delay to ensure full cleanup of NATS/ETCD/ZMQ resources
-        # This prevents test isolation issues when running multiple tests
-        logger.info("Waiting for vLLM worker resources to fully clean up...")
-        time.sleep(2)
+    process_name = "vLLM worker"
+    cleanup_name = "vLLM worker resources"
+    init_delay_reason = "initialize NIXL before starting next worker"


 @pytest.mark.pre_merge
@@ -373,40 +264,16 @@ def test_vllm_kv_router_basic(
    set_ucx_tls_no_mm,
    request_plane,
 ):
-    """
-    Quick e2e sanity test for KV router with vLLM engine instances.
-    Tests both NATS and TCP request planes.
-    """
-
-    # runtime_services starts etcd and nats
-    N_VLLM_WORKERS = 2
-    logger.info(
-        f"Starting vLLM KV router test with {N_VLLM_WORKERS} workers using request_plane={request_plane}"
-    )
-
-    with VLLMProcess(
-        request,
-        vllm_args=VLLM_ARGS,
-        num_workers=N_VLLM_WORKERS,
-        single_gpu=True,  # fit workers into one GPU
-        request_plane=request_plane,
-    ) as vllm_workers:
-        # Start vLLM workers
-        logger.info(f"Starting {N_VLLM_WORKERS} vLLM workers")
-        logger.info(f"All vLLM workers using namespace: {vllm_workers.namespace}")
-
-        # Run basic router test (starts router internally and waits for workers to be ready)
-        frontend_port = allocate_frontend_ports(request, 1)[0]
-        _test_router_basic(
-            engine_workers=vllm_workers,
-            block_size=BLOCK_SIZE,
+    run_basic_router_test(
+        engine_process_cls=VLLMProcess,
+        engine_args_name="vllm_args",
+        engine_args=VLLM_ARGS,
+        num_workers=2,
+        single_gpu=True,
        request=request,
-            frontend_port=frontend_port,
-            test_payload=TEST_PAYLOAD,
-            num_requests=NUM_REQUESTS,
-            frontend_timeout=180,  # 3 minutes should be plenty for TinyLlama
-            store_backend="etcd",  # Explicit for clarity
        request_plane=request_plane,
+        block_size=BLOCK_SIZE,
+        model_name=MODEL_NAME,
    )


@@ -421,32 +288,18 @@ def test_router_decisions_vllm_multiple_workers(
    set_ucx_tls_no_mm,
    request_plane,
 ):
-    # runtime_services starts etcd and nats
-    logger.info("Starting vLLM router prefix reuse test with two workers")
-    N_WORKERS = 2
-
-    with VLLMProcess(
-        request,
-        vllm_args=VLLM_ARGS,
-        num_workers=N_WORKERS,
-        single_gpu=True,  # Worker uses GPU 0
+    run_router_decisions_test(
+        engine_process_cls=VLLMProcess,
+        engine_args_name="vllm_args",
+        engine_args=VLLM_ARGS,
+        request=request,
        request_plane=request_plane,
-    ) as vllm_workers:
-        # Start 2 worker processes on the same GPU
-        logger.info("Starting 2 vLLM worker processes on single GPU (gpu_mem=0.4)")
-        logger.info(f"All vLLM workers using namespace: {vllm_workers.namespace}")
-
-        # Get runtime and create endpoint
-        runtime = get_runtime(request_plane=request_plane)
-        endpoint = runtime.endpoint(f"{vllm_workers.namespace}.backend.generate")
-
-        _test_router_decisions(
-            vllm_workers,
-            endpoint,
-            MODEL_NAME,
-            request,
-            test_dp_rank=False,
+        model_name=MODEL_NAME,
        block_size=BLOCK_SIZE,
+        component_name="backend",
+        num_workers=2,
+        single_gpu=True,
+        test_dp_rank=False,
    )


@@ -468,34 +321,19 @@ def test_router_decisions_vllm_dp(
        * The (worker_id, dp_rank) with events should have exactly 4 events (one per request)
        * All events should be on the forced (worker_id, dp_rank=1) (verifying forced routing and prefix reuse)
    """
-    N_WORKERS = 1
-    DP_SIZE = 2
-
-    with VLLMProcess(
-        request,
-        vllm_args=VLLM_ARGS,
-        num_workers=N_WORKERS,  # Ignored when data_parallel_size is set
-        single_gpu=False,
-        data_parallel_size=DP_SIZE,  # Creates DP_SIZE processes (one per rank)
+    run_router_decisions_test(
+        engine_process_cls=VLLMProcess,
+        engine_args_name="vllm_args",
+        engine_args=VLLM_ARGS,
+        request=request,
        request_plane=request_plane,
-    ) as vllm_workers:
-        logger.info("Starting 2 vLLM DP ranks (dp_size=2) (gpu_mem=0.4)")
-        logger.info(f"All vLLM workers using namespace: {vllm_workers.namespace}")
-
-        # Get runtime and create endpoint
-        runtime = get_runtime(request_plane=request_plane)
-        # Use the namespace from the vLLM workers
-        endpoint = runtime.endpoint(
-            f"{vllm_workers.namespace}.backend.generate"
-        )  # endpoint is backend.generate
-
-        _test_router_decisions(
-            vllm_workers,
-            endpoint,
-            MODEL_NAME,
-            request,
-            test_dp_rank=True,
+        model_name=MODEL_NAME,
        block_size=BLOCK_SIZE,
+        component_name="backend",
+        num_workers=1,
+        single_gpu=False,
+        test_dp_rank=True,
+        extra_process_kwargs={"data_parallel_size": 2},
    )


@@ -520,50 +358,16 @@ def test_vllm_indexers_sync(
    durable_kv_events,
    request_plane,
 ):
-    """
-    Test that two KV routers have synchronized indexer states after processing requests
-    with vLLM workers. This test verifies that both routers converge to the same internal state.
-
-    Tests with configuration:
-    - nats_core: etcd backend, local indexer with NATS Core, TCP request plane
-                 (includes NATS interruption/recovery testing)
-    """
-    # runtime_services_dynamic_ports handles NATS and etcd startup
-    nats_process, _etcd_process = runtime_services_dynamic_ports
-
-    logger.info(
-        f"Starting vLLM indexers sync test: store_backend={store_backend}, "
-        f"durable_kv_events={durable_kv_events}, request_plane={request_plane}"
-    )
-
-    N_VLLM_WORKERS = 2
-
-    with VLLMProcess(
-        request,
-        vllm_args=VLLM_ARGS,
-        num_workers=N_VLLM_WORKERS,
-        single_gpu=True,  # fit workers into one GPU
-        request_plane=request_plane,
+    run_indexers_sync_test(
+        engine_process_cls=VLLMProcess,
+        engine_args_name="vllm_args",
+        engine_args=VLLM_ARGS,
+        request=request,
+        runtime_services_dynamic_ports=runtime_services_dynamic_ports,
        store_backend=store_backend,
        durable_kv_events=durable_kv_events,
-    ) as vllm_workers:
-        # Start vLLM workers
-        logger.info(f"Starting {N_VLLM_WORKERS} vLLM workers")
-        logger.info(f"All vLLM workers using namespace: {vllm_workers.namespace}")
-
-        # Use the common test implementation (creates its own runtimes for each router)
-        # Note: Consumer verification is done inside _test_router_indexers_sync while routers are alive
-        # When using durable_kv_events=True, use JetStream mode for the router
-        _test_router_indexers_sync(
-            engine_workers=vllm_workers,
+        request_plane=request_plane,
        block_size=BLOCK_SIZE,
        model_name=MODEL_NAME,
-            num_workers=N_VLLM_WORKERS,
-            store_backend=store_backend,
-            request_plane=request_plane,
-            test_nats_interruption=not durable_kv_events,
-            nats_server=nats_process if not durable_kv_events else None,
-            durable_kv_events=durable_kv_events,
+        num_workers=2,
    )
-
-        logger.info("vLLM indexers sync test completed successfully")