feat: efficient serving of multiple mockers (#3997)

Signed-off-by: PeaBrane <yanrpei@gmail.com>

feat: efficient serving of multiple mockers (#3997)
Signed-off-by: PeaBrane <yanrpei@gmail.com>
6ff49edb · Yan Ru Pei · GitHub · 17139df9 · 6ff49edb · 6ff49edb
Unverified commit 6ff49edb, authored Oct 31, 2025 by Yan Ru Pei; committed by GitHub Nov 01, 2025
6 changed files
--- a/benchmarks/router/run_engines.sh
+++ b/benchmarks/router/run_engines.sh
@@ -168,7 +168,38 @@ trap cleanup SIGINT SIGTERM

 echo "Starting $NUM_WORKERS $MODE workers..."

-for i in $(seq 1 $NUM_WORKERS); do
+if [ "$USE_MOCKERS" = true ]; then
+    # For mockers, launch a single process with --num-workers
+    # All workers share the same tokio runtime and thread pool
+    MODE_CAPITALIZED=$(echo "$MODE" | sed 's/\(.\)/\U\1/')
+    echo "[$MODE_CAPITALIZED Mocker] Starting $NUM_WORKERS workers in single process..."
+
+    MOCKER_ARGS=()
+    MOCKER_ARGS+=("--model-path" "$MODEL_PATH")
+    MOCKER_ARGS+=("--num-workers" "$NUM_WORKERS")
+
+    # Set endpoint based on worker mode
+    if [ "$MODE" = "prefill" ]; then
+        MOCKER_ARGS+=("--endpoint" "dyn://test.prefill.generate")
+        MOCKER_ARGS+=("--is-prefill-worker")
+    elif [ "$MODE" = "decode" ]; then
+        MOCKER_ARGS+=("--endpoint" "dyn://test.mocker.generate")
+        MOCKER_ARGS+=("--is-decode-worker")
+    else
+        MOCKER_ARGS+=("--endpoint" "dyn://test.mocker.generate")
+    fi
+
+    if [ "$DATA_PARALLEL_SIZE" -gt 1 ]; then
+        MOCKER_ARGS+=("--data-parallel-size" "$DATA_PARALLEL_SIZE")
+    fi
+    MOCKER_ARGS+=("${EXTRA_ARGS[@]}")
+
+    python -m dynamo.mocker "${MOCKER_ARGS[@]}" &
+    PIDS+=($!)
+    echo "Started mocker with $NUM_WORKERS workers (PID: $!)"
+else
+    # For vLLM and TensorRT-LLM, use the original loop to launch separate processes
+    for i in $(seq 1 $NUM_WORKERS); do
        {
            MODE_CAPITALIZED=$(echo "$MODE" | sed 's/\(.\)/\U\1/')
            echo "[$MODE_CAPITALIZED Worker-$i] Starting..."
@@ -192,29 +223,7 @@ for i in $(seq 1 $NUM_WORKERS); do
                done
            fi

-        if [ "$USE_MOCKERS" = true ]; then
-            # Run mocker engine (no GPU assignment needed)
-            MOCKER_ARGS=()
-            MOCKER_ARGS+=("--model-path" "$MODEL_PATH")
-
-            # Set endpoint based on worker mode
-            if [ "$MODE" = "prefill" ]; then
-                MOCKER_ARGS+=("--endpoint" "dyn://test.prefill.generate")
-                MOCKER_ARGS+=("--is-prefill-worker")
-            elif [ "$MODE" = "decode" ]; then
-                MOCKER_ARGS+=("--endpoint" "dyn://test.mocker.generate")
-                MOCKER_ARGS+=("--is-decode-worker")
-            else
-                MOCKER_ARGS+=("--endpoint" "dyn://test.mocker.generate")
-            fi
-
-            if [ "$DATA_PARALLEL_SIZE" -gt 1 ]; then
-                MOCKER_ARGS+=("--data-parallel-size" "$DATA_PARALLEL_SIZE")
-            fi
-            MOCKER_ARGS+=("${EXTRA_ARGS[@]}")
-
-            exec python -m dynamo.mocker "${MOCKER_ARGS[@]}"
-        elif [ "$USE_TRTLLM" = true ]; then
+            if [ "$USE_TRTLLM" = true ]; then
                echo "[$MODE_CAPITALIZED Worker-$i] Using GPUs: $GPU_DEVICES"
                # Run TensorRT-LLM engine with trtllm-llmapi-launch for proper initialization
                TRTLLM_ARGS=()
@@ -249,7 +258,8 @@ for i in $(seq 1 $NUM_WORKERS); do
        } &
        PIDS+=($!)
        echo "Started $MODE worker $i (PID: $!)"
-done
+    done
+fi

 echo "All workers started. Press Ctrl+C to stop."
 wait

--- a/components/src/dynamo/mocker/README.md
+++ b/components/src/dynamo/mocker/README.md
@@ -24,6 +24,7 @@ The mocker engine now supports a vLLM-style CLI interface with individual argume
 - `--watermark`: KV cache watermark threshold as a fraction (default: 0.01)
 - `--speedup-ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulation engines run faster
 - `--data-parallel-size`: Number of data parallel workers to simulate (default: 1)
+- `--num-workers`: Number of mocker workers to launch in the same process (default: 1). All workers share the same tokio runtime and thread pool

 ### Example with individual arguments (vLLM-style):
 ```bash
@@ -34,6 +35,7 @@ python -m dynamo.mocker \
  --block-size 16 \
  --speedup-ratio 10.0 \
  --max-num-seqs 512 \
+  --num-workers 4 \
  --enable-prefix-caching

 # Start frontend server
@@ -41,4 +43,4 @@ python -m dynamo.frontend --http-port 8000
 ```

 > [!Note]
-> Each mocker instance runs as a single process, and each DP worker (specified by `--data-parallel-size`) is spawned as a lightweight async task within that process. For benchmarking (e.g., router testing), you would much prefer launching one mocker instance with a large `--data-parallel-size` rather than multiple separate mocker instances to reduce overhead.
\ No newline at end of file
+> Each mocker instance runs as a single process, and each DP worker (specified by `--data-parallel-size`) is spawned as a lightweight async task within that process. For benchmarking (e.g., router testing), use `--num-workers` to launch multiple mocker workers in the same process; this is more efficient than launching separate processes because all workers share the same tokio runtime and thread pool.
\ No newline at end of file
--- a/components/src/dynamo/mocker/args.py
+++ b/components/src/dynamo/mocker/args.py
@@ -175,6 +175,13 @@ def parse_args():
        default=None,
        help="Simulated engine startup time in seconds (default: None)",
    )
+    parser.add_argument(
+        "--num-workers",
+        type=int,
+        default=1,
+        help="Number of mocker workers to launch in the same process (default: 1). "
+        "All workers share the same tokio runtime and thread pool.",
+    )

    # Legacy support - allow direct JSON file specification
    parser.add_argument(
@@ -201,6 +208,10 @@ def parse_args():
    args = parser.parse_args()
    validate_worker_type_args(args)

+    # Validate num_workers
+    if args.num_workers < 1:
+        raise ValueError(f"--num-workers must be at least 1, got {args.num_workers}")
+
    # Set endpoint default based on worker type if not explicitly provided
    if args.endpoint is None:
        if args.is_prefill_worker:

--- a/components/src/dynamo/mocker/main.py
+++ b/components/src/dynamo/mocker/main.py
@@ -4,12 +4,16 @@
 # Usage: `python -m dynamo.mocker --model-path /data/models/Qwen3-0.6B`
 # Now supports vLLM-style individual arguments for MockEngineArgs

+import asyncio
 import logging
+import os

 import uvloop

+os.environ.setdefault("DYN_COMPUTE_THREADS", "0")
+
 from dynamo.llm import EngineType, EntrypointArgs, make_engine, run_input
-from dynamo.runtime import DistributedRuntime, dynamo_worker
+from dynamo.runtime import DistributedRuntime
 from dynamo.runtime.logging import configure_dynamo_logging

 from .args import create_temp_engine_args_file, parse_args
@@ -18,8 +22,12 @@ configure_dynamo_logging()
 logger = logging.getLogger(__name__)


-@dynamo_worker(static=False)
-async def worker(runtime: DistributedRuntime):
+async def worker():
+    """Main worker function that launches mocker instances.
+
+    Each mocker gets its own DistributedRuntime instance for true isolation,
+    while still sharing the same event loop and tokio runtime.
+    """
    args = parse_args()

    # Handle extra_engine_args: either use provided file or create from CLI args
@@ -33,7 +41,41 @@ async def worker(runtime: DistributedRuntime):
        logger.info("Created MockEngineArgs from CLI arguments")

    try:
-        # Create engine configuration
+        logger.info(
+            f"Launching {args.num_workers} mocker worker(s) with isolated DistributedRuntime instances"
+        )
+        await launch_workers(args, extra_engine_args_path)
+    finally:
+        # Clean up temporary file if we created one
+        if not args.extra_engine_args and extra_engine_args_path.exists():
+            try:
+                extra_engine_args_path.unlink()
+                logger.debug(f"Cleaned up temporary file {extra_engine_args_path}")
+            except Exception as e:
+                logger.warning(f"Failed to clean up temporary file: {e}")
+
+
+async def launch_workers(args, extra_engine_args_path):
+    """Launch mocker worker(s) with isolated DistributedRuntime instances.
+
+    Each worker gets its own DistributedRuntime, which means:
+    - Separate etcd/NATS connections
+    - Separate Component instances (no shared overhead)
+    - Independent service registration and stats scraping
+    - But still sharing the same tokio runtime (efficient)
+    """
+    loop = asyncio.get_running_loop()
+    futures = []
+    runtimes = []
+
+    for worker_id in range(args.num_workers):
+        logger.info(f"Creating mocker worker {worker_id + 1}/{args.num_workers}")
+
+        # Create a separate DistributedRuntime for this worker (on same event loop)
+        runtime = DistributedRuntime(loop, False)
+        runtimes.append(runtime)
+
+        # Create EntrypointArgs for this worker
        entrypoint_args = EntrypointArgs(
            engine_type=EngineType.Mocker,
            model_path=args.model_path,
@@ -43,18 +85,23 @@ async def worker(runtime: DistributedRuntime):
            is_prefill=args.is_prefill_worker,
        )

-        # Create and run the engine
-        # NOTE: only supports dyn endpoint for now
+        # Create the engine with this worker's isolated runtime
        engine_config = await make_engine(runtime, entrypoint_args)
-        await run_input(runtime, args.endpoint, engine_config)
-    finally:
-        # Clean up temporary file if we created one
-        if not args.extra_engine_args and extra_engine_args_path.exists():
+
+        # run_input returns a Rust Future (not a Python coroutine)
+        future = run_input(runtime, args.endpoint, engine_config)
+        futures.append(future)
+
+    logger.info(f"All {args.num_workers} mocker worker(s) created and running")
+
    try:
-                extra_engine_args_path.unlink()
-                logger.debug(f"Cleaned up temporary file {extra_engine_args_path}")
-            except Exception as e:
-                logger.warning(f"Failed to clean up temporary file: {e}")
+        # Wait for all futures to complete
+        await asyncio.gather(*futures, return_exceptions=True)
+    finally:
+        # Clean up runtimes
+        logger.info("Shutting down DistributedRuntime instances")
+        for runtime in runtimes:
+            runtime.shutdown()


 def main():

--- a/lib/bindings/python/rust/lib.rs
+++ b/lib/bindings/python/rust/lib.rs
@@ -428,17 +428,26 @@ enum ModelInput {
 impl DistributedRuntime {
    #[new]
    fn new(event_loop: PyObject, is_static: bool) -> PyResult<Self> {
-        let worker = rs::Worker::from_settings().map_err(to_pyerr)?;
+        // Try to get existing runtime first, create new Worker only if needed
+        // This allows multiple DistributedRuntime instances to share the same tokio runtime
+        let runtime = rs::Worker::runtime_from_existing()
+            .or_else(|_| {
+                // No existing Worker, create new one
+                let worker = rs::Worker::from_settings()?;
+
+                // Initialize pyo3 bridge (only happens once per process)
                INIT.get_or_try_init(|| {
                    let primary = worker.tokio_runtime()?;
-            pyo3_async_runtimes::tokio::init_with_runtime(primary)
-                .map_err(|e| rs::error!("failed to initialize pyo3 static runtime: {:?}", e))?;
+                    pyo3_async_runtimes::tokio::init_with_runtime(primary).map_err(|e| {
+                        rs::error!("failed to initialize pyo3 static runtime: {:?}", e)
+                    })?;
                    rs::OK(())
+                })?;
+
+                rs::OK(worker.runtime().clone())
            })
            .map_err(to_pyerr)?;

-        let runtime = worker.runtime().clone();
-
        // Initialize logging in context where tokio runtime is available
        // otel exporter requires it
        if std::env::var("OTEL_EXPORT_ENABLED")

--- a/lib/llm/src/kv_router/publisher.rs
+++ b/lib/llm/src/kv_router/publisher.rs
@@ -860,6 +860,7 @@ impl WorkerMetricsPublisher {
                            {
                                tracing::warn!("Failed to publish metrics over NATS: {}", e);
                            }
+                        }

                        // Reset timer to pending state to avoid tight loop
                        // It will be reset to 1ms when metrics actually change
@@ -869,7 +870,6 @@ impl WorkerMetricsPublisher {
                    }
                }
            }
-            }
        });
    }
 }