Unverified commit 6ff49edb, authored by Yan Ru Pei, committed by GitHub
Browse files

feat: efficient serving of multiple mockers (#3997)


Signed-off-by: PeaBrane <yanrpei@gmail.com>
parent 17139df9
......@@ -168,7 +168,38 @@ trap cleanup SIGINT SIGTERM
echo "Starting $NUM_WORKERS $MODE workers..."
for i in $(seq 1 $NUM_WORKERS); do
if [ "$USE_MOCKERS" = true ]; then
# For mockers, launch a single process with --num-workers
# All workers share the same tokio runtime and thread pool
MODE_CAPITALIZED=$(echo "$MODE" | sed 's/\(.\)/\U\1/')
echo "[$MODE_CAPITALIZED Mocker] Starting $NUM_WORKERS workers in single process..."
MOCKER_ARGS=()
MOCKER_ARGS+=("--model-path" "$MODEL_PATH")
MOCKER_ARGS+=("--num-workers" "$NUM_WORKERS")
# Set endpoint based on worker mode
if [ "$MODE" = "prefill" ]; then
MOCKER_ARGS+=("--endpoint" "dyn://test.prefill.generate")
MOCKER_ARGS+=("--is-prefill-worker")
elif [ "$MODE" = "decode" ]; then
MOCKER_ARGS+=("--endpoint" "dyn://test.mocker.generate")
MOCKER_ARGS+=("--is-decode-worker")
else
MOCKER_ARGS+=("--endpoint" "dyn://test.mocker.generate")
fi
if [ "$DATA_PARALLEL_SIZE" -gt 1 ]; then
MOCKER_ARGS+=("--data-parallel-size" "$DATA_PARALLEL_SIZE")
fi
MOCKER_ARGS+=("${EXTRA_ARGS[@]}")
python -m dynamo.mocker "${MOCKER_ARGS[@]}" &
PIDS+=($!)
echo "Started mocker with $NUM_WORKERS workers (PID: $!)"
else
# For vLLM and TensorRT-LLM, use the original loop to launch separate processes
for i in $(seq 1 $NUM_WORKERS); do
{
MODE_CAPITALIZED=$(echo "$MODE" | sed 's/\(.\)/\U\1/')
echo "[$MODE_CAPITALIZED Worker-$i] Starting..."
......@@ -192,29 +223,7 @@ for i in $(seq 1 $NUM_WORKERS); do
done
fi
if [ "$USE_MOCKERS" = true ]; then
# Run mocker engine (no GPU assignment needed)
MOCKER_ARGS=()
MOCKER_ARGS+=("--model-path" "$MODEL_PATH")
# Set endpoint based on worker mode
if [ "$MODE" = "prefill" ]; then
MOCKER_ARGS+=("--endpoint" "dyn://test.prefill.generate")
MOCKER_ARGS+=("--is-prefill-worker")
elif [ "$MODE" = "decode" ]; then
MOCKER_ARGS+=("--endpoint" "dyn://test.mocker.generate")
MOCKER_ARGS+=("--is-decode-worker")
else
MOCKER_ARGS+=("--endpoint" "dyn://test.mocker.generate")
fi
if [ "$DATA_PARALLEL_SIZE" -gt 1 ]; then
MOCKER_ARGS+=("--data-parallel-size" "$DATA_PARALLEL_SIZE")
fi
MOCKER_ARGS+=("${EXTRA_ARGS[@]}")
exec python -m dynamo.mocker "${MOCKER_ARGS[@]}"
elif [ "$USE_TRTLLM" = true ]; then
if [ "$USE_TRTLLM" = true ]; then
echo "[$MODE_CAPITALIZED Worker-$i] Using GPUs: $GPU_DEVICES"
# Run TensorRT-LLM engine with trtllm-llmapi-launch for proper initialization
TRTLLM_ARGS=()
......@@ -249,7 +258,8 @@ for i in $(seq 1 $NUM_WORKERS); do
} &
PIDS+=($!)
echo "Started $MODE worker $i (PID: $!)"
done
done
fi
echo "All workers started. Press Ctrl+C to stop."
wait
......
......@@ -24,6 +24,7 @@ The mocker engine now supports a vLLM-style CLI interface with individual argume
- `--watermark`: KV cache watermark threshold as a fraction (default: 0.01)
- `--speedup-ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulation engines run faster
- `--data-parallel-size`: Number of data parallel workers to simulate (default: 1)
- `--num-workers`: Number of mocker workers to launch in the same process (default: 1). All workers share the same tokio runtime and thread pool
### Example with individual arguments (vLLM-style):
```bash
......@@ -34,6 +35,7 @@ python -m dynamo.mocker \
--block-size 16 \
--speedup-ratio 10.0 \
--max-num-seqs 512 \
--num-workers 4 \
--enable-prefix-caching
# Start frontend server
......@@ -41,4 +43,4 @@ python -m dynamo.frontend --http-port 8000
```
> [!Note]
> Each mocker instance runs as a single process, and each DP worker (specified by `--data-parallel-size`) is spawned as a lightweight async task within that process. For benchmarking (e.g., router testing), you would much prefer launching one mocker instance with a large `--data-parallel-size` rather than multiple separate mocker instances to reduce overhead.
\ No newline at end of file
> Each mocker instance runs as a single process, and each DP worker (specified by `--data-parallel-size`) is spawned as a lightweight async task within that process. For benchmarking (e.g., router testing), you can use `--num-workers` to launch multiple mocker engines in the same process; this is more efficient than launching separate processes because all engines share the same tokio runtime and thread pool.
\ No newline at end of file
......@@ -175,6 +175,13 @@ def parse_args():
default=None,
help="Simulated engine startup time in seconds (default: None)",
)
parser.add_argument(
"--num-workers",
type=int,
default=1,
help="Number of mocker workers to launch in the same process (default: 1). "
"All workers share the same tokio runtime and thread pool.",
)
# Legacy support - allow direct JSON file specification
parser.add_argument(
......@@ -201,6 +208,10 @@ def parse_args():
args = parser.parse_args()
validate_worker_type_args(args)
# Validate num_workers
if args.num_workers < 1:
raise ValueError(f"--num-workers must be at least 1, got {args.num_workers}")
# Set endpoint default based on worker type if not explicitly provided
if args.endpoint is None:
if args.is_prefill_worker:
......
......@@ -4,12 +4,16 @@
# Usage: `python -m dynamo.mocker --model-path /data/models/Qwen3-0.6B`
# Now supports vLLM-style individual arguments for MockEngineArgs
import asyncio
import logging
import os
import uvloop
os.environ.setdefault("DYN_COMPUTE_THREADS", "0")
from dynamo.llm import EngineType, EntrypointArgs, make_engine, run_input
from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime import DistributedRuntime
from dynamo.runtime.logging import configure_dynamo_logging
from .args import create_temp_engine_args_file, parse_args
......@@ -18,8 +22,12 @@ configure_dynamo_logging()
logger = logging.getLogger(__name__)
@dynamo_worker(static=False)
async def worker(runtime: DistributedRuntime):
async def worker():
"""Main worker function that launches mocker instances.
Each mocker gets its own DistributedRuntime instance for true isolation,
while still sharing the same event loop and tokio runtime.
"""
args = parse_args()
# Handle extra_engine_args: either use provided file or create from CLI args
......@@ -33,7 +41,41 @@ async def worker(runtime: DistributedRuntime):
logger.info("Created MockEngineArgs from CLI arguments")
try:
# Create engine configuration
logger.info(
f"Launching {args.num_workers} mocker worker(s) with isolated DistributedRuntime instances"
)
await launch_workers(args, extra_engine_args_path)
finally:
# Clean up temporary file if we created one
if not args.extra_engine_args and extra_engine_args_path.exists():
try:
extra_engine_args_path.unlink()
logger.debug(f"Cleaned up temporary file {extra_engine_args_path}")
except Exception as e:
logger.warning(f"Failed to clean up temporary file: {e}")
async def launch_workers(args, extra_engine_args_path):
"""Launch mocker worker(s) with isolated DistributedRuntime instances.
Each worker gets its own DistributedRuntime, which means:
- Separate etcd/NATS connections
- Separate Component instances (no shared overhead)
- Independent service registration and stats scraping
- But still sharing the same tokio runtime (efficient)
"""
loop = asyncio.get_running_loop()
futures = []
runtimes = []
for worker_id in range(args.num_workers):
logger.info(f"Creating mocker worker {worker_id + 1}/{args.num_workers}")
# Create a separate DistributedRuntime for this worker (on same event loop)
runtime = DistributedRuntime(loop, False)
runtimes.append(runtime)
# Create EntrypointArgs for this worker
entrypoint_args = EntrypointArgs(
engine_type=EngineType.Mocker,
model_path=args.model_path,
......@@ -43,18 +85,23 @@ async def worker(runtime: DistributedRuntime):
is_prefill=args.is_prefill_worker,
)
# Create and run the engine
# NOTE: only supports dyn endpoint for now
# Create the engine with this worker's isolated runtime
engine_config = await make_engine(runtime, entrypoint_args)
await run_input(runtime, args.endpoint, engine_config)
finally:
# Clean up temporary file if we created one
if not args.extra_engine_args and extra_engine_args_path.exists():
# run_input returns a Rust Future (not a Python coroutine)
future = run_input(runtime, args.endpoint, engine_config)
futures.append(future)
logger.info(f"All {args.num_workers} mocker worker(s) created and running")
try:
extra_engine_args_path.unlink()
logger.debug(f"Cleaned up temporary file {extra_engine_args_path}")
except Exception as e:
logger.warning(f"Failed to clean up temporary file: {e}")
# Wait for all futures to complete
await asyncio.gather(*futures, return_exceptions=True)
finally:
# Clean up runtimes
logger.info("Shutting down DistributedRuntime instances")
for runtime in runtimes:
runtime.shutdown()
def main():
......
......@@ -428,17 +428,26 @@ enum ModelInput {
impl DistributedRuntime {
#[new]
fn new(event_loop: PyObject, is_static: bool) -> PyResult<Self> {
let worker = rs::Worker::from_settings().map_err(to_pyerr)?;
// Try to get existing runtime first, create new Worker only if needed
// This allows multiple DistributedRuntime instances to share the same tokio runtime
let runtime = rs::Worker::runtime_from_existing()
.or_else(|_| {
// No existing Worker, create new one
let worker = rs::Worker::from_settings()?;
// Initialize pyo3 bridge (only happens once per process)
INIT.get_or_try_init(|| {
let primary = worker.tokio_runtime()?;
pyo3_async_runtimes::tokio::init_with_runtime(primary)
.map_err(|e| rs::error!("failed to initialize pyo3 static runtime: {:?}", e))?;
pyo3_async_runtimes::tokio::init_with_runtime(primary).map_err(|e| {
rs::error!("failed to initialize pyo3 static runtime: {:?}", e)
})?;
rs::OK(())
})?;
rs::OK(worker.runtime().clone())
})
.map_err(to_pyerr)?;
let runtime = worker.runtime().clone();
// Initialize logging in context where tokio runtime is available
// otel exporter requires it
if std::env::var("OTEL_EXPORT_ENABLED")
......
......@@ -860,6 +860,7 @@ impl WorkerMetricsPublisher {
{
tracing::warn!("Failed to publish metrics over NATS: {}", e);
}
}
// Reset timer to pending state to avoid tight loop
// It will be reset to 1ms when metrics actually change
......@@ -869,7 +870,6 @@ impl WorkerMetricsPublisher {
}
}
}
}
});
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment