Unverified Commit f00d700e authored by Alec's avatar Alec Committed by GitHub
Browse files

refactor: remove old examples with old UX (#1899)

parent c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import asyncio
import json
import logging
import os
import time
from datetime import datetime
from typing import Any, List
import numpy as np
from rich.console import Console
from rich.table import Table
from tensorboardX import SummaryWriter
from utils.prefill_queue import PrefillQueue
from dynamo.llm import KvMetricsAggregator
from dynamo.planner import KubernetesConnector, LocalConnector
from dynamo.planner.defaults import LoadPlannerDefaults
from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging
# Configure logging through the Dynamo helper before creating the module logger.
configure_dynamo_logging()
logger = logging.getLogger(__name__)
# Number of adjustment intervals after a decode worker is added during which the
# number of decode workers will not be decreased. This leaves time for the new
# decode worker to populate its KV cache.
NEW_DECODE_WORKER_GRACE_PERIOD = 3
# Prefill workers are not scaled up if the prefill queue size is predicted to drop
# below --prefill-queue-scale-up-threshold within the next
# NEW_PREFILL_WORKER_QUEUE_BUFFER_PERIOD adjustment intervals, extrapolating the
# trend observed in the current adjustment interval. This accounts for the time
# prefill workers need to start.
NEW_PREFILL_WORKER_QUEUE_BUFFER_PERIOD = 3
class Planner:
    """Load-based planner that scales prefill and decode workers.

    The planner periodically samples the prefill queue size and the KV cache
    load of the decode workers, writes the samples to TensorBoard and, once per
    adjustment interval, asks the configured connector to add or remove
    ``PrefillWorker`` / ``VllmWorker`` components based on the configured
    thresholds and GPU budget.
    """

    def __init__(self, runtime: DistributedRuntime, args: argparse.Namespace):
        """Set up connector, metric state and TensorBoard writer.

        Args:
            runtime: Distributed runtime used to look up components and clients.
            args: Parsed planner options. ``args.log_dir`` is filled in with a
                timestamped default when it is ``None``.

        Raises:
            ValueError: If ``args.environment`` is neither ``"local"`` nor
                ``"kubernetes"``.
        """
        self.runtime = runtime
        self.args = args
        self.namespace = args.namespace

        if args.environment == "local":
            self.connector = LocalConnector(args.namespace, runtime)
        elif args.environment == "kubernetes":
            self.connector = KubernetesConnector(args.namespace)
        else:
            raise ValueError(f"Invalid environment: {args.environment}")

        self._prefill_queue_nats_server = os.getenv(
            "NATS_SERVER", "nats://localhost:4222"
        )
        self._prefill_queue_stream_name = f"{self.namespace}_prefill_queue"

        self.prefill_client: Any | None = None
        self.workers_client: Any | None = None
        self.p_endpoints: List[int] = []
        self.d_endpoints: List[int] = []
        self.decode_worker_remaining_grace_period = 0

        # Per-interval state. Declared here so that every attribute exists even if
        # a method is called before run() has reset the first adjustment interval.
        self.metrics_aggregator: Any | None = None
        self.metrics_collection_time: List[float] = []
        self.prefill_queue_load: List[float] = []
        self.kv_load: List[float] = []
        self.last_adjustment_time = time.time()

        if args.log_dir is None:
            args.log_dir = f"logs/{datetime.now().strftime('%m%d_%H%M%S')}"
        self.writer = SummaryWriter(args.log_dir)

        logger.info(f"Components present in namespace: {args.namespace}")

        self.init_time = time.time()

        # Set the appropriate logger function for repeated metric logging
        self._repeating_log_func = logger.debug if args.no_operation else logger.info

    @staticmethod
    def _mean_or_nan(samples: List[float]) -> float:
        """Return the mean of ``samples``, or NaN when no sample was collected.

        NaN compares false against every threshold, so an interval without
        samples never triggers a scaling decision.
        """
        return float(np.mean(samples)) if samples else float("nan")

    def _avg_prefill_queue_load_per_worker(self) -> float:
        """Return the average prefill queue size per prefill worker.

        Without prefill workers (e.g. aggregated mode) a division is not
        possible: a non-empty queue is reported as infinite load and an empty or
        unsampled queue as NaN (no decision).
        """
        mean_queue_size = self._mean_or_nan(self.prefill_queue_load)
        num_prefill_workers = len(self.p_endpoints)
        if num_prefill_workers > 0:
            return mean_queue_size / num_prefill_workers
        if mean_queue_size > 0:
            return float("inf")
        return float("nan")

    async def set_metric_aggregator(self):
        """Create the KV metrics aggregator for the ``VllmWorker`` component."""
        # TODO: separate KV metrics and prefill metrics
        kv_listener = self.runtime.namespace(self.namespace).component("VllmWorker")
        await kv_listener.create_service()
        self.metrics_aggregator = KvMetricsAggregator(kv_listener)

    async def get_workers_info(self):
        """Return the current prefill and decode worker instance ids.

        Returns:
            A ``(p_endpoints, d_endpoints)`` tuple. ``p_endpoints`` is an empty
            list when the prefill workers cannot be queried.

        Raises:
            RuntimeError: If the decode worker endpoints cannot be queried.
        """
        try:
            if self.prefill_client is None:
                self.prefill_client = (
                    await self.runtime.namespace(self.namespace)
                    .component("PrefillWorker")
                    .endpoint("mock")
                    .client()
                )
                # TODO: remove this sleep after rust client() is blocking until watching state
                await asyncio.sleep(0.1)
            # TODO: use etcd events instead of pulling instance_ids
            p_endpoints = self.prefill_client.instance_ids()
        except Exception:
            p_endpoints = []
            self._repeating_log_func(
                "No prefill workers found, operating in aggregated mode"
            )

        try:
            if self.workers_client is None:
                self.workers_client = (
                    await self.runtime.namespace(self.namespace)
                    .component("VllmWorker")
                    .endpoint("generate")
                    .client()
                )
                # TODO: remove this sleep after rust client() is blocking until watching state
                await asyncio.sleep(0.1)
            # TODO: use etcd events instead of pulling instance_ids
            d_endpoints = self.workers_client.instance_ids()
        except Exception as e:
            # Chain the original exception so its traceback is not lost.
            raise RuntimeError(f"Failed to get decode worker endpoints: {e}") from e

        return p_endpoints, d_endpoints

    async def reset_adjustment_interval(self):
        """Refresh the worker lists and clear the samples of the last interval."""
        self._repeating_log_func(
            f"Reset metrics for new adjustment interval at t={time.time() - self.init_time:.1f}s"
        )

        self.p_endpoints, self.d_endpoints = await self.get_workers_info()

        self._repeating_log_func(
            f"Number of prefill workers: {len(self.p_endpoints)}, number of decode workers: {len(self.d_endpoints)}"
        )

        self.metrics_collection_time = []
        self.prefill_queue_load = []
        self.kv_load = []

        self.last_adjustment_time = time.time()

    async def collect_metrics(self):
        """Sample prefill queue size and KV load and write them to TensorBoard.

        Failures while sampling the prefill queue or while processing the KV
        metrics are logged and do not abort the collection.
        """
        self._repeating_log_func(
            f"Collecting metrics at t={time.time() - self.init_time:.1f}s"
        )

        # collect prefill queue load
        try:
            async with PrefillQueue.get_instance(
                nats_server=self._prefill_queue_nats_server,
                stream_name=self._prefill_queue_stream_name,
            ) as prefill_queue:
                prefill_queue_size = await prefill_queue.get_queue_size()
                measure_time = time.time() - self.init_time
            self.prefill_queue_load.append(prefill_queue_size)
            self._repeating_log_func(
                f"Collected prefill queue size at t={measure_time:.1f}s: {int(prefill_queue_size)}"
            )
            self.writer.add_scalar(
                "prefill_queue_size", prefill_queue_size, measure_time
            )
        except Exception as e:
            self._repeating_log_func(
                f"Failed to collect prefill queue size metrics: {e}"
            )

        # collect kv load
        total_active_requests: int = 0
        total_queued_requests: int = 0
        metrics = await self.metrics_aggregator.get_metrics()
        try:
            prev_kv_load_len = len(self.kv_load)
            for endpoint in metrics.endpoints:
                kv_load = getattr(endpoint, "gpu_cache_usage_perc", 0.0)
                num_requests_waiting = getattr(endpoint, "num_requests_waiting", 0)
                total_queued_requests += num_requests_waiting
                request_active_slots = getattr(endpoint, "request_active_slots", None)
                if request_active_slots:
                    total_active_requests += request_active_slots
                    if num_requests_waiting > 0:
                        # estimate kv load after waiting requests are scheduled based on current isl/osl
                        # TODO: use actual isl/osl estimation after the request_active_slot bug in disagg is fixed
                        # Currently, we assume each request uses 0.02 kv cache
                        # kv_load = kv_load * (request_active_slots + num_requests_waiting) / request_active_slots
                        kv_load = kv_load + 0.02 * num_requests_waiting
                self.kv_load.append(kv_load)
            measure_time = time.time() - self.init_time
            self._repeating_log_func(
                f"Collected kv load at t={measure_time:.1f}s: {self.kv_load[prev_kv_load_len:]} (act/pnd req: {total_active_requests}/{total_queued_requests})"
            )
            # NaN (instead of a numpy warning) when no endpoint reported metrics.
            average_kv_load = self._mean_or_nan(self.kv_load[prev_kv_load_len:])
            self.writer.add_scalar("average_kv_load", average_kv_load, measure_time)
            self.writer.add_scalar(
                "total_queued_requests", total_queued_requests, measure_time
            )
        except Exception as e:
            self._repeating_log_func(f"Failed to collect kv load metrics: {e}")

        p_endpoints, d_endpoints = await self.get_workers_info()
        self.writer.add_scalar(
            "num_prefill_workers", len(p_endpoints), time.time() - self.init_time
        )
        self.writer.add_scalar(
            "num_decode_workers", len(d_endpoints), time.time() - self.init_time
        )
        curr_gpu_usage = (
            len(p_endpoints) * self.args.prefill_engine_num_gpu
            + len(d_endpoints) * self.args.decode_engine_num_gpu
        )
        self.writer.add_scalar("num_gpu", curr_gpu_usage, time.time() - self.init_time)

        self.metrics_collection_time.append(time.time())

    async def make_adjustments(self):
        """Scale prefill/decode workers based on the samples of this interval.

        Scale-down is evaluated before scale-up, and prefill scale-up before
        decode scale-up. No adjustment is made when the number of workers
        changed since the interval started.
        """
        # Note: all adjustments are blocking. Non-blocking adjustment and metric pulling
        # make the optimization problem too complex and should not be needed in most cases.
        logger.info(f"Making adjustments at t={time.time() - self.init_time:.1f}s")

        # check if decode/prefill workers is still the same
        # note that we only check length as endpoint ids might change
        new_p_endpoints, new_d_endpoints = await self.get_workers_info()
        if len(new_p_endpoints) != len(self.p_endpoints) or len(new_d_endpoints) != len(
            self.d_endpoints
        ):
            logger.info("Decode/prefill workers changed, no adjustments will be made")
            return

        # compute current gpu usage
        curr_gpu_usage = (
            len(self.p_endpoints) * self.args.prefill_engine_num_gpu
            + len(self.d_endpoints) * self.args.decode_engine_num_gpu
        )
        logger.info(f"Current engines use {curr_gpu_usage} GPUs")

        # Both averages are NaN when nothing was sampled; NaN fails every
        # threshold comparison below, so no scaling happens in that case.
        avg_prefill_queue_load = self._avg_prefill_queue_load_per_worker()
        avg_kv_load = self._mean_or_nan(self.kv_load)

        # first check if we need to scale down any workers
        if (
            avg_prefill_queue_load < self.args.prefill_queue_scale_down_threshold
            and len(self.p_endpoints) > self.args.min_endpoint
        ):
            logger.info(
                f"Average prefill queue load ({avg_prefill_queue_load:.2f}) is below threshold ({self.args.prefill_queue_scale_down_threshold:.2f}), scaling down prefill workers"
            )
            success = await self.connector.remove_component("PrefillWorker")
            if success:
                curr_gpu_usage -= self.args.prefill_engine_num_gpu
            else:
                logger.info("Failed to scale down prefill worker")

        if (
            avg_kv_load < self.args.decode_kv_scale_down_threshold
            and len(self.d_endpoints) > self.args.min_endpoint
        ):
            if self.decode_worker_remaining_grace_period > 0:
                logger.info(
                    f"Decode worker remaining grace period is {self.decode_worker_remaining_grace_period}, skipping scale down"
                )
            else:
                logger.info(
                    f"Average kv load ({avg_kv_load:.2f}) is below threshold ({self.args.decode_kv_scale_down_threshold:.2f}), scaling down decode workers"
                )
                success = await self.connector.remove_component("VllmWorker")
                if success:
                    curr_gpu_usage -= self.args.decode_engine_num_gpu
                else:
                    logger.info("Failed to scale down decode worker")

        # check if we need to scale up workers
        # we first check for prefill worker because prefill queueing can also lead
        # to high kv load on decode workers
        if (
            avg_prefill_queue_load > self.args.prefill_queue_scale_up_threshold
            and curr_gpu_usage + self.args.prefill_engine_num_gpu
            <= self.args.max_gpu_budget
        ):
            logger.info(
                f"Average prefill queue load ({avg_prefill_queue_load:.2f}) is above threshold ({self.args.prefill_queue_scale_up_threshold:.2f})"
            )
            # check prefill queue size trend; the list is non-empty here because
            # an unsampled queue yields NaN, which cannot exceed the threshold
            prefill_queue_size_change = (
                self.prefill_queue_load[-1] - self.prefill_queue_load[0]
            )
            predicted_prefill_future_queue_size = (
                self.prefill_queue_load[-1]
                + prefill_queue_size_change * NEW_PREFILL_WORKER_QUEUE_BUFFER_PERIOD
            )
            if (
                predicted_prefill_future_queue_size
                > self.args.prefill_queue_scale_up_threshold
            ):
                logger.info(
                    f"Predicted future prefill queue size ({predicted_prefill_future_queue_size:.2f}) is also above threshold ({self.args.prefill_queue_scale_up_threshold:.2f}), scaling up prefill workers"
                )
                success = await self.connector.add_component("PrefillWorker")
                if success:
                    curr_gpu_usage += self.args.prefill_engine_num_gpu
                else:
                    logger.info("Failed to scale up prefill worker")
            else:
                logger.info(
                    f"Predicted future prefill queue size ({predicted_prefill_future_queue_size:.2f}) is below threshold ({self.args.prefill_queue_scale_up_threshold:.2f}), skipping prefill worker scaling"
                )

        if (
            avg_kv_load > self.args.decode_kv_scale_up_threshold
            and curr_gpu_usage + self.args.decode_engine_num_gpu
            <= self.args.max_gpu_budget
        ):
            logger.info(
                f"Average kv load ({avg_kv_load:.2f}) is above threshold ({self.args.decode_kv_scale_up_threshold:.2f}), scaling up decode workers"
            )
            success = await self.connector.add_component("VllmWorker")
            if success:
                curr_gpu_usage += self.args.decode_engine_num_gpu
                self.decode_worker_remaining_grace_period = (
                    NEW_DECODE_WORKER_GRACE_PERIOD
                )
            else:
                logger.info("Failed to scale up decode worker")

        # no adjustment needed, just log the current metrics
        if (
            avg_prefill_queue_load > self.args.prefill_queue_scale_down_threshold
            and avg_prefill_queue_load < self.args.prefill_queue_scale_up_threshold
        ):
            logger.info(
                f"Average prefill queue load ({avg_prefill_queue_load:.2f}) is within threshold, no prefill worker scaling needed"
            )

        if (
            avg_kv_load > self.args.decode_kv_scale_down_threshold
            and avg_kv_load < self.args.decode_kv_scale_up_threshold
        ):
            logger.info(
                f"Average kv load ({avg_kv_load:.2f}) is within threshold, no decode worker scaling needed"
            )

        logger.info(f"Engines after adjustment use {curr_gpu_usage} GPUs")

        if self.decode_worker_remaining_grace_period > 0:
            self.decode_worker_remaining_grace_period -= 1

    async def run(self):
        """Main loop for the planner"""
        try:
            await self.set_metric_aggregator()

            if self._repeating_log_func == logger.debug:
                logger.info(
                    "Running in no-operation mode - detailed metrics will be logged at DEBUG level"
                )

            await self.reset_adjustment_interval()

            while True:
                current_time = time.time()

                # Collect metrics at each metric pulling interval
                if (
                    len(self.metrics_collection_time) == 0
                    or current_time - self.metrics_collection_time[-1]
                    >= self.args.metric_pulling_interval
                ):
                    await self.collect_metrics()

                # Check if it's time for adjustment
                if (
                    current_time - self.last_adjustment_time
                    >= self.args.adjustment_interval
                ):
                    if not self.args.no_operation:
                        # blockingly make adjustments to avoid overcompensation
                        await self.make_adjustments()
                    await self.reset_adjustment_interval()

                # Sleep to avoid busy waiting
                await asyncio.sleep(self.args.metric_pulling_interval / 10)
        finally:
            # The loop only ends through cancellation or an error; close the
            # writer so buffered TensorBoard events are flushed to disk.
            self.writer.close()
# @dynamo_worker()
# TODO: let's make it such that planner still works via CLI invocation
async def start_planner(runtime: DistributedRuntime, args: argparse.Namespace):
    """Print the components found under ``args.namespace`` and run the planner.

    Args:
        runtime: Distributed runtime handed to the planner and used to query etcd.
        args: Parsed planner options.

    The coroutine awaits ``Planner.run()``, which loops until it is cancelled
    or fails.
    """
    planner = Planner(runtime, args)

    console = Console()
    table = Table()
    table.add_column("Component", style="cyan")
    table.add_column("Endpoint", style="green")

    components = await runtime.etcd_client().kv_get_prefix(args.namespace)
    for component in components:
        try:
            data = json.loads(component["value"].decode("utf-8"))
            if "component" in data:
                name = data["component"]
                endpoint = data["endpoint"]
                table.add_row(name, endpoint)
        except Exception as e:
            # Best effort: some entries may not be valid JSON or might be binary
            # data. Skip them, but leave a trace for debugging.
            logger.debug(f"Skipping etcd entry that could not be displayed: {e!r}")

    console.print(table)

    await planner.run()
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # One (flag, add_argument keyword arguments) entry per option, in help order.
    # The first ten options are common planner arguments; the last four are
    # load-planner specific.
    option_specs = [
        (
            "--namespace",
            {
                "type": str,
                "default": LoadPlannerDefaults.namespace,
                "help": "Namespace planner will look at",
            },
        ),
        (
            "--environment",
            {
                "type": str,
                "default": LoadPlannerDefaults.environment,
                "help": "Environment to run the planner in (local, kubernetes)",
            },
        ),
        (
            "--no-operation",
            {
                "action": "store_true",
                "default": LoadPlannerDefaults.no_operation,
                "help": "Do not make any adjustments, just observe the metrics",
            },
        ),
        (
            "--log-dir",
            {
                "type": str,
                "default": LoadPlannerDefaults.log_dir,
                "help": "Tensorboard logging directory",
            },
        ),
        (
            "--adjustment-interval",
            {
                "type": int,
                "default": LoadPlannerDefaults.adjustment_interval,
                "help": "Interval in seconds between scaling adjustments",
            },
        ),
        (
            "--max-gpu-budget",
            {
                "type": int,
                "default": LoadPlannerDefaults.max_gpu_budget,
                "help": "Maximum number of GPUs to use",
            },
        ),
        (
            "--min-endpoint",
            {
                "type": int,
                "default": LoadPlannerDefaults.min_endpoint,
                "help": "Minimum number of endpoints to keep for prefill/decode workers",
            },
        ),
        (
            "--metric-pulling-interval",
            {
                "type": int,
                "default": LoadPlannerDefaults.metric_pulling_interval,
                "help": "Interval in seconds between metric pulls",
            },
        ),
        (
            "--decode-engine-num-gpu",
            {
                "type": int,
                "default": LoadPlannerDefaults.decode_engine_num_gpu,
                "help": "Number of GPUs per decode engine",
            },
        ),
        (
            "--prefill-engine-num-gpu",
            {
                "type": int,
                "default": LoadPlannerDefaults.prefill_engine_num_gpu,
                "help": "Number of GPUs per prefill engine",
            },
        ),
        (
            "--decode-kv-scale-up-threshold",
            {
                "type": float,
                "default": LoadPlannerDefaults.decode_kv_scale_up_threshold,
                "help": "KV cache utilization threshold to scale up decode workers",
            },
        ),
        (
            "--decode-kv-scale-down-threshold",
            {
                "type": float,
                "default": LoadPlannerDefaults.decode_kv_scale_down_threshold,
                "help": "KV cache utilization threshold to scale down decode workers",
            },
        ),
        (
            "--prefill-queue-scale-up-threshold",
            {
                "type": float,
                "default": LoadPlannerDefaults.prefill_queue_scale_up_threshold,
                "help": "Queue utilization threshold to scale up prefill workers, this threshold is per prefill worker",
            },
        ),
        (
            "--prefill-queue-scale-down-threshold",
            {
                "type": float,
                "default": LoadPlannerDefaults.prefill_queue_scale_down_threshold,
                "help": "Queue utilization threshold to scale down prefill workers, this threshold is per prefill worker",
            },
        ),
    ]
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)

    args = parser.parse_args()
    # Wrap start_planner with the dynamo worker decorator and run it to completion.
    asyncio.run(dynamo_worker()(start_planner)(args))
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
from pydantic import BaseModel
from components.planner import start_planner # type: ignore[attr-defined]
from dynamo.planner.defaults import LoadPlannerDefaults
from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.sdk import async_on_start, dynamo_context, endpoint, service
from dynamo.sdk.core.protocol.interface import ComponentType
from dynamo.sdk.lib.config import ServiceConfig
from dynamo.sdk.lib.image import DYNAMO_IMAGE
logger = logging.getLogger(__name__)
# Pydantic request model with a single string field.
class RequestType(BaseModel):
    # Text payload of the request.
    text: str
@service(
    dynamo={
        "namespace": "dynamo",
        "component_type": ComponentType.PLANNER,
    },
    resources={"cpu": "10", "memory": "20Gi"},
    workers=1,
    image=DYNAMO_IMAGE,
)
class Planner:
    """Service wrapper that builds planner options from config and starts the planner."""

    def __init__(self):
        """Resolve the planner options from the service config and defaults."""
        configure_dynamo_logging(service_name="Planner")
        logger.info("Starting planner")

        self.runtime = dynamo_context["runtime"]
        config = ServiceConfig.get_instance()
        # Get namespace directly from dynamo_context as it contains the active namespace
        self.namespace = dynamo_context["namespace"]
        planner_config = config.get("Planner", {})

        # Each option is looked up in the config under its dashed name
        # ("log_dir" -> "log-dir") and falls back to LoadPlannerDefaults.
        option_names = (
            "environment",
            "no_operation",
            "log_dir",
            "adjustment_interval",
            "metric_pulling_interval",
            "max_gpu_budget",
            "min_endpoint",
            "decode_kv_scale_up_threshold",
            "decode_kv_scale_down_threshold",
            "prefill_queue_scale_up_threshold",
            "prefill_queue_scale_down_threshold",
            "decode_engine_num_gpu",
            "prefill_engine_num_gpu",
        )
        resolved_options = {
            option: planner_config.get(
                option.replace("_", "-"), getattr(LoadPlannerDefaults, option)
            )
            for option in option_names
        }
        self.args = argparse.Namespace(namespace=self.namespace, **resolved_options)

    @async_on_start
    async def async_init(self):
        """Wait 30 seconds, then hand control to ``start_planner``."""
        import asyncio

        startup_delay_seconds = 30
        await asyncio.sleep(startup_delay_seconds)

        logger.info("Calling start_planner")
        await start_planner(self.runtime, self.args)
        logger.info("Planner started")

    @endpoint()
    async def generate(self, request: RequestType):
        """Dummy endpoint to satisfy that each component has an endpoint"""
        yield "mock endpoint"
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import logging
import os
import signal
import sys
from pydantic import BaseModel
from utils.nixl import NixlMetadataStore
from utils.prefill_queue import PrefillQueue
from utils.vllm import parse_vllm_args
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args,
)
from vllm.inputs.data import TokensPrompt
from vllm.remote_prefill import RemotePrefillParams, RemotePrefillRequest
from dynamo.sdk import async_on_start, dynamo_context, endpoint, service
logger = logging.getLogger(__name__)
# Pydantic request model with a single string field.
class RequestType(BaseModel):
    # Text payload of the request.
    text: str
@service(
    dynamo={
        "namespace": "dynamo",
    },
    resources={"gpu": 1, "cpu": "10", "memory": "20Gi"},
    workers=1,
)
class PrefillWorker:
    """Worker that pulls remote prefill requests from the prefill queue and runs them."""

    def __init__(self):
        """Parse the vLLM engine arguments and force the settings prefill requires."""
        class_name = self.__class__.__name__
        self.engine_args = parse_vllm_args(class_name, "")
        self._loaded_metadata = set()
        self.initialized = False
        # Shutdown state lives here so that it exists before the queue handler
        # task (which reads shutdown_requested) is created.
        self.shutdown_requested = False
        self._shutdown_task = None

        if self.engine_args.enable_chunked_prefill is not False:
            logger.info("Chunked prefill is not supported yet, setting to False")
            self.engine_args.enable_chunked_prefill = False

        if self.engine_args.pipeline_parallel_size != 1:
            logger.info("Pipeline parallel size is not supported yet, setting to 1")
            self.engine_args.pipeline_parallel_size = 1

        if self.engine_args.disable_async_output_proc is not True:
            logger.info("Async output processing is not supported yet, setting to True")
            self.engine_args.disable_async_output_proc = True

        if self.engine_args.enforce_eager is not True:
            logger.info("Prefill must be done eagerly, setting to True")
            self.engine_args.enforce_eager = True

        if self.engine_args.enable_prefix_caching is not False:
            logger.info(
                "Prefix caching is not supported yet in prefill worker, setting to False"
            )
            self.engine_args.enable_prefix_caching = False

    @async_on_start
    async def async_init(self):
        """Start the engine client, publish its NIXL metadata and start the queue handler.

        Raises:
            RuntimeError: If the engine client context could not be created.
        """
        self._engine_context = build_async_engine_client_from_engine_args(
            self.engine_args
        )
        if self._engine_context is not None:
            self.engine_client = await self._engine_context.__aenter__()
        else:
            raise RuntimeError("Failed to initialize engine client")

        runtime = dynamo_context["runtime"]
        metadata = self.engine_client.nixl_metadata
        self._metadata_store = NixlMetadataStore("dynamo", runtime)
        await self._metadata_store.put(metadata.engine_id, metadata)

        self.task = asyncio.create_task(self.prefill_queue_handler())

        def prefill_queue_handler_cb(fut):
            try:
                fut.result()
                logger.info("prefill queue handler exited successfully")
            except asyncio.CancelledError:
                # CancelledError is a BaseException and would otherwise escape
                # from this callback.
                logger.info("prefill queue handler was cancelled")
            except Exception as e:
                logger.error(f"[ERROR] prefill queue handler failed: {e!r}")
                sys.exit(1)

        self.task.add_done_callback(prefill_queue_handler_cb)

        # Set up signal handler for graceful shutdown
        # TODO: move to dynamo sdk
        loop = asyncio.get_running_loop()

        def signal_handler():
            # Schedule the shutdown coroutine instead of calling it directly.
            # Keep a reference to the task: the event loop only holds weak
            # references, so an unreferenced task may be garbage-collected
            # before it finishes. A repeated signal does not start a second
            # shutdown.
            if self._shutdown_task is None:
                self._shutdown_task = asyncio.create_task(
                    self.graceful_shutdown(runtime)
                )

        for sig in (signal.SIGTERM, signal.SIGINT):
            loop.add_signal_handler(sig, signal_handler)

        logger.info("PrefillWorker initialized")

    async def graceful_shutdown(self, runtime):
        """Request shutdown, wait for the queue handler to finish, then stop ``runtime``."""
        logger.info("Received shutdown signal, shutting down DistributedRuntime")

        # first shutdown the vllm engine
        self.shutdown_requested = True
        await self.task

        # then shutdown the mock endpoint
        runtime.shutdown()
        logger.info("DistributedRuntime shutdown complete")

    def shutdown_vllm_engine(self):
        """Shutdown the background loop"""
        logger.info("Shutting down vllm engine")
        loop = asyncio.get_event_loop()
        try:
            self.engine_client.close()
            logger.info("PrefillWorker shutdown complete")
        except Exception as e:
            logger.error(f"Error during shutdown: {e}")
        finally:
            loop.stop()

    async def prefill_queue_handler(self):
        """Dequeue prefill requests and run them until shutdown is requested.

        After shutdown is requested the handler waits until the engine reports
        no unfinished requests, shuts the engine down and returns.
        """
        logger.info("Prefill queue handler entered")
        prefill_queue_nats_server = os.getenv("NATS_SERVER", "nats://localhost:4222")
        namespace, _ = PrefillWorker.dynamo_address()  # type: ignore
        prefill_queue_stream_name = f"{namespace}_prefill_queue"
        logger.info(
            f"Prefill queue: {prefill_queue_nats_server}:{prefill_queue_stream_name}"
        )
        self.initialized = True
        # TODO: integrate prefill_queue to a dynamo endpoint
        async with PrefillQueue.get_instance(
            nats_server=prefill_queue_nats_server,
            stream_name=prefill_queue_stream_name,
        ) as prefill_queue:
            logger.info("prefill queue handler started")
            while True:
                # TODO: this might add a small overhead to pull prefill from nats
                # need to test and check how much overhead it is
                prefill_request = await prefill_queue.dequeue_prefill_request()
                if prefill_request is not None:
                    logger.info(
                        f"Dequeued prefill request: {prefill_request.request_id}"
                    )
                    async for _ in self.generate(prefill_request):
                        pass
                if self.shutdown_requested:
                    logger.info(
                        "Shutdown requested, checking if engine has any pending prefill sending requests"
                    )
                    while await self.engine_client.has_unfinished_requests():
                        logger.info(
                            "Engine has pending prefill sending requests, rechecking in 1 second..."
                        )
                        await asyncio.sleep(1)

                    self.shutdown_vllm_engine()
                    break

    async def generate(self, request: RemotePrefillRequest):
        """Run one remote prefill request on the local engine.

        The request's sampling parameters are modified in place so that exactly
        one token is generated. The decode engine's NIXL metadata is fetched
        and registered the first time that engine id is seen.
        """
        sampling_params = request.sampling_params
        sampling_params.max_tokens = 1
        sampling_params.min_tokens = 1

        remote_prefill_params = RemotePrefillParams(
            is_remote_decode=True,
            decode_block_ids=request.block_ids,
            decode_engine_id=request.engine_id,
            decode_computed_block_ids=request.computed_block_ids,
        )

        # TODO check if metadata has changed
        # and reload - currently only loading once
        if request.engine_id not in self._loaded_metadata:
            remote_metadata = await self._metadata_store.get(request.engine_id)
            await self.engine_client.add_remote_nixl_metadata(remote_metadata)
            logger.info(
                f"Loaded nixl metadata from engine {request.engine_id} into "
                f"engine {self.engine_client.nixl_metadata.engine_id}"
            )
            self._loaded_metadata.add(request.engine_id)

        async for _ in self.engine_client.generate(
            request_id=request.request_id,
            prompt=TokensPrompt(prompt_token_ids=request.prompt_token_ids),
            sampling_params=sampling_params,
            remote_prefill_params=remote_prefill_params,
        ):
            yield

    @endpoint()
    async def mock(self, req: RequestType):
        """Placeholder endpoint that echoes the request in its response."""
        yield f"mock_response: {req}"
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import logging
import uuid
from enum import Enum
from typing import Any, AsyncIterator, Dict, List, Tuple, Union
from components.kv_router import Router
from components.worker import VllmWorker
from transformers import AutoTokenizer
from utils.chat_processor import ChatProcessor, CompletionsProcessor, ProcessMixIn
from utils.check_worker import check_required_workers
from utils.protocol import LocalBlockHashes, MyRequestOutput, vLLMGenerateRequest
from utils.vllm import RouterType, parse_vllm_args
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.protocol import ChatCompletionRequest, CompletionRequest
from vllm.outputs import RequestOutput
from vllm.transformers_utils.tokenizer import AnyTokenizer
from dynamo.llm import KvMetricsAggregator, compute_block_hash_for_seq_py
from dynamo.runtime import EtcdKvCache
from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, service
logger = logging.getLogger(__name__)
# Kinds of request the processor distinguishes.
class RequestType(Enum):
    # Chat request.
    CHAT = "chat"
    # Completion request.
    COMPLETION = "completion"
@service(
dynamo={
"namespace": "dynamo",
},
resources={"cpu": "10", "memory": "20Gi"},
workers=1,
)
class Processor(ProcessMixIn):
"""
vLLM pre and post processing
"""
worker = depends(VllmWorker)
router = depends(Router)
def __init__(self):
    """Build engine config, tokenizer, processors and the request-queue state."""
    engine_args = parse_vllm_args(type(self).__name__, "")
    self.engine_args = engine_args

    model_config = engine_args.create_model_config()
    self.model_config = model_config
    self.default_sampling_params = model_config.get_diff_sampling_param()

    self.tokenizer = self._create_tokenizer(engine_args)
    self.chat_processor = ChatProcessor(self.tokenizer, model_config)
    self.completions_processor = CompletionsProcessor(self.tokenizer, model_config)

    self.min_workers = 1

    # Requests wait in the queue; each request's result is delivered through
    # the future stored under its request id.
    self.request_queue: asyncio.Queue[Dict[str, Any]] = asyncio.Queue()
    self.request_futures: Dict[str, asyncio.Future] = {}

    # Number of worker tasks to process the queue
    self.num_worker_tasks = engine_args.router_num_threads
    self.worker_tasks: List[asyncio.Task] = []

    print(f"Processor init: {self.engine_args.router}")
def _create_tokenizer(self, engine_args: AsyncEngineArgs) -> AnyTokenizer:
    """Load the tokenizer for ``engine_args.model`` with left padding and truncation."""
    # Tokenizer settings: trust remote code, pad/truncate on the left, prefer
    # the fast tokenizer implementation.
    tokenizer_options = {
        "trust_remote_code": True,
        "padding_side": "left",
        "truncation_side": "left",
        "use_fast": True,
    }
    return AutoTokenizer.from_pretrained(engine_args.model, **tokenizer_options)
@async_on_start
async def async_init(self):
    """Create the worker/router clients, metrics aggregator and etcd cache, then start the queue workers."""
    runtime = dynamo_context["runtime"]

    comp_ns, comp_name = VllmWorker.dynamo_address()  # type: ignore
    worker_endpoint = (
        runtime.namespace(comp_ns).component(comp_name).endpoint("generate")
    )
    self.worker_client = await worker_endpoint.client()

    # A router client is only needed for the KV-based router types.
    kv_router_types = (RouterType.KV, RouterType.KV_LOAD, RouterType.APPROX_KV)
    self.use_router = self.engine_args.router in kv_router_types
    if self.use_router:
        router_ns, router_name = Router.dynamo_address()  # type: ignore
        router_endpoint = (
            runtime.namespace(router_ns).component(router_name).endpoint("generate")
        )
        self.router_client = await router_endpoint.client()

    await check_required_workers(self.worker_client, self.min_workers)

    # NOTE(review): namespace and component name are hard-coded here while the
    # worker client above uses VllmWorker.dynamo_address() - confirm they match.
    kv_listener = runtime.namespace("dynamo").component("VllmWorker")
    await kv_listener.create_service()
    self.metrics_aggregator = KvMetricsAggregator(kv_listener)

    cache_prefix = f"/{comp_ns}/processor/"
    initial_values = {"router": self.engine_args.router}
    self.etcd_kv_cache = await EtcdKvCache.create(
        runtime.etcd_client(), cache_prefix, initial_values
    )

    # Start multiple worker tasks to process the queue
    self._start_worker_tasks()
def _start_worker_tasks(self):
    """Cancel any unfinished queue workers and start ``num_worker_tasks`` new ones."""
    # Stop workers left over from a previous start.
    for stale_task in self.worker_tasks:
        if stale_task.done():
            continue
        stale_task.cancel()

    # Spawn the new workers, each with its own numeric id.
    self.worker_tasks = []
    for worker_id in range(self.num_worker_tasks):
        self.worker_tasks.append(
            asyncio.create_task(self._process_queue(worker_id=worker_id))
        )

    logger.info(f"Started {self.num_worker_tasks} queue worker tasks")
async def _process_queue(self, worker_id: int):
    """Background task: pull requests off ``self.request_queue`` and process them.

    Args:
        worker_id: Number used to tag this worker's log messages.

    The loop runs until the task is cancelled. A failure while processing a
    single request is logged and the loop moves on to the next item.
    """
    logger.info(f"Queue worker {worker_id} started")
    while True:
        try:
            # Get the next request from the queue
            request_data = await self.request_queue.get()
            # Process the request
            try:
                await self._process_request(request_data)
            except Exception as e:
                logger.error(f"Worker {worker_id}: Error processing request: {e}")
            finally:
                # Mark the task as done
                # (runs for every dequeued item, whether processing succeeded or not)
                self.request_queue.task_done()
        except asyncio.CancelledError:
            logger.info(f"Queue worker {worker_id} was cancelled")
            break
        except Exception as e:
            logger.error(
                f"Worker {worker_id}: Unexpected error in queue processing: {e}"
            )
            # Sleep briefly to avoid tight error loops
            await asyncio.sleep(0.1)
async def _get_kv_load(self):
    """Return a ``{worker_id: gpu_cache_usage_perc}`` mapping from the latest metrics.

    Endpoints that do not expose ``gpu_cache_usage_perc`` are reported as
    ``0.0``.
    """
    snapshot = await self.metrics_aggregator.get_metrics()
    return {
        ep.worker_id: getattr(ep, "gpu_cache_usage_perc", 0.0)
        for ep in snapshot.endpoints
    }
async def _get_pending_requests(self):
    """Return a ``{worker_id: num_requests_waiting}`` mapping from the latest metrics.

    Endpoints that do not expose ``num_requests_waiting`` are reported as
    ``0``.
    """
    metrics = await self.metrics_aggregator.get_metrics()
    # Read the attribute from the loop variable. The previous code looked it
    # up on the unrelated name ``endpoint``, so real counts were never
    # returned.
    return {
        end_point.worker_id: getattr(end_point, "num_requests_waiting", 0)
        for end_point in metrics.endpoints
    }
async def _generate(
    self,
    raw_request: Union[CompletionRequest, ChatCompletionRequest],
    request_type: RequestType,
):
    """Queue a raw request and stream back whatever the queue worker produces.

    Args:
        raw_request: The unparsed completion or chat-completion request.
        request_type: Tag forwarded to the queue worker alongside the request.

    Yields:
        Each response from the async generator that the queue worker sets as
        the result of this request's future.
    """
    request_id = str(uuid.uuid4())
    logger.debug(f"Got raw request: {raw_request}")

    # Register the future before enqueueing so the worker can always find it.
    pending: asyncio.Future[AsyncIterator[Any]] = asyncio.Future()
    self.request_futures[request_id] = pending

    # Parsing is deferred to the worker; only the raw payload is queued here.
    queue_item = {
        "request_id": request_id,
        "raw_request": raw_request,
        "request_type": request_type,
    }
    await self.request_queue.put(queue_item)

    try:
        stream = await pending
        async for chunk in stream:
            yield chunk
    finally:
        # Drop the bookkeeping entry whether we finished, failed or were cancelled.
        self.request_futures.pop(request_id, None)
async def _process_request(self, request_data: Dict[str, Any]):
    """Process a single request from the queue.

    Args:
        request_data: Queue item with the keys ``request_id``,
            ``raw_request`` and ``request_type``.

    The method does not stream anything itself. It parses the raw request and
    resolves the future stored under ``request_id`` in
    ``self.request_futures`` with an async generator; whoever awaits that
    future iterates the generator. If parsing fails, the error is logged and
    set as the future's exception.

    Raises (from the generator, when iterated):
        ValueError: If the router mode read from etcd is not one this
            processor knows how to dispatch.
    """
    request_id = request_data["request_id"]
    raw_request = request_data["raw_request"]
    request_type = request_data["request_type"]
    try:
        # Parse the raw request here instead of in _generate
        (
            request,
            conversation,
            prompt,
            engine_prompt,
            sampling_params,
        ) = await self._parse_raw_request(raw_request)

        # Create an async generator function to process this request
        async def process_and_stream():
            # TODO: queue request at processor when engines are full
            router_mode = (await self.etcd_kv_cache.get("router")).decode()
            # Decide once and keep the answer in a local. The attribute is
            # shared by every in-flight request and can be rewritten while
            # this coroutine is suspended on an await, which previously could
            # leave worker_id unbound or pick the wrong dispatch branch.
            use_router = router_mode in (
                RouterType.KV,
                RouterType.KV_LOAD,
                RouterType.APPROX_KV,
            )
            # Still mirrored on the instance for any code that reads it.
            self.use_router = use_router
            prefix_hit_rate = 0.0  # Default value
            if use_router:
                token_ids = engine_prompt["prompt_token_ids"]
                router_generator = await self.router_client.generate(
                    LocalBlockHashes(
                        hashes=compute_block_hash_for_seq_py(
                            token_ids, self.engine_args.block_size
                        ),
                        tokens=token_ids,
                        num_tokens=len(token_ids),
                    ).model_dump_json()
                )
                decision = await router_generator.__anext__()
                worker_id, prefix_hit_rate = decision.data()
                prefix_hit_rate = float(prefix_hit_rate)

            # Create request object once with default prefix_hit_rate
            request_obj = vLLMGenerateRequest(
                engine_prompt=engine_prompt,
                sampling_params=sampling_params,
                request_id=request_id,
                prefix_hit_rate=prefix_hit_rate,
            ).model_dump_json()

            if use_router:
                # An empty worker id from the router means "no preference".
                if worker_id == "":
                    engine_generator = await self.worker_client.generate(
                        request_obj
                    )
                else:
                    engine_generator = await self.worker_client.direct(
                        request_obj, int(worker_id)
                    )
            elif router_mode == RouterType.RANDOM:
                engine_generator = await self.worker_client.generate(request_obj)
            elif router_mode == RouterType.ROUND_ROBIN:
                engine_generator = await self.worker_client.round_robin(request_obj)
            else:
                # Fail with a clear message instead of an UnboundLocalError on
                # engine_generator further down.
                raise ValueError(f"Unsupported router mode: {router_mode}")

            output_generator = self._generate_responses(
                engine_generator, request_type
            )

            # Stream responses directly to the caller
            async for response in await self._stream_response(
                request, output_generator, request_id, conversation
            ):
                yield response

        # Set the future result to our async generator. Skip futures that are
        # already done (e.g. cancelled by a caller that went away): calling
        # set_result on them raises InvalidStateError.
        if (
            request_id in self.request_futures
            and not self.request_futures[request_id].done()
        ):
            self.request_futures[request_id].set_result(process_and_stream())
    except Exception as e:
        logger.error(f"Error processing request {request_id}: {e}")
        # Set exception on the future if it still exists
        if (
            request_id in self.request_futures
            and not self.request_futures[request_id].done()
        ):
            self.request_futures[request_id].set_exception(e)
async def _generate_responses(
    self, engine_generator: AsyncIterator[RequestOutput], request_type: RequestType
) -> AsyncIterator[Union[RequestOutput, Tuple[int, RequestOutput]]]:
    """Turn serialized engine responses back into ``RequestOutput`` objects.

    Args:
        engine_generator: Async stream whose items expose ``data()`` returning
            the JSON form of a ``MyRequestOutput``.
        request_type: Selects the shape of what is yielded.

    Yields:
        The ``RequestOutput`` itself for chat requests, or a
        ``(prompt_idx, RequestOutput)`` tuple for completion requests.

    Raises:
        NotImplementedError: For any other request type, once the first
            response has been received.
    """
    prompt_idx = 0
    async for raw in engine_generator:
        # Deserialize the engine's JSON payload into typed fields.
        parsed = MyRequestOutput.model_validate_json(raw.data())
        # OpenAIServingChat.chat_completion_stream_generator() expects a
        # genuine RequestOutput, so rebuild one field by field.
        fields = {
            "request_id": parsed.request_id,
            "prompt": parsed.prompt,
            "prompt_token_ids": parsed.prompt_token_ids,
            "prompt_logprobs": parsed.prompt_logprobs,
            "outputs": parsed.outputs,
            "finished": parsed.finished,
            "metrics": parsed.metrics,
        }
        converted = RequestOutput(**fields)

        if request_type == RequestType.CHAT:
            yield converted
            continue
        if request_type == RequestType.COMPLETION:
            # The completion stream generator needs the prompt index as well.
            yield (prompt_idx, converted)
            continue
        raise NotImplementedError(
            f"Request type {request_type} not implemented"
        )
@endpoint(name="chat/completions")
async def chat_completions(self, raw_request: ChatCompletionRequest):
    """Endpoint ``chat/completions``: stream the responses for a chat request.

    Delegates to ``_generate`` with ``RequestType.CHAT`` and re-yields every
    response it produces.
    """
    async for response in self._generate(raw_request, RequestType.CHAT):
        yield response
# @endpoint()
# async def completions(self, raw_request: CompletionRequest):
# async for response in self._generate(raw_request, RequestType.COMPLETION):
# yield response
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import logging
import os
import signal
from components.disagg_router import PyDisaggregatedRouter
from components.prefill_worker import PrefillWorker
from utils.nixl import NixlMetadataStore
from utils.prefill_queue import PrefillQueue
from utils.protocol import MyRequestOutput, vLLMGenerateRequest
from utils.vllm import RouterType, parse_vllm_args
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args,
)
from vllm.remote_prefill import RemotePrefillParams, RemotePrefillRequest
from vllm.sampling_params import RequestOutputKind
from dynamo.llm import ForwardPassMetrics, KvStats, WorkerMetricsPublisher, WorkerStats
from dynamo.sdk import async_on_start, depends, dynamo_context, endpoint, service
logger = logging.getLogger(__name__)
@service(
    dynamo={
        "namespace": "dynamo",
    },
    resources={"gpu": 1, "cpu": "10", "memory": "20Gi"},
    workers=1,
)
class VllmWorker:
    """Dynamo service that exposes a vLLM async engine client through ``generate``.

    Depending on the parsed engine arguments, the prefill for a request is
    either run by the local engine or requested remotely through the prefill
    queue.
    """

    prefill_worker = depends(PrefillWorker)

    def __init__(self):
        """Parse engine arguments, override unsupported options and install signal handlers."""
        self.client = None
        self.disaggregated_router: PyDisaggregatedRouter = None  # type: ignore
        class_name = self.__class__.__name__
        self.engine_args = parse_vllm_args(class_name, "")
        self.do_remote_prefill = self.engine_args.remote_prefill
        # NATS address of the prefill queue; taken from NATS_SERVER when set.
        self._prefill_queue_nats_server = os.getenv(
            "NATS_SERVER", "nats://localhost:4222"
        )
        self.namespace, _ = VllmWorker.dynamo_address()  # type: ignore
        self._prefill_queue_stream_name = f"{self.namespace}_prefill_queue"
        logger.info(
            f"Prefill queue: {self._prefill_queue_nats_server}:{self._prefill_queue_stream_name}"
        )
        # With remote prefill, force the engine options this worker supports.
        if self.engine_args.remote_prefill:
            if self.engine_args.enable_chunked_prefill is not False:
                logger.info("Chunked prefill is not supported yet, setting to False")
                self.engine_args.enable_chunked_prefill = False
            if self.engine_args.preemption_mode != "swap":
                logger.info("Preemption mode is not supported yet, setting to swap")
                self.engine_args.preemption_mode = "swap"
            if self.engine_args.pipeline_parallel_size != 1:
                logger.info("Pipeline parallel size is not supported yet, setting to 1")
                self.engine_args.pipeline_parallel_size = 1
        if self.engine_args.router in (RouterType.KV, RouterType.APPROX_KV):
            if not self.engine_args.enable_prefix_caching:
                logger.info(
                    "When using KV router, prefix caching must be enabled, setting to True"
                )
                self.engine_args.enable_prefix_caching = True
            # Exported through the environment; presumably read by the vLLM
            # side to identify this worker — TODO confirm the consumer.
            VLLM_WORKER_ID = dynamo_context["endpoints"][0].lease_id()
            os.environ["VLLM_WORKER_ID"] = str(VLLM_WORKER_ID)
            os.environ["VLLM_KV_NAMESPACE"] = "dynamo"
            os.environ["VLLM_KV_COMPONENT"] = class_name
        self.metrics_publisher = WorkerMetricsPublisher()
        # NOTE(review): async_init later registers loop-level handlers for the
        # same two signals — confirm which handler is meant to take effect.
        signal.signal(signal.SIGTERM, self.shutdown_vllm_engine)
        signal.signal(signal.SIGINT, self.shutdown_vllm_engine)

    @async_on_start
    async def async_init(self):
        """Start the engine client, publish initial metrics and set up disaggregation.

        Raises:
            RuntimeError: If no engine context could be built.
        """
        self._engine_context = build_async_engine_client_from_engine_args(
            self.engine_args
        )
        if self._engine_context is not None:
            # Entered manually, so the context stays open after this method returns.
            self.engine_client = await self._engine_context.__aenter__()
        else:
            raise RuntimeError("Failed to initialize engine client")
        self.engine_client.set_metrics_publisher(self.metrics_publisher)
        # Initially send dummy metrics to kick start,
        # vLLM will not update stat until forward pass is triggered
        worker_stats = WorkerStats(
            0,  # request_active_slots
            1024,  # request_total_slots
            0,  # num_requests_waiting
            None,  # data_parallel_rank
        )
        kv_stats = KvStats(
            0,  # kv_active_blocks
            1024,  # kv_total_blocks
            0.0,  # gpu_cache_usage_perc
            0.0,  # gpu_prefix_cache_hit_rate
        )
        metrics = ForwardPassMetrics(
            worker_stats=worker_stats,
            kv_stats=kv_stats,
            spec_decode_stats=None,
        )
        self.metrics_publisher.publish(metrics)
        # The endpoint is created in a background task so start-up is not
        # blocked on it. NOTE(review): the callback logs the same message
        # whether the task succeeded or failed.
        task = asyncio.create_task(self.create_metrics_publisher_endpoint())
        task.add_done_callback(
            lambda _: logger.info("metrics publisher endpoint created")
        )
        runtime = dynamo_context["runtime"]
        if self.engine_args.remote_prefill:
            # Store this engine's NIXL metadata under its engine id.
            metadata = self.engine_client.nixl_metadata
            metadata_store = NixlMetadataStore("dynamo", runtime)
            await metadata_store.put(metadata.engine_id, metadata)
        if self.engine_args.conditional_disagg:
            self.disaggregated_router = PyDisaggregatedRouter(
                runtime,
                self.namespace,
                max_local_prefill_length=self.engine_args.max_local_prefill_length,
                max_prefill_queue_size=self.engine_args.max_prefill_queue_size,
            )
            await self.disaggregated_router.async_init()
        else:
            self.disaggregated_router = None
        # Set up signal handler for graceful shutdown
        # TODO: move to dynamo sdk
        loop = asyncio.get_running_loop()

        def signal_handler():
            # Schedule the shutdown coroutine instead of calling it directly
            asyncio.create_task(self.graceful_shutdown(runtime))

        for sig in (signal.SIGTERM, signal.SIGINT):
            loop.add_signal_handler(sig, signal_handler)
        logger.info("VllmWorker has been initialized")

    async def graceful_shutdown(self, runtime):
        """Shut down the given DistributedRuntime, logging before and after."""
        logger.info("Received shutdown signal, shutting down DistributedRuntime")
        runtime.shutdown()
        logger.info("DistributedRuntime shutdown complete")

    def shutdown_vllm_engine(self, signum, frame):
        """Shutdown the background loop"""
        logger.info(f"Received signal {signum}, shutting down")
        loop = asyncio.get_event_loop()
        try:
            self.engine_client.close()
            logger.info("VllmWorker shutdown complete")
        except Exception as e:
            logger.error(f"Error during shutdown: {e}")
        finally:
            # The loop is stopped even when closing the engine client failed.
            loop.stop()

    async def create_metrics_publisher_endpoint(self):
        """Create the metrics publisher's endpoint on this service's component."""
        component = dynamo_context["component"]
        logger.info("Creating metrics publisher endpoint with primary lease")
        await self.metrics_publisher.create_endpoint(component)

    def get_remote_prefill_request_callback(self):
        """Return an async callback that enqueues a request on the prefill queue."""
        # TODO: integrate prefill_queue to dynamo endpoint
        async def callback(request: RemotePrefillRequest):
            async with PrefillQueue.get_instance(
                nats_server=self._prefill_queue_nats_server,
                stream_name=self._prefill_queue_stream_name,
            ) as prefill_queue:
                await prefill_queue.enqueue_prefill_request(request)

        return callback

    # TODO: use the same child lease for metrics publisher endpoint and generate endpoint
    @endpoint()
    async def generate(self, request: vLLMGenerateRequest):
        """Run generation for one request and stream the results as JSON strings.

        Args:
            request: Carries the engine prompt, sampling parameters, request
                id and prefix hit rate.

        Yields:
            ``MyRequestOutput(...).model_dump_json()`` for every response the
            engine client produces.
        """
        # TODO: consider prefix hit when deciding prefill locally or remotely
        if self.disaggregated_router is not None:
            async with PrefillQueue.get_instance(
                nats_server=self._prefill_queue_nats_server,
                stream_name=self._prefill_queue_stream_name,
            ) as prefill_queue:
                prefill_queue_size = await prefill_queue.get_queue_size()
            disagg_router_decision = await self.disaggregated_router.prefill_remote(
                len(request.engine_prompt["prompt_token_ids"]),
                request.prefix_hit_rate,
                prefill_queue_size,
            )
        else:
            # always prefill remotely if no disaggregated router is provided
            disagg_router_decision = True
        # Remote prefill additionally requires that it was enabled in the engine args.
        if self.do_remote_prefill and disagg_router_decision:
            remote_prefill_params = RemotePrefillParams(
                is_remote_prefill=True,
                remote_prefill_request_callback=self.get_remote_prefill_request_callback(),
            )
            logger.info(
                f"Prefilling remotely for request {request.request_id} with length {len(request.engine_prompt['prompt_token_ids'])}"
            )
        else:
            remote_prefill_params = None
            logger.info(
                f"Prefilling locally for request {request.request_id} with length {len(request.engine_prompt['prompt_token_ids'])}"
            )
        # rust HTTP requires Delta streaming
        request.sampling_params.output_kind = RequestOutputKind.DELTA
        async for response in self.engine_client.generate(
            prompt=request.engine_prompt,
            sampling_params=request.sampling_params,
            request_id=request.request_id,
            remote_prefill_params=remote_prefill_params,
        ):
            yield MyRequestOutput(
                request_id=response.request_id,
                prompt=response.prompt,
                prompt_token_ids=response.prompt_token_ids,
                prompt_logprobs=response.prompt_logprobs,
                outputs=response.outputs,
                finished=response.finished,
            ).model_dump_json()
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
router-num-threads: 4
common-configs: [model, block-size, max-model-len]
VllmWorker:
enforce-eager: true
max-num-batched-tokens: 16384
enable-prefix-caching: true
ServiceArgs:
workers: 1
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len]
Planner:
environment: local
no-operation: true
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
router: kv
block-size: 64
max-model-len: 16384
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
common-configs: [model, block-size, max-model-len, router]
Router:
min-workers: 1
softmax-sample: true
common-configs: [model, block-size, router]
VllmWorker:
enforce-eager: true
max-num-batched-tokens: 16384
enable-prefix-caching: true
tensor-parallel-size: 1
ServiceArgs:
workers: 1
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len, router, kv-transfer-config]
Planner:
environment: local
no-operation: true
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
common-configs: [model, block-size]
VllmWorker:
remote-prefill: true
conditional-disagg: true
max-local-prefill-length: 10
max-prefill-queue-size: 2
ServiceArgs:
workers: 1
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
PrefillWorker:
max-num-batched-tokens: 16384
ServiceArgs:
workers: 1
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
Planner:
environment: local
no-operation: true
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
router: kv
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
common-configs: [model, block-size, max-model-len, router]
Router:
min-workers: 1
common-configs: [model, block-size, router]
VllmWorker:
max-num-batched-tokens: 16384
remote-prefill: true
conditional-disagg: true
max-local-prefill-length: 10
max-prefill-queue-size: 2
tensor-parallel-size: 1
enable-prefix-caching: true
ServiceArgs:
workers: 1
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len, router, kv-transfer-config]
PrefillWorker:
max-num-batched-tokens: 16384
ServiceArgs:
workers: 1
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len, kv-transfer-config]
Planner:
environment: local
no-operation: true
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This configuration file is used in the multinode-examples.md file
# to start the 405B model on 3 nodes.
Frontend:
served_model_name: nvidia/Llama-3.1-405B-Instruct-FP8
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
model: nvidia/Llama-3.1-405B-Instruct-FP8
block-size: 64
max-model-len: 8192
router: kv
Router:
model: nvidia/Llama-3.1-405B-Instruct-FP8
min-workers: 1
VllmWorker:
model: nvidia/Llama-3.1-405B-Instruct-FP8
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
block-size: 64
max-model-len: 8192
max-num-seqs: 16
remote-prefill: true
conditional-disagg: true
max-local-prefill-length: 10
max-prefill-queue-size: 2
gpu-memory-utilization: 0.95
tensor-parallel-size: 8
router: kv
quantization: modelopt
enable-prefix-caching: true
ServiceArgs:
workers: 1
resources:
gpu: '8'
PrefillWorker:
model: nvidia/Llama-3.1-405B-Instruct-FP8
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
block-size: 64
max-model-len: 8192
max-num-seqs: 16
gpu-memory-utilization: 0.95
tensor-parallel-size: 8
quantization: modelopt
ServiceArgs:
workers: 1
resources:
gpu: '8'
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1
block-size: 64
max-model-len: 16384
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
common-configs: [model, block-size, max-model-len]
VllmWorker:
enforce-eager: true
max-num-batched-tokens: 16384
enable-prefix-caching: true
router: random
tensor-parallel-size: 16
ServiceArgs:
workers: 1
resources:
gpu: '1'
common-configs: [model, block-size, max-model-len]
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common:
model: deepseek-ai/DeepSeek-R1
block-size: 64
max-model-len: 16384
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
tensor-parallel-size: 16
disable-log-requests: true
Frontend:
served_model_name: deepseek-ai/DeepSeek-R1
endpoint: dynamo.Processor.chat/completions
port: 8000
Processor:
router: round-robin
common-configs: [model, block-size]
VllmWorker:
remote-prefill: true
conditional-disagg: false
ServiceArgs:
workers: 1
resources:
gpu: '16'
common-configs: [model, block-size, max-model-len, kv-transfer-config, tensor-parallel-size, disable-log-requests]
PrefillWorker:
max-num-batched-tokens: 16384
ServiceArgs:
workers: 1
resources:
gpu: '16'
common-configs: [model, block-size, max-model-len, kv-transfer-config, tensor-parallel-size, disable-log-requests]
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: llm-agg
spec:
envs:
- name: DYN_DEPLOYMENT_CONFIG
value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","router-num-threads":4,"common-configs":["model","block-size","max-model-len"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
services:
Frontend:
dynamoNamespace: llm-agg
componentType: main
replicas: 1
resources:
requests:
cpu: "1"
memory: "2Gi"
limits:
cpu: "1"
memory: "2Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir: /workspace/examples/llm
args:
- dynamo
- serve
- graphs.agg:Frontend
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- Frontend
Processor:
dynamoNamespace: llm-agg
componentType: worker
replicas: 1
resources:
requests:
cpu: "1"
memory: "2Gi"
limits:
cpu: "1"
memory: "2Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir: /workspace/examples/llm
args:
- dynamo
- serve
- graphs.agg:Processor
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- Processor
VllmWorker:
envFromSecret: hf-token-secret
dynamoNamespace: llm-agg
replicas: 1
resources:
requests:
cpu: "10"
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
gpu: "1"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir: /workspace/examples/llm
args:
- dynamo
- serve
- graphs.agg:VllmWorker
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- VllmWorker
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: agg-router
spec:
envs:
- name: DYN_DEPLOYMENT_CONFIG
value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","router":"kv","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"softmax-sample":true,"common-configs":["model","block-size","router"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"tensor-parallel-size":1,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
services:
Frontend:
dynamoNamespace: llm-agg-router
componentType: main
replicas: 1
resources:
requests:
cpu: "1"
memory: "2Gi"
limits:
cpu: "1"
memory: "2Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir: /workspace/examples/llm
args:
- dynamo
- serve
- graphs.agg_router:Frontend
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- Frontend
Processor:
dynamoNamespace: llm-agg-router
componentType: worker
replicas: 1
resources:
requests:
cpu: "1"
memory: "2Gi"
limits:
cpu: "1"
memory: "2Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir: /workspace/examples/llm
args:
- dynamo
- serve
- graphs.agg_router:Processor
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- Processor
Router:
dynamoNamespace: llm-agg-router
componentType: worker
replicas: 1
resources:
requests:
cpu: "1"
memory: "2Gi"
limits:
cpu: "1"
memory: "2Gi"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir: /workspace/examples/llm
args:
- dynamo
- serve
- graphs.agg_router:Router
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- Router
VllmWorker:
envFromSecret: hf-token-secret
dynamoNamespace: llm-agg-router
replicas: 1
resources:
requests:
cpu: "10"
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
gpu: "1"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir: /workspace/examples/llm
args:
- dynamo
- serve
- graphs.agg_router:VllmWorker
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- VllmWorker
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# DynamoGraphDeployment for the disaggregated LLM example: a Frontend, a
# Processor, a decode worker (VllmWorker) and a PrefillWorker, one replica each.
# Every service runs the same vllm-runtime image from /workspace/examples/llm
# and is started with `dynamo serve graphs.disagg:<Service>`.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: llm-disagg
spec:
  envs:
    # Per-service configuration, passed as a single JSON document in one
    # environment variable. The value is single-quoted so the embedded
    # double quotes and backslashes reach the consumer unchanged.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","common-configs":["model","block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
  services:
    # CPU-only services: requests equal limits (1 CPU / 2Gi).
    Frontend:
      dynamoNamespace: llm-disagg
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    Processor:
      dynamoNamespace: llm-disagg
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
    # GPU services: 10 CPU / 20Gi / 1 GPU each, with environment taken from
    # the `hf-token-secret` secret.
    # NOTE(review): unlike Frontend/Processor, no componentType is set on the
    # two services below -- confirm the default is what is wanted.
    VllmWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-disagg
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker
    PrefillWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-disagg
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg:PrefillWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - PrefillWorker
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# DynamoGraphDeployment for the disaggregated LLM example with a KV router:
# Frontend, Processor, Router, a decode worker (VllmWorker) and a
# PrefillWorker, one replica each. Every service runs the same vllm-runtime
# image from /workspace/examples/llm and is started with
# `dynamo serve graphs.disagg_router:<Service>`.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  # NOTE(review): the deployment is named `disagg-router` while every service
  # below uses dynamoNamespace `llm-disagg-router` -- confirm the difference
  # is intentional.
  name: disagg-router
spec:
  envs:
    # Per-service configuration, passed as a single JSON document in one
    # environment variable. The value is single-quoted so the embedded
    # double quotes and backslashes reach the consumer unchanged.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"router":"kv","kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"common-configs":["model","block-size","router"]},"VllmWorker":{"max-num-batched-tokens":16384,"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"tensor-parallel-size":1,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
  services:
    # CPU-only services: requests equal limits (1 CPU / 2Gi).
    Frontend:
      dynamoNamespace: llm-disagg-router
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg_router:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    Processor:
      dynamoNamespace: llm-disagg-router
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg_router:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
    Router:
      dynamoNamespace: llm-disagg-router
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg_router:Router
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Router
    # GPU services: 10 CPU / 20Gi / 1 GPU each, with environment taken from
    # the `hf-token-secret` secret.
    # NOTE(review): unlike Frontend/Processor/Router, no componentType is set
    # on the two services below -- confirm the default is what is wanted.
    VllmWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-disagg-router
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg_router:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker
    PrefillWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-disagg-router
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg_router:PrefillWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - PrefillWorker
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from components.frontend import Frontend
from components.planner_service import Planner
from components.processor import Processor
from components.worker import VllmWorker
# Aggregated graph wiring, executed as an import-time side effect.
# The chained call links Frontend to Processor, then calls .link(VllmWorker)
# on whatever the first link() returns.
# NOTE(review): presumably link() returns the linked-to service, giving
# Frontend -> Processor -> VllmWorker -- confirm against the SDK.
Frontend.link(Processor).link(VllmWorker)
# Planner is linked from Frontend in a separate call, not on the chain above.
Frontend.link(Planner)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from components.frontend import Frontend
from components.kv_router import Router
from components.planner_service import Planner
from components.processor import Processor
from components.worker import VllmWorker
# Aggregated graph wiring with a KV router, executed as an import-time side
# effect. The chained call links Frontend to Processor, then calls
# .link(Router) and .link(VllmWorker) on each preceding link() result.
# NOTE(review): presumably link() returns the linked-to service, giving
# Frontend -> Processor -> Router -> VllmWorker -- confirm against the SDK.
Frontend.link(Processor).link(Router).link(VllmWorker)
# Planner is linked from Frontend in a separate call, not on the chain above.
Frontend.link(Planner)
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from components.frontend import Frontend
from components.planner_service import Planner
from components.prefill_worker import PrefillWorker
from components.processor import Processor
from components.worker import VllmWorker
# Disaggregated graph wiring, executed as an import-time side effect.
# The chained call links Frontend to Processor, then calls .link(VllmWorker)
# and .link(PrefillWorker) on each preceding link() result.
# NOTE(review): presumably link() returns the linked-to service, giving
# Frontend -> Processor -> VllmWorker -> PrefillWorker -- confirm against the SDK.
Frontend.link(Processor).link(VllmWorker).link(PrefillWorker)
# Planner is linked from Frontend in a separate call, not on the chain above.
Frontend.link(Planner)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment