Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
f00d700e
"vllm/vscode:/vscode.git/clone" did not exist on "2dc182c0fb17dc0ef09535ed8781c1475cb79e4f"
Unverified
Commit
f00d700e
authored
Jul 14, 2025
by
Alec
Committed by
GitHub
Jul 14, 2025
Browse files
refactor: remove old examples with old UX (#1899)
parent
c7080419
Changes
111
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
2342 deletions
+0
-2342
examples/llm/components/planner.py
examples/llm/components/planner.py
+0
-495
examples/llm/components/planner_service.py
examples/llm/components/planner_service.py
+0
-114
examples/llm/components/prefill_worker.py
examples/llm/components/prefill_worker.py
+0
-211
examples/llm/components/processor.py
examples/llm/components/processor.py
+0
-351
examples/llm/components/worker.py
examples/llm/components/worker.py
+0
-248
examples/llm/configs/agg.yaml
examples/llm/configs/agg.yaml
+0
-42
examples/llm/configs/agg_router.yaml
examples/llm/configs/agg_router.yaml
+0
-48
examples/llm/configs/disagg.yaml
examples/llm/configs/disagg.yaml
+0
-51
examples/llm/configs/disagg_router.yaml
examples/llm/configs/disagg_router.yaml
+0
-58
examples/llm/configs/multinode-405b.yaml
examples/llm/configs/multinode-405b.yaml
+0
-66
examples/llm/configs/multinode_agg_r1.yaml
examples/llm/configs/multinode_agg_r1.yaml
+0
-39
examples/llm/configs/mutinode_disagg_r1.yaml
examples/llm/configs/mutinode_disagg_r1.yaml
+0
-47
examples/llm/deploy/agg.yaml
examples/llm/deploy/agg.yaml
+0
-100
examples/llm/deploy/agg_router.yaml
examples/llm/deploy/agg_router.yaml
+0
-125
examples/llm/deploy/disagg.yaml
examples/llm/deploy/disagg.yaml
+0
-127
examples/llm/deploy/disagg_router.yaml
examples/llm/deploy/disagg_router.yaml
+0
-152
examples/llm/graphs/__init__.py
examples/llm/graphs/__init__.py
+0
-0
examples/llm/graphs/agg.py
examples/llm/graphs/agg.py
+0
-22
examples/llm/graphs/agg_router.py
examples/llm/graphs/agg_router.py
+0
-23
examples/llm/graphs/disagg.py
examples/llm/graphs/disagg.py
+0
-23
No files found.
examples/llm/components/planner.py
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
import
asyncio
import
json
import
logging
import
os
import
time
from
datetime
import
datetime
from
typing
import
Any
,
List
import
numpy
as
np
from
rich.console
import
Console
from
rich.table
import
Table
from
tensorboardX
import
SummaryWriter
from
utils.prefill_queue
import
PrefillQueue
from
dynamo.llm
import
KvMetricsAggregator
from
dynamo.planner
import
KubernetesConnector
,
LocalConnector
from
dynamo.planner.defaults
import
LoadPlannerDefaults
from
dynamo.runtime
import
DistributedRuntime
,
dynamo_worker
from
dynamo.runtime.logging
import
configure_dynamo_logging
configure_dynamo_logging
()
logger
=
logging
.
getLogger
(
__name__
)
# Number of adjustment intervals after a new decode worker is added during which
# the planner will not scale decode workers down. This gives the new decode
# worker time to populate its KV cache.
NEW_DECODE_WORKER_GRACE_PERIOD
=
3
# Number of adjustment intervals used to extrapolate the prefill queue size.
# A prefill worker is not added when, following the trend observed in the current
# adjustment interval, the queue size is predicted to fall to or below
# --prefill-queue-scale-up-threshold within this many adjustment intervals.
# This accounts for the time a new prefill worker needs to start.
NEW_PREFILL_WORKER_QUEUE_BUFFER_PERIOD
=
3
class Planner:
    """Load-based planner that scales prefill and decode workers up or down.

    The planner periodically samples the prefill queue size and the per-endpoint
    KV cache load, and at every adjustment interval asks its connector to add or
    remove ``PrefillWorker`` / ``VllmWorker`` components based on the thresholds
    carried in ``args``.
    """

    def __init__(self, runtime: DistributedRuntime, args: argparse.Namespace):
        """Store the runtime/arguments and pick the connector for the environment.

        Args:
            runtime: Distributed runtime used to look up components and clients.
            args: Parsed planner options (namespace, environment, thresholds, ...).
                ``args.log_dir`` is filled in with a timestamped default when it
                is ``None``.

        Raises:
            ValueError: If ``args.environment`` is neither ``"local"`` nor
                ``"kubernetes"``.
        """
        self.runtime = runtime
        self.args = args
        self.namespace = args.namespace

        if args.environment == "local":
            self.connector = LocalConnector(args.namespace, runtime)
        elif args.environment == "kubernetes":
            self.connector = KubernetesConnector(args.namespace)
        else:
            raise ValueError(f"Invalid environment: {args.environment}")

        self._prefill_queue_nats_server = os.getenv(
            "NATS_SERVER", "nats://localhost:4222"
        )
        self._prefill_queue_stream_name = f"{self.namespace}_prefill_queue"

        # Clients are created lazily on the first call to get_workers_info().
        self.prefill_client: Any | None = None
        self.workers_client: Any | None = None
        self.p_endpoints: List[int] = []
        self.d_endpoints: List[int] = []
        # Remaining adjustment intervals during which decode scale-down is skipped.
        self.decode_worker_remaining_grace_period = 0

        if args.log_dir is None:
            args.log_dir = f"logs/{datetime.now().strftime('%m%d_%H%M%S')}"
        self.writer = SummaryWriter(args.log_dir)

        logger.info(f"Components present in namespace: {args.namespace}")

        self.init_time = time.time()

        # Set the appropriate logger function for repeated metric logging
        self._repeating_log_func = logger.debug if args.no_operation else logger.info

    @staticmethod
    def _mean_or_nan(samples) -> float:
        """Return the mean of ``samples``, or NaN when there are no samples.

        ``np.mean`` of an empty sequence also yields NaN but emits a
        RuntimeWarning; this makes the "no data" case explicit and silent.
        """
        return float(np.mean(samples)) if len(samples) else float("nan")

    async def set_metric_aggregator(self):
        """Create the KV metrics aggregator on the ``VllmWorker`` component."""
        # TODO: separate KV metrics and prefill metrics
        kv_listener = self.runtime.namespace(self.namespace).component("VllmWorker")
        await kv_listener.create_service()
        self.metrics_aggregator = KvMetricsAggregator(kv_listener)

    async def _list_instance_ids(self, client_attr: str, component: str, endpoint: str):
        """Return the instance ids of ``component``/``endpoint``.

        The client is created on first use and cached on ``self`` under
        ``client_attr``.
        """
        client = getattr(self, client_attr)
        if client is None:
            client = (
                await self.runtime.namespace(self.namespace)
                .component(component)
                .endpoint(endpoint)
                .client()
            )
            setattr(self, client_attr, client)
            # TODO: remove this sleep after rust client() is blocking until watching state
            await asyncio.sleep(0.1)
        # TODO: use etcd events instead of pulling instance_ids
        return client.instance_ids()

    async def get_workers_info(self):
        """Return ``(prefill_instance_ids, decode_instance_ids)``.

        A failure to list prefill workers is treated as aggregated mode and
        yields an empty prefill list.

        Raises:
            RuntimeError: If the decode worker endpoints cannot be listed.
        """
        try:
            p_endpoints = await self._list_instance_ids(
                "prefill_client", "PrefillWorker", "mock"
            )
        except Exception:
            p_endpoints = []
            self._repeating_log_func(
                "No prefill workers found, operating in aggregated mode"
            )

        try:
            d_endpoints = await self._list_instance_ids(
                "workers_client", "VllmWorker", "generate"
            )
        except Exception as e:
            # Chain the original error so the root cause stays in the traceback.
            raise RuntimeError(f"Failed to get decode worker endpoints: {e}") from e

        return p_endpoints, d_endpoints

    async def reset_adjustment_interval(self):
        """Refresh the worker lists and clear the samples of the last interval."""
        self._repeating_log_func(
            f"Reset metrics for new adjustment interval at t={time.time() - self.init_time:.1f}s"
        )

        self.p_endpoints, self.d_endpoints = await self.get_workers_info()

        self._repeating_log_func(
            f"Number of prefill workers: {len(self.p_endpoints)}, number of decode workers: {len(self.d_endpoints)}"
        )

        self.metrics_collection_time = []
        self.prefill_queue_load = []
        self.kv_load = []

        self.last_adjustment_time = time.time()

    async def collect_metrics(self):
        """Sample prefill queue size, KV load and worker counts; log them to TensorBoard.

        Failures while reading the prefill queue or while processing the KV
        metrics are logged and do not abort the collection.
        """
        self._repeating_log_func(
            f"Collecting metrics at t={time.time() - self.init_time:.1f}s"
        )

        # collect prefill queue load
        try:
            async with PrefillQueue.get_instance(
                nats_server=self._prefill_queue_nats_server,
                stream_name=self._prefill_queue_stream_name,
            ) as prefill_queue:
                prefill_queue_size = await prefill_queue.get_queue_size()
                measure_time = time.time() - self.init_time
            self.prefill_queue_load.append(prefill_queue_size)
            self._repeating_log_func(
                f"Collected prefill queue size at t={measure_time:.1f}s: {int(prefill_queue_size)}"
            )
            self.writer.add_scalar(
                "prefill_queue_size", prefill_queue_size, measure_time
            )
        except Exception as e:
            self._repeating_log_func(
                f"Failed to collect prefill queue size metrics: {e}"
            )

        # collect kv load
        total_active_requests: int = 0
        total_queued_requests: int = 0
        metrics = await self.metrics_aggregator.get_metrics()
        try:
            prev_kv_load_len = len(self.kv_load)
            for endpoint in metrics.endpoints:
                kv_load = getattr(endpoint, "gpu_cache_usage_perc", 0.0)
                num_requests_waiting = getattr(endpoint, "num_requests_waiting", 0)
                total_queued_requests += num_requests_waiting
                request_active_slots = getattr(endpoint, "request_active_slots", None)
                if request_active_slots:
                    total_active_requests += request_active_slots
                    if num_requests_waiting > 0:
                        # estimate kv load after waiting requests are scheduled based on current isl/osl
                        # TODO: use actual isl/osl estimation after the request_active_slot bug in disagg is fixed
                        # Currently, we assume each request uses 0.02 kv cache
                        # kv_load = kv_load * (request_active_slots + num_requests_waiting) / request_active_slots
                        kv_load = kv_load + 0.02 * num_requests_waiting
                self.kv_load.append(kv_load)
            measure_time = time.time() - self.init_time
            new_kv_samples = self.kv_load[prev_kv_load_len:]
            self._repeating_log_func(
                f"Collected kv load at t={measure_time:.1f}s: {new_kv_samples} (act/pnd req: {total_active_requests}/{total_queued_requests})"
            )
            # NaN (without a NumPy warning) when no endpoint reported metrics.
            average_kv_load = self._mean_or_nan(new_kv_samples)
            self.writer.add_scalar("average_kv_load", average_kv_load, measure_time)
            self.writer.add_scalar(
                "total_queued_requests", total_queued_requests, measure_time
            )
        except Exception as e:
            self._repeating_log_func(f"Failed to collect kv load metrics: {e}")

        p_endpoints, d_endpoints = await self.get_workers_info()
        self.writer.add_scalar(
            "num_prefill_workers", len(p_endpoints), time.time() - self.init_time
        )
        self.writer.add_scalar(
            "num_decode_workers", len(d_endpoints), time.time() - self.init_time
        )
        curr_gpu_usage = (
            len(p_endpoints) * self.args.prefill_engine_num_gpu
            + len(d_endpoints) * self.args.decode_engine_num_gpu
        )
        self.writer.add_scalar("num_gpu", curr_gpu_usage, time.time() - self.init_time)

        self.metrics_collection_time.append(time.time())

    async def make_adjustments(self):
        """Scale prefill/decode workers based on the samples of the current interval.

        Nothing is changed when the number of workers differs from what was
        recorded at the start of the interval. Scale-down is evaluated before
        scale-up, and scale-up respects ``args.max_gpu_budget``.
        """
        # Note: all adjustments are blocking. Non-blocking adjustment and metric pulling
        # make the optimization problem too complex and should not be needed in most cases.

        logger.info(f"Making adjustments at t={time.time() - self.init_time:.1f}s")

        # check if decode/prefill workers is still the same
        # note that we only check length as endpoint ids might change
        new_p_endpoints, new_d_endpoints = await self.get_workers_info()
        if len(new_p_endpoints) != len(self.p_endpoints) or len(
            new_d_endpoints
        ) != len(self.d_endpoints):
            logger.info("Decode/prefill workers changed, no adjustments will be made")
            return

        # compute current gpu usage
        curr_gpu_usage = (
            len(self.p_endpoints) * self.args.prefill_engine_num_gpu
            + len(self.d_endpoints) * self.args.decode_engine_num_gpu
        )
        logger.info(f"Current engines use {curr_gpu_usage} GPUs")

        # Per-prefill-worker queue load. With no prefill workers (aggregated mode)
        # the per-worker value is undefined: a non-empty backlog counts as infinite
        # load and an empty or unsampled one as NaN, which fails every threshold
        # comparison below. These are the values NumPy's division by zero produced
        # before, now without the RuntimeWarning.
        num_prefill_workers = len(self.p_endpoints)
        mean_prefill_queue_load = self._mean_or_nan(self.prefill_queue_load)
        if num_prefill_workers:
            avg_prefill_queue_load = mean_prefill_queue_load / num_prefill_workers
        elif mean_prefill_queue_load > 0:
            avg_prefill_queue_load = float("inf")
        else:
            avg_prefill_queue_load = float("nan")
        avg_kv_load = self._mean_or_nan(self.kv_load)

        # first check if we need to scale down any workers
        if (
            avg_prefill_queue_load < self.args.prefill_queue_scale_down_threshold
            and len(self.p_endpoints) > self.args.min_endpoint
        ):
            logger.info(
                f"Average prefill queue load ({avg_prefill_queue_load:.2f}) is below threshold ({self.args.prefill_queue_scale_down_threshold:.2f}), scaling down prefill workers"
            )
            success = await self.connector.remove_component("PrefillWorker")
            if success:
                curr_gpu_usage -= self.args.prefill_engine_num_gpu
            else:
                logger.info("Failed to scale down prefill worker")

        if (
            avg_kv_load < self.args.decode_kv_scale_down_threshold
            and len(self.d_endpoints) > self.args.min_endpoint
        ):
            if self.decode_worker_remaining_grace_period > 0:
                logger.info(
                    f"Decode worker remaining grace period is {self.decode_worker_remaining_grace_period}, skipping scale down"
                )
            else:
                logger.info(
                    f"Average kv load ({avg_kv_load:.2f}) is below threshold ({self.args.decode_kv_scale_down_threshold:.2f}), scaling down decode workers"
                )
                success = await self.connector.remove_component("VllmWorker")
                if success:
                    curr_gpu_usage -= self.args.decode_engine_num_gpu
                else:
                    logger.info("Failed to scale down decode worker")

        # check if we need to scale up workers
        # we first check for prefill worker because prefill queueing can also lead
        # to high kv load on decode workers
        if (
            avg_prefill_queue_load > self.args.prefill_queue_scale_up_threshold
            and curr_gpu_usage + self.args.prefill_engine_num_gpu
            <= self.args.max_gpu_budget
        ):
            logger.info(
                f"Average prefill queue load ({avg_prefill_queue_load:.2f}) is above threshold ({self.args.prefill_queue_scale_up_threshold:.2f})"
            )
            # check prefill queue size trend: extrapolate the change seen during this
            # interval over the next NEW_PREFILL_WORKER_QUEUE_BUFFER_PERIOD intervals.
            # The sample list is non-empty here, otherwise the average would be NaN.
            prefill_queue_size_change = (
                self.prefill_queue_load[-1] - self.prefill_queue_load[0]
            )
            predicted_prefill_future_queue_size = (
                self.prefill_queue_load[-1]
                + prefill_queue_size_change * NEW_PREFILL_WORKER_QUEUE_BUFFER_PERIOD
            )
            if (
                predicted_prefill_future_queue_size
                > self.args.prefill_queue_scale_up_threshold
            ):
                logger.info(
                    f"Predicted future prefill queue size ({predicted_prefill_future_queue_size:.2f}) is also above threshold ({self.args.prefill_queue_scale_up_threshold:.2f}), scaling up prefill workers"
                )
                success = await self.connector.add_component("PrefillWorker")
                if success:
                    curr_gpu_usage += self.args.prefill_engine_num_gpu
                else:
                    logger.info("Failed to scale up prefill worker")
            else:
                logger.info(
                    f"Predicted future prefill queue size ({predicted_prefill_future_queue_size:.2f}) is below threshold ({self.args.prefill_queue_scale_up_threshold:.2f}), skipping prefill worker scaling"
                )

        if (
            avg_kv_load > self.args.decode_kv_scale_up_threshold
            and curr_gpu_usage + self.args.decode_engine_num_gpu
            <= self.args.max_gpu_budget
        ):
            logger.info(
                f"Average kv load ({avg_kv_load:.2f}) is above threshold ({self.args.decode_kv_scale_up_threshold:.2f}), scaling up decode workers"
            )
            success = await self.connector.add_component("VllmWorker")
            if success:
                curr_gpu_usage += self.args.decode_engine_num_gpu
                # Protect the new decode worker from an immediate scale-down.
                self.decode_worker_remaining_grace_period = (
                    NEW_DECODE_WORKER_GRACE_PERIOD
                )
            else:
                logger.info("Failed to scale up decode worker")

        # no adjustment needed, just log the current metrics
        if (
            avg_prefill_queue_load > self.args.prefill_queue_scale_down_threshold
            and avg_prefill_queue_load < self.args.prefill_queue_scale_up_threshold
        ):
            logger.info(
                f"Average prefill queue load ({avg_prefill_queue_load:.2f}) is within threshold, no prefill worker scaling needed"
            )

        if (
            avg_kv_load > self.args.decode_kv_scale_down_threshold
            and avg_kv_load < self.args.decode_kv_scale_up_threshold
        ):
            logger.info(
                f"Average kv load ({avg_kv_load:.2f}) is within threshold, no decode worker scaling needed"
            )

        logger.info(f"Engines after adjustment use {curr_gpu_usage} GPUs")

        if self.decode_worker_remaining_grace_period > 0:
            self.decode_worker_remaining_grace_period -= 1

    async def run(self):
        """Main loop for the planner"""

        await self.set_metric_aggregator()

        if self._repeating_log_func == logger.debug:
            logger.info(
                "Running in no-operation mode - detailed metrics will be logged at DEBUG level"
            )

        await self.reset_adjustment_interval()

        while True:
            current_time = time.time()

            # Collect metrics at each metric pulling interval
            if (
                len(self.metrics_collection_time) == 0
                or current_time - self.metrics_collection_time[-1]
                >= self.args.metric_pulling_interval
            ):
                await self.collect_metrics()

            # Check if it's time for adjustment
            if (
                current_time - self.last_adjustment_time
                >= self.args.adjustment_interval
            ):
                if not self.args.no_operation:
                    # blockingly make adjustments to avoid overcompensation
                    await self.make_adjustments()
                await self.reset_adjustment_interval()

            # Sleep to avoid busy waiting
            await asyncio.sleep(self.args.metric_pulling_interval / 10)
# @dynamo_worker()
# TODO: let's make it such that planner still works via CLI invocation
async def start_planner(runtime: DistributedRuntime, args: argparse.Namespace):
    """Build a Planner, print the components found in etcd, then run the planner loop.

    Args:
        runtime: Distributed runtime handed to the planner and used to query etcd.
        args: Planner options; ``args.namespace`` is used as the etcd key prefix.
    """
    planner = Planner(runtime, args)

    out = Console()
    summary = Table()
    for heading, colour in (("Component", "cyan"), ("Endpoint", "green")):
        summary.add_column(heading, style=colour)

    entries = await runtime.etcd_client().kv_get_prefix(args.namespace)
    for entry in entries:
        try:
            payload = json.loads(entry["value"].decode("utf-8"))
            if "component" not in payload:
                continue
            summary.add_row(payload["component"], payload["endpoint"])
        except Exception:
            # Some entries may not be valid JSON or might be binary data
            continue

    out.print(summary)

    await planner.run()
if __name__ == "__main__":
    # Command-line entry point: every option defaults to the matching
    # LoadPlannerDefaults attribute. Options are registered in table order.
    option_specs = (
        # Common planner arguments
        (
            "--namespace",
            {
                "type": str,
                "default": LoadPlannerDefaults.namespace,
                "help": "Namespace planner will look at",
            },
        ),
        (
            "--environment",
            {
                "type": str,
                "default": LoadPlannerDefaults.environment,
                "help": "Environment to run the planner in (local, kubernetes)",
            },
        ),
        (
            "--no-operation",
            {
                "action": "store_true",
                "default": LoadPlannerDefaults.no_operation,
                "help": "Do not make any adjustments, just observe the metrics",
            },
        ),
        (
            "--log-dir",
            {
                "type": str,
                "default": LoadPlannerDefaults.log_dir,
                "help": "Tensorboard logging directory",
            },
        ),
        (
            "--adjustment-interval",
            {
                "type": int,
                "default": LoadPlannerDefaults.adjustment_interval,
                "help": "Interval in seconds between scaling adjustments",
            },
        ),
        (
            "--max-gpu-budget",
            {
                "type": int,
                "default": LoadPlannerDefaults.max_gpu_budget,
                "help": "Maximum number of GPUs to use",
            },
        ),
        (
            "--min-endpoint",
            {
                "type": int,
                "default": LoadPlannerDefaults.min_endpoint,
                "help": "Minimum number of endpoints to keep for prefill/decode workers",
            },
        ),
        (
            "--metric-pulling-interval",
            {
                "type": int,
                "default": LoadPlannerDefaults.metric_pulling_interval,
                "help": "Interval in seconds between metric pulls",
            },
        ),
        (
            "--decode-engine-num-gpu",
            {
                "type": int,
                "default": LoadPlannerDefaults.decode_engine_num_gpu,
                "help": "Number of GPUs per decode engine",
            },
        ),
        (
            "--prefill-engine-num-gpu",
            {
                "type": int,
                "default": LoadPlannerDefaults.prefill_engine_num_gpu,
                "help": "Number of GPUs per prefill engine",
            },
        ),
        # Load-planner specific arguments
        (
            "--decode-kv-scale-up-threshold",
            {
                "type": float,
                "default": LoadPlannerDefaults.decode_kv_scale_up_threshold,
                "help": "KV cache utilization threshold to scale up decode workers",
            },
        ),
        (
            "--decode-kv-scale-down-threshold",
            {
                "type": float,
                "default": LoadPlannerDefaults.decode_kv_scale_down_threshold,
                "help": "KV cache utilization threshold to scale down decode workers",
            },
        ),
        (
            "--prefill-queue-scale-up-threshold",
            {
                "type": float,
                "default": LoadPlannerDefaults.prefill_queue_scale_up_threshold,
                "help": "Queue utilization threshold to scale up prefill workers, this threshold is per prefill worker",
            },
        ),
        (
            "--prefill-queue-scale-down-threshold",
            {
                "type": float,
                "default": LoadPlannerDefaults.prefill_queue_scale_down_threshold,
                "help": "Queue utilization threshold to scale down prefill workers, this threshold is per prefill worker",
            },
        ),
    )

    parser = argparse.ArgumentParser()
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    args = parser.parse_args()
    asyncio.run(dynamo_worker()(start_planner)(args))
examples/llm/components/planner_service.py
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
import
logging
from
pydantic
import
BaseModel
from
components.planner
import
start_planner
# type: ignore[attr-defined]
from
dynamo.planner.defaults
import
LoadPlannerDefaults
from
dynamo.runtime.logging
import
configure_dynamo_logging
from
dynamo.sdk
import
async_on_start
,
dynamo_context
,
endpoint
,
service
from
dynamo.sdk.core.protocol.interface
import
ComponentType
from
dynamo.sdk.lib.config
import
ServiceConfig
from
dynamo.sdk.lib.image
import
DYNAMO_IMAGE
logger
=
logging
.
getLogger
(
__name__
)
# Request payload accepted by the planner service's placeholder `generate`
# endpoint: a single text field.
class RequestType(BaseModel):
    text: str
@service(
    dynamo={
        "namespace": "dynamo",
        "component_type": ComponentType.PLANNER,
    },
    resources={"cpu": "10", "memory": "20Gi"},
    workers=1,
    image=DYNAMO_IMAGE,
)
class Planner:
    """Service wrapper that assembles planner options and launches ``start_planner``."""

    def __init__(self):
        """Read the "Planner" service configuration into an ``argparse.Namespace``.

        Each option is looked up under its hyphenated name and falls back to the
        attribute of the same (underscored) name on ``LoadPlannerDefaults``.
        """
        configure_dynamo_logging(service_name="Planner")
        logger.info("Starting planner")
        self.runtime = dynamo_context["runtime"]

        service_config = ServiceConfig.get_instance()

        # Get namespace directly from dynamo_context as it contains the active namespace
        self.namespace = dynamo_context["namespace"]
        planner_config = service_config.get("Planner", {})

        option_names = (
            "environment",
            "no_operation",
            "log_dir",
            "adjustment_interval",
            "metric_pulling_interval",
            "max_gpu_budget",
            "min_endpoint",
            "decode_kv_scale_up_threshold",
            "decode_kv_scale_down_threshold",
            "prefill_queue_scale_up_threshold",
            "prefill_queue_scale_down_threshold",
            "decode_engine_num_gpu",
            "prefill_engine_num_gpu",
        )
        resolved = {"namespace": self.namespace}
        for option in option_names:
            # Config keys use hyphens where the Namespace attributes use underscores.
            resolved[option] = planner_config.get(
                option.replace("_", "-"), getattr(LoadPlannerDefaults, option)
            )
        self.args = argparse.Namespace(**resolved)

    @async_on_start
    async def async_init(self):
        """Wait 30 seconds, then run the planner with the collected options."""
        import asyncio

        await asyncio.sleep(30)
        logger.info("Calling start_planner")
        await start_planner(self.runtime, self.args)
        logger.info("Planner started")

    @endpoint()
    async def generate(self, request: RequestType):
        """Placeholder endpoint so that this component exposes at least one endpoint."""
        yield "mock endpoint"
examples/llm/components/prefill_worker.py
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
logging
import
os
import
signal
import
sys
from
pydantic
import
BaseModel
from
utils.nixl
import
NixlMetadataStore
from
utils.prefill_queue
import
PrefillQueue
from
utils.vllm
import
parse_vllm_args
from
vllm.entrypoints.openai.api_server
import
(
build_async_engine_client_from_engine_args
,
)
from
vllm.inputs.data
import
TokensPrompt
from
vllm.remote_prefill
import
RemotePrefillParams
,
RemotePrefillRequest
from
dynamo.sdk
import
async_on_start
,
dynamo_context
,
endpoint
,
service
logger
=
logging
.
getLogger
(
__name__
)
# Request payload accepted by the prefill worker's `mock` endpoint: a single
# text field.
class RequestType(BaseModel):
    text: str
@
service
(
dynamo
=
{
"namespace"
:
"dynamo"
,
},
resources
=
{
"gpu"
:
1
,
"cpu"
:
"10"
,
"memory"
:
"20Gi"
},
workers
=
1
,
)
class
PrefillWorker
:
def
__init__
(
self
):
class_name
=
self
.
__class__
.
__name__
self
.
engine_args
=
parse_vllm_args
(
class_name
,
""
)
self
.
_loaded_metadata
=
set
()
self
.
initialized
=
False
if
self
.
engine_args
.
enable_chunked_prefill
is
not
False
:
logger
.
info
(
"Chunked prefill is not supported yet, setting to False"
)
self
.
engine_args
.
enable_chunked_prefill
=
False
if
self
.
engine_args
.
pipeline_parallel_size
!=
1
:
logger
.
info
(
"Pipeline parallel size is not supported yet, setting to 1"
)
self
.
engine_args
.
pipeline_parallel_size
=
1
if
self
.
engine_args
.
disable_async_output_proc
is
not
True
:
logger
.
info
(
"Async output processing is not supported yet, setting to True"
)
self
.
engine_args
.
disable_async_output_proc
=
True
if
self
.
engine_args
.
enforce_eager
is
not
True
:
logger
.
info
(
"Prefill must be done eagerly, setting to True"
)
self
.
engine_args
.
enforce_eager
=
True
if
self
.
engine_args
.
enable_prefix_caching
is
not
False
:
logger
.
info
(
"Prefix caching is not supported yet in prefill worker, setting to False"
)
self
.
engine_args
.
enable_prefix_caching
=
False
@
async_on_start
async
def
async_init
(
self
):
self
.
_engine_context
=
build_async_engine_client_from_engine_args
(
self
.
engine_args
)
if
self
.
_engine_context
is
not
None
:
self
.
engine_client
=
await
self
.
_engine_context
.
__aenter__
()
else
:
raise
RuntimeError
(
"Failed to initialize engine client"
)
runtime
=
dynamo_context
[
"runtime"
]
metadata
=
self
.
engine_client
.
nixl_metadata
self
.
_metadata_store
=
NixlMetadataStore
(
"dynamo"
,
runtime
)
await
self
.
_metadata_store
.
put
(
metadata
.
engine_id
,
metadata
)
self
.
task
=
asyncio
.
create_task
(
self
.
prefill_queue_handler
())
def
prefill_queue_handler_cb
(
fut
):
try
:
fut
.
result
()
logger
.
info
(
"prefill queue handler exited successfully"
)
except
Exception
as
e
:
logger
.
error
(
f
"[ERROR] prefill queue handler failed:
{
e
!
r
}
"
)
sys
.
exit
(
1
)
self
.
task
.
add_done_callback
(
prefill_queue_handler_cb
)
self
.
shutdown_requested
=
False
# Set up signal handler for graceful shutdown
# TODO: move to dynamo sdk
loop
=
asyncio
.
get_running_loop
()
def
signal_handler
():
# Schedule the shutdown coroutine instead of calling it directly
asyncio
.
create_task
(
self
.
graceful_shutdown
(
runtime
))
for
sig
in
(
signal
.
SIGTERM
,
signal
.
SIGINT
):
loop
.
add_signal_handler
(
sig
,
signal_handler
)
logger
.
info
(
"PrefillWorker initialized"
)
async
def
graceful_shutdown
(
self
,
runtime
):
logger
.
info
(
"Received shutdown signal, shutting down DistributedRuntime"
)
# first shutdown the vllm engine
self
.
shutdown_requested
=
True
await
asyncio
.
wait_for
(
self
.
task
,
timeout
=
None
)
# then shutdown the mock endpoint
runtime
.
shutdown
()
logger
.
info
(
"DistributedRuntime shutdown complete"
)
def
shutdown_vllm_engine
(
self
):
"""Shutdown the background loop"""
logger
.
info
(
"Shutting down vllm engine"
)
loop
=
asyncio
.
get_event_loop
()
try
:
self
.
engine_client
.
close
()
logger
.
info
(
"PrefillWorker shutdown complete"
)
except
Exception
as
e
:
logger
.
error
(
f
"Error during shutdown:
{
e
}
"
)
finally
:
loop
.
stop
()
async
def
prefill_queue_handler
(
self
):
logger
.
info
(
"Prefill queue handler entered"
)
prefill_queue_nats_server
=
os
.
getenv
(
"NATS_SERVER"
,
"nats://localhost:4222"
)
namespace
,
_
=
PrefillWorker
.
dynamo_address
()
# type: ignore
prefill_queue_stream_name
=
f
"
{
namespace
}
_prefill_queue"
logger
.
info
(
f
"Prefill queue:
{
prefill_queue_nats_server
}
:
{
prefill_queue_stream_name
}
"
)
self
.
initialized
=
True
# TODO: integrate prefill_queue to a dynamo endpoint
async
with
PrefillQueue
.
get_instance
(
nats_server
=
prefill_queue_nats_server
,
stream_name
=
prefill_queue_stream_name
,
)
as
prefill_queue
:
logger
.
info
(
"prefill queue handler started"
)
while
True
:
# TODO: this might add a small overhead to pull prefill from nats
# need to test and check how much overhead it is
prefill_request
=
await
prefill_queue
.
dequeue_prefill_request
()
if
prefill_request
is
not
None
:
logger
.
info
(
f
"Dequeued prefill request:
{
prefill_request
.
request_id
}
"
)
async
for
_
in
self
.
generate
(
prefill_request
):
pass
if
self
.
shutdown_requested
:
logger
.
info
(
"Shutdown requested, checking if engine has any pending prefill sending requests"
)
while
True
:
if
not
await
self
.
engine_client
.
has_unfinished_requests
():
break
logger
.
info
(
"Engine has pending prefill sending requests, rechecking in 1 second..."
)
await
asyncio
.
sleep
(
1
)
self
.
shutdown_vllm_engine
()
break
async
def
generate
(
self
,
request
:
RemotePrefillRequest
):
sampling_params
=
request
.
sampling_params
sampling_params
.
max_tokens
=
1
sampling_params
.
min_tokens
=
1
remote_prefill_params
=
RemotePrefillParams
(
is_remote_decode
=
True
,
decode_block_ids
=
request
.
block_ids
,
decode_engine_id
=
request
.
engine_id
,
decode_computed_block_ids
=
request
.
computed_block_ids
,
)
# TODO check if metadata has changed
# and reload - currently only loading once
if
request
.
engine_id
not
in
self
.
_loaded_metadata
:
remote_metadata
=
await
self
.
_metadata_store
.
get
(
request
.
engine_id
)
await
self
.
engine_client
.
add_remote_nixl_metadata
(
remote_metadata
)
logger
.
info
(
f
"Loaded nixl metadata from engine
{
request
.
engine_id
}
into "
f
"engine
{
self
.
engine_client
.
nixl_metadata
.
engine_id
}
"
)
self
.
_loaded_metadata
.
add
(
request
.
engine_id
)
async
for
_
in
self
.
engine_client
.
generate
(
request_id
=
request
.
request_id
,
prompt
=
TokensPrompt
(
prompt_token_ids
=
request
.
prompt_token_ids
),
sampling_params
=
sampling_params
,
remote_prefill_params
=
remote_prefill_params
,
):
yield
@endpoint()
async def mock(self, req: RequestType):
    """Yield one placeholder string that echoes ``req``."""
    reply = f"mock_response: {req}"
    yield reply
examples/llm/components/processor.py
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
logging
import
uuid
from
enum
import
Enum
from
typing
import
Any
,
AsyncIterator
,
Dict
,
List
,
Tuple
,
Union
from
components.kv_router
import
Router
from
components.worker
import
VllmWorker
from
transformers
import
AutoTokenizer
from
utils.chat_processor
import
ChatProcessor
,
CompletionsProcessor
,
ProcessMixIn
from
utils.check_worker
import
check_required_workers
from
utils.protocol
import
LocalBlockHashes
,
MyRequestOutput
,
vLLMGenerateRequest
from
utils.vllm
import
RouterType
,
parse_vllm_args
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
,
CompletionRequest
from
vllm.outputs
import
RequestOutput
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
dynamo.llm
import
KvMetricsAggregator
,
compute_block_hash_for_seq_py
from
dynamo.runtime
import
EtcdKvCache
from
dynamo.sdk
import
async_on_start
,
depends
,
dynamo_context
,
endpoint
,
service
logger
=
logging
.
getLogger
(
__name__
)
class RequestType(Enum):
    """Request flavours the processor distinguishes between."""

    # Used by the chat/completions endpoint.
    CHAT = "chat"
    # Used for plain completion requests.
    COMPLETION = "completion"
@
service
(
dynamo
=
{
"namespace"
:
"dynamo"
,
},
resources
=
{
"cpu"
:
"10"
,
"memory"
:
"20Gi"
},
workers
=
1
,
)
class
Processor
(
ProcessMixIn
):
"""
vLLM pre and post processing
"""
worker
=
depends
(
VllmWorker
)
router
=
depends
(
Router
)
def __init__(self):
    """Set up engine arguments, tokenizer, request processors and queue state.

    Engine arguments come from ``parse_vllm_args``, called with this class's
    name. No worker tasks are started here; ``worker_tasks`` begins empty.
    """
    args = parse_vllm_args(self.__class__.__name__, "")
    self.engine_args = args

    model_config = args.create_model_config()
    self.model_config = model_config
    self.default_sampling_params = model_config.get_diff_sampling_param()

    tokenizer = self._create_tokenizer(args)
    self.tokenizer = tokenizer
    self.chat_processor = ChatProcessor(tokenizer, model_config)
    self.completions_processor = CompletionsProcessor(tokenizer, model_config)

    self.min_workers = 1

    # Incoming requests and the per-request futures used to hand results back.
    self.request_queue: asyncio.Queue[Dict[str, Any]] = asyncio.Queue()
    self.request_futures: Dict[str, asyncio.Future] = {}

    # Number of worker tasks to process the queue
    self.num_worker_tasks = args.router_num_threads
    self.worker_tasks: List[asyncio.Task] = []

    print(f"Processor init: {args.router}")
def _create_tokenizer(self, engine_args: AsyncEngineArgs) -> AnyTokenizer:
    """Load the tokenizer for ``engine_args.model`` via ``AutoTokenizer``.

    Args:
        engine_args: Engine arguments; only ``model`` is read.

    Returns:
        The tokenizer returned by ``AutoTokenizer.from_pretrained``, loaded
        with remote code trusted, left padding, left truncation and the fast
        implementation requested.
    """
    tokenizer_options = {
        "trust_remote_code": True,
        "padding_side": "left",
        "truncation_side": "left",
        # VLLM might use the fast tokenizer for efficiency
        "use_fast": True,
    }
    return AutoTokenizer.from_pretrained(engine_args.model, **tokenizer_options)
@async_on_start
async def async_init(self):
    """Create runtime clients, the metrics aggregator and the queue workers.

    Steps, in order: build the client for the worker ``generate`` endpoint;
    build the router client when the configured router is one of the KV
    variants; wait for the required number of workers; create the KV metrics
    aggregator; create the etcd-backed cache seeded with the configured
    router; start the queue worker tasks.
    """
    runtime = dynamo_context["runtime"]

    async def generate_client(ns, name):
        # Resolve the "generate" endpoint client of a namespaced component.
        return await runtime.namespace(ns).component(name).endpoint("generate").client()

    comp_ns, comp_name = VllmWorker.dynamo_address()  # type: ignore
    self.worker_client = await generate_client(comp_ns, comp_name)

    kv_router_modes = (RouterType.KV, RouterType.KV_LOAD, RouterType.APPROX_KV)
    self.use_router = self.engine_args.router in kv_router_modes
    if self.use_router:
        router_ns, router_name = Router.dynamo_address()  # type: ignore
        self.router_client = await generate_client(router_ns, router_name)

    await check_required_workers(self.worker_client, self.min_workers)

    kv_listener = runtime.namespace("dynamo").component("VllmWorker")
    await kv_listener.create_service()
    self.metrics_aggregator = KvMetricsAggregator(kv_listener)

    cache_prefix = f"/{comp_ns}/processor/"
    initial_values = {"router": self.engine_args.router}
    self.etcd_kv_cache = await EtcdKvCache.create(
        runtime.etcd_client(),
        cache_prefix,
        initial_values,
    )

    # Start multiple worker tasks to process the queue
    self._start_worker_tasks()
def _start_worker_tasks(self):
    """Cancel any unfinished queue workers and spawn a fresh set.

    ``num_worker_tasks`` tasks are created, each running ``_process_queue``
    with its index as ``worker_id``.
    """
    # Stop whatever is still running from a previous start.
    for stale in self.worker_tasks:
        if stale.done():
            continue
        stale.cancel()

    self.worker_tasks = []
    self.worker_tasks.extend(
        asyncio.create_task(self._process_queue(worker_id=index))
        for index in range(self.num_worker_tasks)
    )

    logger.info(f"Started {self.num_worker_tasks} queue worker tasks")
async def _process_queue(self, worker_id: int):
    """Consume the request queue until this task is cancelled.

    Each dequeued item is passed to ``_process_request``. Exceptions from
    that call are logged and the item is always marked done. Any other
    exception in the loop is logged and followed by a short pause.

    Args:
        worker_id: Index of this worker, used only in log messages.
    """
    logger.info(f"Queue worker {worker_id} started")

    while True:
        try:
            item = await self.request_queue.get()

            try:
                await self._process_request(item)
            except Exception as exc:
                logger.error(f"Worker {worker_id}: Error processing request: {exc}")
            finally:
                # Always pair get() with task_done().
                self.request_queue.task_done()

        except asyncio.CancelledError:
            logger.info(f"Queue worker {worker_id} was cancelled")
            return
        except Exception as exc:
            logger.error(
                f"Worker {worker_id}: Unexpected error in queue processing: {exc}"
            )
            # Sleep briefly to avoid tight error loops
            await asyncio.sleep(0.1)
async
def
_get_kv_load
(
self
):
metrics
=
await
self
.
metrics_aggregator
.
get_metrics
()
kv_load
=
{}
for
end_point
in
metrics
.
endpoints
:
worker_id
=
end_point
.
worker_id
kv_load
[
worker_id
]
=
getattr
(
end_point
,
"gpu_cache_usage_perc"
,
0.0
)
return
kv_load
async
def
_get_pending_requests
(
self
):
metrics
=
await
self
.
metrics_aggregator
.
get_metrics
()
pending_requests
=
{}
for
end_point
in
metrics
.
endpoints
:
worker_id
=
end_point
.
worker_id
pending_requests
[
worker_id
]
=
getattr
(
endpoint
,
"num_requests_waiting"
,
0
)
return
pending_requests
async def _generate(
    self,
    raw_request: Union[CompletionRequest, ChatCompletionRequest],
    request_type: RequestType,
):
    """Queue a raw request and stream back the responses produced for it.

    A future is registered under a fresh UUID, the request is placed on
    ``request_queue``, and the async iterator delivered through the future
    is then relayed item by item. The future's registry entry is removed
    when streaming ends for any reason.

    Args:
        raw_request: The unparsed completion or chat-completion request.
        request_type: Which kind of request ``raw_request`` is.

    Yields:
        Each response produced by the iterator set on the future.
    """
    request_id = str(uuid.uuid4())
    logger.debug(f"Got raw request: {raw_request}")

    # The queue worker resolves this with the response iterator.
    pending: asyncio.Future[AsyncIterator[Any]] = asyncio.Future()
    self.request_futures[request_id] = pending

    # Enqueue the request with minimal processing
    work_item = {
        "request_id": request_id,
        "raw_request": raw_request,
        "request_type": request_type,
    }
    await self.request_queue.put(work_item)

    try:
        stream = await pending
        async for chunk in stream:
            yield chunk
    finally:
        # Drop the registry entry if it is still there.
        self.request_futures.pop(request_id, None)
async def _process_request(self, request_data: Dict[str, Any]):
    """Process a single request from the queue.

    Parses the raw request, then resolves the request's future with an async
    generator that routes the request to a worker and streams the responses.
    If parsing (or resolving the future) fails, the exception is logged and
    set on the future when it is still pending.

    Args:
        request_data: Queue item with the keys ``"request_id"``,
            ``"raw_request"`` and ``"request_type"``.

    Raises:
        ValueError: Raised from the streaming generator (not from this
            coroutine) when the router mode read from etcd is not one of
            the modes handled below.
    """
    request_id = request_data["request_id"]
    raw_request = request_data["raw_request"]
    request_type = request_data["request_type"]

    try:
        # Parse the raw request here instead of in _generate
        (
            request,
            conversation,
            _prompt,
            engine_prompt,
            sampling_params,
        ) = await self._parse_raw_request(raw_request)

        # Create an async generator function to process this request
        async def process_and_stream():
            # TODO: queue request at processor when engines are full
            # The router mode is re-read from etcd for every request.
            router_mode = (await self.etcd_kv_cache.get("router")).decode()

            self.use_router = router_mode in (
                RouterType.KV,
                RouterType.KV_LOAD,
                RouterType.APPROX_KV,
            )

            prefix_hit_rate = 0.0  # Default value
            if self.use_router:
                token_ids = engine_prompt["prompt_token_ids"]
                router_generator = await self.router_client.generate(
                    LocalBlockHashes(
                        hashes=compute_block_hash_for_seq_py(
                            token_ids, self.engine_args.block_size
                        ),
                        tokens=token_ids,
                        num_tokens=len(token_ids),
                    ).model_dump_json()
                )
                # Only the first item from the router is consumed.
                decision = await router_generator.__anext__()
                worker_id, prefix_hit_rate = decision.data()
                prefix_hit_rate = float(prefix_hit_rate)

            # Create request object once with default prefix_hit_rate
            request_obj = vLLMGenerateRequest(
                engine_prompt=engine_prompt,
                sampling_params=sampling_params,
                request_id=request_id,
                prefix_hit_rate=prefix_hit_rate,
            ).model_dump_json()

            if self.use_router:
                if worker_id == "":
                    # Router returned no specific worker.
                    engine_generator = await self.worker_client.generate(request_obj)
                else:
                    engine_generator = await self.worker_client.direct(
                        request_obj, int(worker_id)
                    )
            elif router_mode == RouterType.RANDOM:
                engine_generator = await self.worker_client.generate(request_obj)
            elif router_mode == RouterType.ROUND_ROBIN:
                engine_generator = await self.worker_client.round_robin(request_obj)
            else:
                # Without this branch `engine_generator` would be unbound and
                # the stream would fail with an opaque UnboundLocalError.
                raise ValueError(f"Unsupported router mode: {router_mode!r}")

            output_generator = self._generate_responses(engine_generator, request_type)

            # Stream responses directly to the caller
            async for response in await self._stream_response(
                request, output_generator, request_id, conversation
            ):
                yield response

        # Set the future result to our async generator
        if request_id in self.request_futures:
            self.request_futures[request_id].set_result(process_and_stream())

    except Exception as e:
        logger.error(f"Error processing request {request_id}: {e}")
        # Set exception on the future if it still exists
        if (
            request_id in self.request_futures
            and not self.request_futures[request_id].done()
        ):
            self.request_futures[request_id].set_exception(e)
async def _generate_responses(
    self, engine_generator: AsyncIterator[RequestOutput], request_type: RequestType
) -> AsyncIterator[Union[RequestOutput, Tuple[int, RequestOutput]]]:
    """Convert serialized engine responses into ``RequestOutput`` objects.

    Args:
        engine_generator: Async iterator whose items expose ``data()``
            returning the JSON form of a ``MyRequestOutput``.
        request_type: Determines the shape of what is yielded.

    Yields:
        For chat requests, the ``RequestOutput`` itself; for completion
        requests, a ``(prompt_idx, RequestOutput)`` tuple.

    Raises:
        NotImplementedError: When a response arrives for any other
            request type.
    """
    # NOTE: the prompt index is never advanced in this function.
    prompt_idx = 0
    copied_fields = (
        "request_id",
        "prompt",
        "prompt_token_ids",
        "prompt_logprobs",
        "outputs",
        "finished",
        "metrics",
    )

    async for resp in engine_generator:
        # Deserialize the response from the engine
        # Creates correct vLLM objects for each field
        parsed = MyRequestOutput.model_validate_json(resp.data())

        # OpenAIServingChat.chat_completion_stream_generator() method expects a RequestOutput object
        request_output = RequestOutput(
            **{field: getattr(parsed, field) for field in copied_fields}
        )

        if request_type == RequestType.CHAT:
            # For chat requests, yield the request_output directly.
            yield request_output
            continue

        if request_type == RequestType.COMPLETION:
            # Completion requests can have multiple prompts and stream generator requires the prompt index
            yield (prompt_idx, request_output)
            continue

        raise NotImplementedError(f"Request type {request_type} not implemented")
@endpoint(name="chat/completions")
async def chat_completions(self, raw_request: ChatCompletionRequest):
    """Stream chat-completion responses for ``raw_request``.

    Delegates to ``_generate`` with ``RequestType.CHAT`` and relays every
    item it yields.
    """
    chat_stream = self._generate(raw_request, RequestType.CHAT)
    async for chunk in chat_stream:
        yield chunk
# @endpoint()
# async def completions(self, raw_request: CompletionRequest):
# async for response in self._generate(raw_request, RequestType.COMPLETION):
# yield response
examples/llm/components/worker.py
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
logging
import
os
import
signal
from
components.disagg_router
import
PyDisaggregatedRouter
from
components.prefill_worker
import
PrefillWorker
from
utils.nixl
import
NixlMetadataStore
from
utils.prefill_queue
import
PrefillQueue
from
utils.protocol
import
MyRequestOutput
,
vLLMGenerateRequest
from
utils.vllm
import
RouterType
,
parse_vllm_args
from
vllm.entrypoints.openai.api_server
import
(
build_async_engine_client_from_engine_args
,
)
from
vllm.remote_prefill
import
RemotePrefillParams
,
RemotePrefillRequest
from
vllm.sampling_params
import
RequestOutputKind
from
dynamo.llm
import
ForwardPassMetrics
,
KvStats
,
WorkerMetricsPublisher
,
WorkerStats
from
dynamo.sdk
import
async_on_start
,
depends
,
dynamo_context
,
endpoint
,
service
logger
=
logging
.
getLogger
(
__name__
)
@
service
(
dynamo
=
{
"namespace"
:
"dynamo"
,
},
resources
=
{
"gpu"
:
1
,
"cpu"
:
"10"
,
"memory"
:
"20Gi"
},
workers
=
1
,
)
class
VllmWorker
:
prefill_worker
=
depends
(
PrefillWorker
)
def __init__(self):
    """Parse engine arguments, normalise them and install signal handlers.

    With remote prefill enabled, chunked prefill is forced off, the
    preemption mode is forced to ``"swap"`` and the pipeline parallel size
    is forced to 1. With a KV or approximate-KV router, prefix caching is
    forced on and the ``VLLM_WORKER_ID``, ``VLLM_KV_NAMESPACE`` and
    ``VLLM_KV_COMPONENT`` environment variables are set.
    """
    self.client = None
    self.disaggregated_router: PyDisaggregatedRouter = None  # type: ignore

    class_name = self.__class__.__name__
    args = parse_vllm_args(class_name, "")
    self.engine_args = args
    self.do_remote_prefill = args.remote_prefill

    # Where prefill requests are queued.
    self._prefill_queue_nats_server = os.getenv(
        "NATS_SERVER", "nats://localhost:4222"
    )
    self.namespace, _ = VllmWorker.dynamo_address()  # type: ignore
    self._prefill_queue_stream_name = f"{self.namespace}_prefill_queue"
    logger.info(
        f"Prefill queue: {self._prefill_queue_nats_server}:{self._prefill_queue_stream_name}"
    )

    if args.remote_prefill:
        if args.enable_chunked_prefill is not False:
            logger.info("Chunked prefill is not supported yet, setting to False")
            args.enable_chunked_prefill = False

        if args.preemption_mode != "swap":
            logger.info("Preemption mode is not supported yet, setting to swap")
            args.preemption_mode = "swap"

        if args.pipeline_parallel_size != 1:
            logger.info("Pipeline parallel size is not supported yet, setting to 1")
            args.pipeline_parallel_size = 1

    if args.router in (RouterType.KV, RouterType.APPROX_KV):
        if not args.enable_prefix_caching:
            logger.info(
                "When using KV router, prefix caching must be enabled, setting to True"
            )
            args.enable_prefix_caching = True

        # The worker id is the lease id of the first endpoint in the context.
        worker_lease_id = dynamo_context["endpoints"][0].lease_id()
        os.environ["VLLM_WORKER_ID"] = str(worker_lease_id)
        os.environ["VLLM_KV_NAMESPACE"] = "dynamo"
        os.environ["VLLM_KV_COMPONENT"] = class_name

    self.metrics_publisher = WorkerMetricsPublisher()

    signal.signal(signal.SIGTERM, self.shutdown_vllm_engine)
    signal.signal(signal.SIGINT, self.shutdown_vllm_engine)
@async_on_start
async def async_init(self):
    """Start the engine client and wire up metrics, routing and signals.

    Raises:
        RuntimeError: If building the engine client context returns ``None``.
    """
    self._engine_context = build_async_engine_client_from_engine_args(
        self.engine_args
    )
    if self._engine_context is None:
        raise RuntimeError("Failed to initialize engine client")
    self.engine_client = await self._engine_context.__aenter__()

    self.engine_client.set_metrics_publisher(self.metrics_publisher)

    # Initially send dummy metrics to kick start,
    # vLLM will not update stat until forward pass is triggered
    initial_worker_stats = WorkerStats(
        0,  # request_active_slots
        1024,  # request_total_slots
        0,  # num_requests_waiting
        None,  # data_parallel_rank
    )
    initial_kv_stats = KvStats(
        0,  # kv_active_blocks
        1024,  # kv_total_blocks
        0.0,  # gpu_cache_usage_perc
        0.0,  # gpu_prefix_cache_hit_rate
    )
    self.metrics_publisher.publish(
        ForwardPassMetrics(
            worker_stats=initial_worker_stats,
            kv_stats=initial_kv_stats,
            spec_decode_stats=None,
        )
    )

    def _log_endpoint_created(_):
        # Runs whenever the task finishes, whatever its outcome.
        logger.info("metrics publisher endpoint created")

    endpoint_task = asyncio.create_task(self.create_metrics_publisher_endpoint())
    endpoint_task.add_done_callback(_log_endpoint_created)

    runtime = dynamo_context["runtime"]

    if self.engine_args.remote_prefill:
        # Publish this engine's NIXL metadata under its engine id.
        metadata = self.engine_client.nixl_metadata
        store = NixlMetadataStore("dynamo", runtime)
        await store.put(metadata.engine_id, metadata)

    if not self.engine_args.conditional_disagg:
        self.disaggregated_router = None
    else:
        self.disaggregated_router = PyDisaggregatedRouter(
            runtime,
            self.namespace,
            max_local_prefill_length=self.engine_args.max_local_prefill_length,
            max_prefill_queue_size=self.engine_args.max_prefill_queue_size,
        )
        await self.disaggregated_router.async_init()

    # Set up signal handler for graceful shutdown
    # TODO: move to dynamo sdk
    loop = asyncio.get_running_loop()

    def signal_handler():
        # Schedule the shutdown coroutine instead of calling it directly
        asyncio.create_task(self.graceful_shutdown(runtime))

    for signum in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(signum, signal_handler)

    logger.info("VllmWorker has been initialized")
async def graceful_shutdown(self, runtime):
    """Shut down ``runtime``, logging before and after the call.

    Args:
        runtime: Object whose ``shutdown()`` method is invoked.
    """
    start_msg = "Received shutdown signal, shutting down DistributedRuntime"
    done_msg = "DistributedRuntime shutdown complete"

    logger.info(start_msg)
    runtime.shutdown()
    logger.info(done_msg)
def shutdown_vllm_engine(self, signum, frame):
    """Close the engine client and stop the event loop.

    Errors raised while closing are logged; the loop is stopped either way.

    Args:
        signum: Signal number, used only in the log message.
        frame: Unused; present to match the signal-handler signature.
    """
    logger.info(f"Received signal {signum}, shutting down")
    event_loop = asyncio.get_event_loop()
    try:
        self.engine_client.close()
        logger.info("VllmWorker shutdown complete")
    except Exception as err:
        logger.error(f"Error during shutdown: {err}")
    finally:
        # Stop the loop whether or not closing succeeded.
        event_loop.stop()
async def create_metrics_publisher_endpoint(self):
    """Create the metrics publisher endpoint on this service's component.

    The component is read from ``dynamo_context["component"]``.
    """
    owning_component = dynamo_context["component"]
    logger.info("Creating metrics publisher endpoint with primary lease")
    await self.metrics_publisher.create_endpoint(owning_component)
def get_remote_prefill_request_callback(self):
    """Return a coroutine function that enqueues a remote prefill request.

    Returns:
        An async callable taking a ``RemotePrefillRequest``; each call
        obtains a ``PrefillQueue`` instance for this worker's NATS server
        and stream name and enqueues the request on it.
    """

    # TODO: integrate prefill_queue to dynamo endpoint
    async def callback(request: RemotePrefillRequest):
        queue_cm = PrefillQueue.get_instance(
            nats_server=self._prefill_queue_nats_server,
            stream_name=self._prefill_queue_stream_name,
        )
        async with queue_cm as queue:
            await queue.enqueue_prefill_request(request)

    return callback
# TODO: use the same child lease for metrics publisher endpoint and generate endpoint
@endpoint()
async def generate(self, request: vLLMGenerateRequest):
    """Generate for ``request``, prefilling remotely or locally.

    When a disaggregated router is configured it is asked, given the prompt
    length, the prefix hit rate and the current prefill queue size, whether
    to prefill remotely; without a router the decision defaults to remote.
    Remote prefill is used only if ``do_remote_prefill`` is also set.

    Args:
        request: Generate request carrying the engine prompt, sampling
            parameters, request id and prefix hit rate.

    Yields:
        The JSON dump of a ``MyRequestOutput`` for each engine response.
    """
    # TODO: consider prefix hit when deciding prefill locally or remotely

    if self.disaggregated_router is None:
        # always prefill remotely if no disaggregated router is provided
        disagg_router_decision = True
    else:
        queue_cm = PrefillQueue.get_instance(
            nats_server=self._prefill_queue_nats_server,
            stream_name=self._prefill_queue_stream_name,
        )
        async with queue_cm as prefill_queue:
            prefill_queue_size = await prefill_queue.get_queue_size()
        disagg_router_decision = await self.disaggregated_router.prefill_remote(
            len(request.engine_prompt["prompt_token_ids"]),
            request.prefix_hit_rate,
            prefill_queue_size,
        )

    if self.do_remote_prefill and disagg_router_decision:
        remote_prefill_params = RemotePrefillParams(
            is_remote_prefill=True,
            remote_prefill_request_callback=self.get_remote_prefill_request_callback(),
        )
        prompt_len = len(request.engine_prompt["prompt_token_ids"])
        logger.info(
            f"Prefilling remotely for request {request.request_id} with length {prompt_len}"
        )
    else:
        remote_prefill_params = None
        prompt_len = len(request.engine_prompt["prompt_token_ids"])
        logger.info(
            f"Prefilling locally for request {request.request_id} with length {prompt_len}"
        )

    # rust HTTP requires Delta streaming
    request.sampling_params.output_kind = RequestOutputKind.DELTA

    engine_stream = self.engine_client.generate(
        prompt=request.engine_prompt,
        sampling_params=request.sampling_params,
        request_id=request.request_id,
        remote_prefill_params=remote_prefill_params,
    )
    async for response in engine_stream:
        serializable = MyRequestOutput(
            request_id=response.request_id,
            prompt=response.prompt,
            prompt_token_ids=response.prompt_token_ids,
            prompt_logprobs=response.prompt_logprobs,
            outputs=response.outputs,
            finished=response.finished,
        )
        yield serializable.model_dump_json()
examples/llm/configs/agg.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
router
:
round-robin
router-num-threads
:
4
common-configs
:
[
model
,
block-size
,
max-model-len
]
VllmWorker
:
enforce-eager
:
true
max-num-batched-tokens
:
16384
enable-prefix-caching
:
true
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
]
Planner
:
environment
:
local
no-operation
:
true
\ No newline at end of file
examples/llm/configs/agg_router.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
router
:
kv
block-size
:
64
max-model-len
:
16384
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
common-configs
:
[
model
,
block-size
,
max-model-len
,
router
]
Router
:
min-workers
:
1
softmax-sample
:
true
common-configs
:
[
model
,
block-size
,
router
]
VllmWorker
:
enforce-eager
:
true
max-num-batched-tokens
:
16384
enable-prefix-caching
:
true
tensor-parallel-size
:
1
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
,
router
,
kv-transfer-config
]
Planner
:
environment
:
local
no-operation
:
true
\ No newline at end of file
examples/llm/configs/disagg.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
router
:
round-robin
common-configs
:
[
model
,
block-size
]
VllmWorker
:
remote-prefill
:
true
conditional-disagg
:
true
max-local-prefill-length
:
10
max-prefill-queue-size
:
2
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
]
PrefillWorker
:
max-num-batched-tokens
:
16384
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
]
Planner
:
environment
:
local
no-operation
:
true
examples/llm/configs/disagg_router.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
router
:
kv
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
common-configs
:
[
model
,
block-size
,
max-model-len
,
router
]
Router
:
min-workers
:
1
common-configs
:
[
model
,
block-size
,
router
]
VllmWorker
:
max-num-batched-tokens
:
16384
remote-prefill
:
true
conditional-disagg
:
true
max-local-prefill-length
:
10
max-prefill-queue-size
:
2
tensor-parallel-size
:
1
enable-prefix-caching
:
true
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
,
router
,
kv-transfer-config
]
PrefillWorker
:
max-num-batched-tokens
:
16384
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
]
Planner
:
environment
:
local
no-operation
:
true
\ No newline at end of file
examples/llm/configs/multinode-405b.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This configuration file is used in the multinode-examples.md file
# to start the 405B model on 3 nodes.
Frontend
:
served_model_name
:
nvidia/Llama-3.1-405B-Instruct-FP8
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
model
:
nvidia/Llama-3.1-405B-Instruct-FP8
block-size
:
64
max-model-len
:
8192
router
:
kv
Router
:
model
:
nvidia/Llama-3.1-405B-Instruct-FP8
min-workers
:
1
VllmWorker
:
model
:
nvidia/Llama-3.1-405B-Instruct-FP8
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
block-size
:
64
max-model-len
:
8192
max-num-seqs
:
16
remote-prefill
:
true
conditional-disagg
:
true
max-local-prefill-length
:
10
max-prefill-queue-size
:
2
gpu-memory-utilization
:
0.95
tensor-parallel-size
:
8
router
:
kv
quantization
:
modelopt
enable-prefix-caching
:
true
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
8'
PrefillWorker
:
model
:
nvidia/Llama-3.1-405B-Instruct-FP8
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
block-size
:
64
max-model-len
:
8192
max-num-seqs
:
16
gpu-memory-utilization
:
0.95
tensor-parallel-size
:
8
quantization
:
modelopt
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
8'
\ No newline at end of file
examples/llm/configs/multinode_agg_r1.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1
block-size
:
64
max-model-len
:
16384
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
router
:
round-robin
common-configs
:
[
model
,
block-size
,
max-model-len
]
VllmWorker
:
enforce-eager
:
true
max-num-batched-tokens
:
16384
enable-prefix-caching
:
true
router
:
random
tensor-parallel-size
:
16
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
]
examples/llm/configs/mutinode_disagg_r1.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1
block-size
:
64
max-model-len
:
16384
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
tensor-parallel-size
:
16
disable-log-requests
:
true
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
router
:
round-robin
common-configs
:
[
model
,
block-size
]
VllmWorker
:
remote-prefill
:
true
conditional-disagg
:
false
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
16'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
,
tensor-parallel-size
,
disable-log-requests
]
PrefillWorker
:
max-num-batched-tokens
:
16384
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
16'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
,
tensor-parallel-size
,
disable-log-requests
]
examples/llm/deploy/agg.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: llm-agg
spec:
  envs:
    # Per-service configuration, JSON-encoded into a single environment variable.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","router-num-threads":4,"common-configs":["model","block-size","max-model-len"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
  services:
    Frontend:
      dynamoNamespace: llm-agg
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    Processor:
      dynamoNamespace: llm-agg
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
    VllmWorker:
      # This service also references the secret named below.
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-agg
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker
examples/llm/deploy/agg_router.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: agg-router
spec:
  envs:
    # Per-service configuration, JSON-encoded into a single environment variable.
    # The nested kv-transfer-config value is itself a JSON string, hence the \" escapes.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","router":"kv","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"softmax-sample":true,"common-configs":["model","block-size","router"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"tensor-parallel-size":1,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
  services:
    Frontend:
      dynamoNamespace: llm-agg-router
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg_router:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    Processor:
      dynamoNamespace: llm-agg-router
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg_router:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
    Router:
      dynamoNamespace: llm-agg-router
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg_router:Router
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Router
    VllmWorker:
      # This service also references the secret named below.
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-agg-router
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.agg_router:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker
examples/llm/deploy/disagg.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: llm-disagg
spec:
  envs:
    # Per-service configuration, JSON-encoded into a single environment variable.
    # The nested kv-transfer-config value is itself a JSON string, hence the \" escapes.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","common-configs":["model","block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
  services:
    Frontend:
      dynamoNamespace: llm-disagg
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    Processor:
      dynamoNamespace: llm-disagg
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
    VllmWorker:
      # This service also references the secret named below.
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-disagg
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker
    PrefillWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-disagg
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg:PrefillWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - PrefillWorker
examples/llm/deploy/disagg_router.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: disagg-router
spec:
  envs:
    # Per-service configuration, JSON-encoded into a single environment variable.
    # The nested kv-transfer-config value is itself a JSON string, hence the \" escapes.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"router":"kv","kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"common-configs":["model","block-size","router"]},"VllmWorker":{"max-num-batched-tokens":16384,"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"tensor-parallel-size":1,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
  services:
    Frontend:
      dynamoNamespace: llm-disagg-router
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg_router:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    Processor:
      dynamoNamespace: llm-disagg-router
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg_router:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
    Router:
      dynamoNamespace: llm-disagg-router
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg_router:Router
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Router
    VllmWorker:
      # This service also references the secret named below.
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-disagg-router
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg_router:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker
    PrefillWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-disagg-router
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          workingDir: /workspace/examples/llm
          args:
            - dynamo
            - serve
            - graphs.disagg_router:PrefillWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - PrefillWorker
examples/llm/graphs/__init__.py
deleted
100644 → 0
View file @
c7080419
examples/llm/graphs/agg.py
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from components.frontend import Frontend
from components.planner_service import Planner
from components.processor import Processor
from components.worker import VllmWorker

# Wire the services together at import time via chained `link` calls:
# Frontend is linked to Processor, and the result of that call to VllmWorker.
Frontend.link(Processor).link(VllmWorker)

# Planner is linked to Frontend separately from the chain above.
Frontend.link(Planner)
examples/llm/graphs/agg_router.py
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from components.frontend import Frontend
from components.kv_router import Router
from components.planner_service import Planner
from components.processor import Processor
from components.worker import VllmWorker

# Wire the services together at import time via chained `link` calls:
# Frontend is linked to Processor, then Router, then VllmWorker, each call
# being made on the result of the previous one.
Frontend.link(Processor).link(Router).link(VllmWorker)

# Planner is linked to Frontend separately from the chain above.
Frontend.link(Planner)
examples/llm/graphs/disagg.py
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from components.frontend import Frontend
from components.planner_service import Planner
from components.prefill_worker import PrefillWorker
from components.processor import Processor
from components.worker import VllmWorker

# Wire the services together at import time via chained `link` calls:
# Frontend is linked to Processor, then VllmWorker, then PrefillWorker, each
# call being made on the result of the previous one.
Frontend.link(Processor).link(VllmWorker).link(PrefillWorker)

# Planner is linked to Frontend separately from the chain above.
Frontend.link(Planner)
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment