Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
f00d700e
Unverified
Commit
f00d700e
authored
Jul 14, 2025
by
Alec
Committed by
GitHub
Jul 14, 2025
Browse files
refactor: remove old examples with old UX (#1899)
parent
c7080419
Changes
111
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
2342 deletions
+0
-2342
examples/llm/components/planner.py
examples/llm/components/planner.py
+0
-495
examples/llm/components/planner_service.py
examples/llm/components/planner_service.py
+0
-114
examples/llm/components/prefill_worker.py
examples/llm/components/prefill_worker.py
+0
-211
examples/llm/components/processor.py
examples/llm/components/processor.py
+0
-351
examples/llm/components/worker.py
examples/llm/components/worker.py
+0
-248
examples/llm/configs/agg.yaml
examples/llm/configs/agg.yaml
+0
-42
examples/llm/configs/agg_router.yaml
examples/llm/configs/agg_router.yaml
+0
-48
examples/llm/configs/disagg.yaml
examples/llm/configs/disagg.yaml
+0
-51
examples/llm/configs/disagg_router.yaml
examples/llm/configs/disagg_router.yaml
+0
-58
examples/llm/configs/multinode-405b.yaml
examples/llm/configs/multinode-405b.yaml
+0
-66
examples/llm/configs/multinode_agg_r1.yaml
examples/llm/configs/multinode_agg_r1.yaml
+0
-39
examples/llm/configs/mutinode_disagg_r1.yaml
examples/llm/configs/mutinode_disagg_r1.yaml
+0
-47
examples/llm/deploy/agg.yaml
examples/llm/deploy/agg.yaml
+0
-100
examples/llm/deploy/agg_router.yaml
examples/llm/deploy/agg_router.yaml
+0
-125
examples/llm/deploy/disagg.yaml
examples/llm/deploy/disagg.yaml
+0
-127
examples/llm/deploy/disagg_router.yaml
examples/llm/deploy/disagg_router.yaml
+0
-152
examples/llm/graphs/__init__.py
examples/llm/graphs/__init__.py
+0
-0
examples/llm/graphs/agg.py
examples/llm/graphs/agg.py
+0
-22
examples/llm/graphs/agg_router.py
examples/llm/graphs/agg_router.py
+0
-23
examples/llm/graphs/disagg.py
examples/llm/graphs/disagg.py
+0
-23
No files found.
examples/llm/components/planner.py
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
import
asyncio
import
json
import
logging
import
os
import
time
from
datetime
import
datetime
from
typing
import
Any
,
List
import
numpy
as
np
from
rich.console
import
Console
from
rich.table
import
Table
from
tensorboardX
import
SummaryWriter
from
utils.prefill_queue
import
PrefillQueue
from
dynamo.llm
import
KvMetricsAggregator
from
dynamo.planner
import
KubernetesConnector
,
LocalConnector
from
dynamo.planner.defaults
import
LoadPlannerDefaults
from
dynamo.runtime
import
DistributedRuntime
,
dynamo_worker
from
dynamo.runtime.logging
import
configure_dynamo_logging
# Apply dynamo's logging configuration (helper imported from dynamo.runtime.logging)
# at import time, then create this module's logger.
configure_dynamo_logging()
logger = logging.getLogger(__name__)

# Number of adjustment intervals after a new decode worker is added during which the
# planner will not scale decode workers down. This leaves time for the new decode
# worker to populate its kv cache.
NEW_DECODE_WORKER_GRACE_PERIOD = 3

# A prefill worker is not added if, extrapolating the trend observed in the current
# adjustment interval, the prefill queue size is predicted to drop below
# --prefill-queue-scale-up-threshold within the next
# NEW_PREFILL_WORKER_QUEUE_BUFFER_PERIOD adjustment intervals.
# This accounts for the time prefill workers need to start.
NEW_PREFILL_WORKER_QUEUE_BUFFER_PERIOD = 3
class Planner:
    """Load-based planner that scales prefill and decode workers.

    The planner periodically samples the prefill queue size and the KV-cache
    load reported by the ``VllmWorker`` component, writes the samples to
    tensorboard, and at every adjustment interval asks its connector to add or
    remove ``PrefillWorker`` / ``VllmWorker`` components based on the
    thresholds carried in ``args``.
    """

    def __init__(self, runtime: DistributedRuntime, args: argparse.Namespace):
        """Create the planner and its connector.

        Args:
            runtime: Distributed runtime used to build component clients.
            args: Planner options. ``args.log_dir`` is filled in with a
                timestamped ``logs/`` directory when it is ``None``.

        Raises:
            ValueError: If ``args.environment`` is neither ``"local"`` nor
                ``"kubernetes"``.
        """
        self.runtime = runtime
        self.args = args
        self.namespace = args.namespace

        if args.environment == "local":
            self.connector = LocalConnector(args.namespace, runtime)
        elif args.environment == "kubernetes":
            self.connector = KubernetesConnector(args.namespace)
        else:
            raise ValueError(f"Invalid environment: {args.environment}")

        self._prefill_queue_nats_server = os.getenv(
            "NATS_SERVER", "nats://localhost:4222"
        )
        self._prefill_queue_stream_name = f"{self.namespace}_prefill_queue"

        # Clients are created lazily by get_workers_info().
        self.prefill_client: Any | None = None
        self.workers_client: Any | None = None
        self.p_endpoints: List[int] = []
        self.d_endpoints: List[int] = []
        self.decode_worker_remaining_grace_period = 0

        if args.log_dir is None:
            args.log_dir = f"logs/{datetime.now().strftime('%m%d_%H%M%S')}"
        self.writer = SummaryWriter(args.log_dir)

        logger.info(f"Components present in namespace: {args.namespace}")

        # Reference point for the relative timestamps used in logs and tensorboard.
        self.init_time = time.time()

        # Set the appropriate logger function for repeated metric logging
        self._repeating_log_func = logger.debug if args.no_operation else logger.info

    def _gpu_usage(self, num_prefill_workers: int, num_decode_workers: int):
        """Return the number of GPUs used by the given worker counts."""
        return (
            num_prefill_workers * self.args.prefill_engine_num_gpu
            + num_decode_workers * self.args.decode_engine_num_gpu
        )

    async def set_metric_aggregator(self):
        """Create the ``VllmWorker`` component service and its KV metrics aggregator."""
        # TODO: separate KV metrics and prefill metrics
        kv_listener = self.runtime.namespace(self.namespace).component("VllmWorker")
        await kv_listener.create_service()
        self.metrics_aggregator = KvMetricsAggregator(kv_listener)

    async def get_workers_info(self):
        """Return the current prefill and decode worker instance ids.

        Returns:
            A ``(p_endpoints, d_endpoints)`` tuple. ``p_endpoints`` is an empty
            list when the prefill worker lookup fails (aggregated mode).

        Raises:
            RuntimeError: If the decode worker instance ids cannot be fetched.
        """
        try:
            if self.prefill_client is None:
                self.prefill_client = (
                    await self.runtime.namespace(self.namespace)
                    .component("PrefillWorker")
                    .endpoint("mock")
                    .client()
                )
                # TODO: remove this sleep after rust client() is blocking until watching state
                await asyncio.sleep(0.1)
            # TODO: use etcd events instead of pulling instance_ids
            p_endpoints = self.prefill_client.instance_ids()
        except Exception:
            # Best effort: a failed prefill lookup is treated as "no prefill workers".
            p_endpoints = []
            self._repeating_log_func(
                "No prefill workers found, operating in aggregated mode"
            )
        try:
            if self.workers_client is None:
                self.workers_client = (
                    await self.runtime.namespace(self.namespace)
                    .component("VllmWorker")
                    .endpoint("generate")
                    .client()
                )
                # TODO: remove this sleep after rust client() is blocking until watching state
                await asyncio.sleep(0.1)
            # TODO: use etcd events instead of pulling instance_ids
            d_endpoints = self.workers_client.instance_ids()
        except Exception as e:
            # Chain explicitly so the original failure is kept as __cause__.
            raise RuntimeError(f"Failed to get decode worker endpoints: {e}") from e
        return p_endpoints, d_endpoints

    async def reset_adjustment_interval(self):
        """Refresh the worker lists and clear the samples of the previous interval."""
        self._repeating_log_func(
            f"Reset metrics for new adjustment interval at t={time.time() - self.init_time:.1f}s"
        )

        self.p_endpoints, self.d_endpoints = await self.get_workers_info()

        self._repeating_log_func(
            f"Number of prefill workers: {len(self.p_endpoints)}, number of decode workers: {len(self.d_endpoints)}"
        )

        self.metrics_collection_time = []
        self.prefill_queue_load = []
        self.kv_load = []

        self.last_adjustment_time = time.time()

    async def collect_metrics(self):
        """Sample prefill queue size, KV load and worker counts, and log them.

        Samples are appended to ``self.prefill_queue_load`` / ``self.kv_load``
        and written to tensorboard. Failures while sampling the prefill queue
        or while processing the KV metrics are logged and skipped.
        """
        self._repeating_log_func(
            f"Collecting metrics at t={time.time() - self.init_time:.1f}s"
        )

        # collect prefill queue load
        try:
            async with PrefillQueue.get_instance(
                nats_server=self._prefill_queue_nats_server,
                stream_name=self._prefill_queue_stream_name,
            ) as prefill_queue:
                prefill_queue_size = await prefill_queue.get_queue_size()
                measure_time = time.time() - self.init_time
            self.prefill_queue_load.append(prefill_queue_size)
            self._repeating_log_func(
                f"Collected prefill queue size at t={measure_time:.1f}s: {int(prefill_queue_size)}"
            )
            self.writer.add_scalar(
                "prefill_queue_size", prefill_queue_size, measure_time
            )
        except Exception as e:
            self._repeating_log_func(
                f"Failed to collect prefill queue size metrics: {e}"
            )

        # collect kv load
        total_active_requests: int = 0
        total_queued_requests: int = 0
        # NOTE(review): get_metrics() is awaited outside the try block below, so a
        # failure here propagates to the caller instead of being logged - confirm
        # this is intended.
        metrics = await self.metrics_aggregator.get_metrics()
        try:
            prev_kv_load_len = len(self.kv_load)
            for endpoint in metrics.endpoints:
                kv_load = getattr(endpoint, "gpu_cache_usage_perc", 0.0)
                num_requests_waiting = getattr(endpoint, "num_requests_waiting", 0)
                total_queued_requests += num_requests_waiting
                request_active_slots = getattr(endpoint, "request_active_slots", None)
                if request_active_slots:
                    total_active_requests += request_active_slots
                    if num_requests_waiting > 0:
                        # estimate kv load after waiting requests are scheduled based on current isl/osl
                        # TODO: use actual isl/osl estimation after the request_active_slot bug in disagg is fixed
                        # Currently, we assume each request uses 0.02 kv cache
                        # kv_load = kv_load * (request_active_slots + num_requests_waiting) / request_active_slots
                        kv_load = kv_load + 0.02 * num_requests_waiting
                self.kv_load.append(kv_load)
            measure_time = time.time() - self.init_time
            self._repeating_log_func(
                f"Collected kv load at t={measure_time:.1f}s: {self.kv_load[prev_kv_load_len:]} (act/pnd req: {total_active_requests}/{total_queued_requests})"
            )
            # Average only over the samples appended by this call.
            average_kv_load = np.mean(self.kv_load[prev_kv_load_len:])
            self.writer.add_scalar("average_kv_load", average_kv_load, measure_time)
            self.writer.add_scalar(
                "total_queued_requests", total_queued_requests, measure_time
            )
        except Exception as e:
            self._repeating_log_func(f"Failed to collect kv load metrics: {e}")

        p_endpoints, d_endpoints = await self.get_workers_info()
        self.writer.add_scalar(
            "num_prefill_workers", len(p_endpoints), time.time() - self.init_time
        )
        self.writer.add_scalar(
            "num_decode_workers", len(d_endpoints), time.time() - self.init_time
        )
        curr_gpu_usage = self._gpu_usage(len(p_endpoints), len(d_endpoints))
        self.writer.add_scalar("num_gpu", curr_gpu_usage, time.time() - self.init_time)

        self.metrics_collection_time.append(time.time())

    async def make_adjustments(self):
        """Scale prefill/decode workers based on the samples of the current interval.

        No adjustment is made when the number of prefill or decode workers
        changed since the interval started. Scale-down is evaluated before
        scale-up, and scale-up is limited by ``args.max_gpu_budget``.
        """
        # Note: all adjustments are blocking. Non-blocking adjustment and metric pulling
        # make the optimization problem too complex and should not be needed in most cases.
        logger.info(f"Making adjustments at t={time.time() - self.init_time:.1f}s")

        # check if decode/prefill workers is still the same
        # note that we only check length as endpoint ids might change
        new_p_endpoints, new_d_endpoints = await self.get_workers_info()
        if len(new_p_endpoints) != len(self.p_endpoints) or len(
            new_d_endpoints
        ) != len(self.d_endpoints):
            logger.info("Decode/prefill workers changed, no adjustments will be made")
            return

        # compute current gpu usage
        curr_gpu_usage = self._gpu_usage(len(self.p_endpoints), len(self.d_endpoints))
        logger.info(f"Current engines use {curr_gpu_usage} GPUs")

        # NOTE(review): with zero prefill workers this divides by zero. For numeric
        # samples numpy scalar division yields inf/nan (with a RuntimeWarning) rather
        # than raising - confirm this is the intended aggregated-mode behavior.
        avg_prefill_queue_load = np.mean(self.prefill_queue_load) / len(
            self.p_endpoints
        )
        avg_kv_load = np.mean(self.kv_load)

        # first check if we need to scale down any workers
        if (
            avg_prefill_queue_load < self.args.prefill_queue_scale_down_threshold
            and len(self.p_endpoints) > self.args.min_endpoint
        ):
            logger.info(
                f"Average prefill queue load ({avg_prefill_queue_load:.2f}) is below threshold ({self.args.prefill_queue_scale_down_threshold:.2f}), scaling down prefill workers"
            )
            success = await self.connector.remove_component("PrefillWorker")
            if success:
                curr_gpu_usage -= self.args.prefill_engine_num_gpu
            else:
                logger.info("Failed to scale down prefill worker")
        if (
            avg_kv_load < self.args.decode_kv_scale_down_threshold
            and len(self.d_endpoints) > self.args.min_endpoint
        ):
            if self.decode_worker_remaining_grace_period > 0:
                logger.info(
                    f"Decode worker remaining grace period is {self.decode_worker_remaining_grace_period}, skipping scale down"
                )
            else:
                logger.info(
                    f"Average kv load ({avg_kv_load:.2f}) is below threshold ({self.args.decode_kv_scale_down_threshold:.2f}), scaling down decode workers"
                )
                success = await self.connector.remove_component("VllmWorker")
                if success:
                    curr_gpu_usage -= self.args.decode_engine_num_gpu
                else:
                    logger.info("Failed to scale down decode worker")

        # check if we need to scale up workers
        # we first check for prefill worker because prefill queueing can also lead
        # to high kv load on decode workers
        if (
            avg_prefill_queue_load > self.args.prefill_queue_scale_up_threshold
            and curr_gpu_usage + self.args.prefill_engine_num_gpu
            <= self.args.max_gpu_budget
        ):
            logger.info(
                f"Average prefill queue load ({avg_prefill_queue_load:.2f}) is above threshold ({self.args.prefill_queue_scale_up_threshold:.2f})"
            )
            # check prefill queue size trend: extrapolate the change seen between the
            # first and last sample of this interval over the buffer period
            prefill_queue_size_change = (
                self.prefill_queue_load[-1] - self.prefill_queue_load[0]
            )
            predicted_prefill_future_queue_size = (
                self.prefill_queue_load[-1]
                + prefill_queue_size_change * NEW_PREFILL_WORKER_QUEUE_BUFFER_PERIOD
            )
            if (
                predicted_prefill_future_queue_size
                > self.args.prefill_queue_scale_up_threshold
            ):
                logger.info(
                    f"Predicted future prefill queue size ({predicted_prefill_future_queue_size:.2f}) is also above threshold ({self.args.prefill_queue_scale_up_threshold:.2f}), scaling up prefill workers"
                )
                success = await self.connector.add_component("PrefillWorker")
                if success:
                    curr_gpu_usage += self.args.prefill_engine_num_gpu
                else:
                    logger.info("Failed to scale up prefill worker")
            else:
                logger.info(
                    f"Predicted future prefill queue size ({predicted_prefill_future_queue_size:.2f}) is below threshold ({self.args.prefill_queue_scale_up_threshold:.2f}), skipping prefill worker scaling"
                )

        if (
            avg_kv_load > self.args.decode_kv_scale_up_threshold
            and curr_gpu_usage + self.args.decode_engine_num_gpu
            <= self.args.max_gpu_budget
        ):
            logger.info(
                f"Average kv load ({avg_kv_load:.2f}) is above threshold ({self.args.decode_kv_scale_up_threshold:.2f}), scaling up decode workers"
            )
            success = await self.connector.add_component("VllmWorker")
            if success:
                curr_gpu_usage += self.args.decode_engine_num_gpu
                # Start the grace period that blocks decode scale-down.
                self.decode_worker_remaining_grace_period = (
                    NEW_DECODE_WORKER_GRACE_PERIOD
                )
            else:
                logger.info("Failed to scale up decode worker")

        # no adjustment needed, just log the current metrics
        if (
            avg_prefill_queue_load > self.args.prefill_queue_scale_down_threshold
            and avg_prefill_queue_load < self.args.prefill_queue_scale_up_threshold
        ):
            logger.info(
                f"Average prefill queue load ({avg_prefill_queue_load:.2f}) is within threshold, no prefill worker scaling needed"
            )

        if (
            avg_kv_load > self.args.decode_kv_scale_down_threshold
            and avg_kv_load < self.args.decode_kv_scale_up_threshold
        ):
            logger.info(
                f"Average kv load ({avg_kv_load:.2f}) is within threshold, no decode worker scaling needed"
            )

        logger.info(f"Engines after adjustment use {curr_gpu_usage} GPUs")

        if self.decode_worker_remaining_grace_period > 0:
            self.decode_worker_remaining_grace_period -= 1

    async def run(self):
        """Main loop for the planner.

        Collects metrics every ``args.metric_pulling_interval`` seconds and, every
        ``args.adjustment_interval`` seconds, makes adjustments (unless
        ``args.no_operation`` is set) and starts a new adjustment interval.
        Runs until an exception propagates or the task is cancelled.
        """
        await self.set_metric_aggregator()

        # Read the flag directly instead of comparing bound logger methods.
        if self.args.no_operation:
            logger.info(
                "Running in no-operation mode - detailed metrics will be logged at DEBUG level"
            )

        await self.reset_adjustment_interval()

        while True:
            current_time = time.time()

            # Collect metrics at each metric pulling interval
            if (
                len(self.metrics_collection_time) == 0
                or current_time - self.metrics_collection_time[-1]
                >= self.args.metric_pulling_interval
            ):
                await self.collect_metrics()

            # Check if it's time for adjustment
            if (
                current_time - self.last_adjustment_time
                >= self.args.adjustment_interval
            ):
                if not self.args.no_operation:
                    # blockingly make adjustments to avoid overcompensation
                    await self.make_adjustments()
                await self.reset_adjustment_interval()

            # Sleep to avoid busy waiting
            await asyncio.sleep(self.args.metric_pulling_interval / 10)
# @dynamo_worker()
# TODO: let's make it such that planner still works via CLI invocation
async def start_planner(runtime: DistributedRuntime, args: argparse.Namespace):
    """Print the components registered under the namespace, then run the planner.

    Args:
        runtime: Distributed runtime; its etcd client is used to list the
            entries stored under ``args.namespace``.
        args: Planner options, passed on to ``Planner``.

    This coroutine awaits ``Planner.run()`` and therefore only returns when
    the planner loop ends.
    """
    planner = Planner(runtime, args)

    console = Console()
    table = Table()
    table.add_column("Component", style="cyan")
    table.add_column("Endpoint", style="green")

    components = await runtime.etcd_client().kv_get_prefix(args.namespace)
    for component in components:
        try:
            data = json.loads(component["value"].decode("utf-8"))
            if "component" in data:
                name = data["component"]
                endpoint = data["endpoint"]
                table.add_row(name, endpoint)
        except Exception as exc:
            # Some entries may not be valid JSON or might be binary data.
            # Listing is best effort, so skip them, but leave a trace at DEBUG
            # level instead of swallowing the error silently.
            logger.debug("Skipping etcd entry that could not be listed: %r", exc)

    console.print(table)

    await planner.run()
if __name__ == "__main__":
    # Command-line options as (flag, add_argument keyword arguments) pairs,
    # registered in this order. Defaults come from LoadPlannerDefaults.
    cli_options = [
        # Common planner arguments
        (
            "--namespace",
            {
                "type": str,
                "default": LoadPlannerDefaults.namespace,
                "help": "Namespace planner will look at",
            },
        ),
        (
            "--environment",
            {
                "type": str,
                "default": LoadPlannerDefaults.environment,
                "help": "Environment to run the planner in (local, kubernetes)",
            },
        ),
        (
            "--no-operation",
            {
                "action": "store_true",
                "default": LoadPlannerDefaults.no_operation,
                "help": "Do not make any adjustments, just observe the metrics",
            },
        ),
        (
            "--log-dir",
            {
                "type": str,
                "default": LoadPlannerDefaults.log_dir,
                "help": "Tensorboard logging directory",
            },
        ),
        (
            "--adjustment-interval",
            {
                "type": int,
                "default": LoadPlannerDefaults.adjustment_interval,
                "help": "Interval in seconds between scaling adjustments",
            },
        ),
        (
            "--max-gpu-budget",
            {
                "type": int,
                "default": LoadPlannerDefaults.max_gpu_budget,
                "help": "Maximum number of GPUs to use",
            },
        ),
        (
            "--min-endpoint",
            {
                "type": int,
                "default": LoadPlannerDefaults.min_endpoint,
                "help": "Minimum number of endpoints to keep for prefill/decode workers",
            },
        ),
        (
            "--metric-pulling-interval",
            {
                "type": int,
                "default": LoadPlannerDefaults.metric_pulling_interval,
                "help": "Interval in seconds between metric pulls",
            },
        ),
        (
            "--decode-engine-num-gpu",
            {
                "type": int,
                "default": LoadPlannerDefaults.decode_engine_num_gpu,
                "help": "Number of GPUs per decode engine",
            },
        ),
        (
            "--prefill-engine-num-gpu",
            {
                "type": int,
                "default": LoadPlannerDefaults.prefill_engine_num_gpu,
                "help": "Number of GPUs per prefill engine",
            },
        ),
        # Load-planner specific arguments
        (
            "--decode-kv-scale-up-threshold",
            {
                "type": float,
                "default": LoadPlannerDefaults.decode_kv_scale_up_threshold,
                "help": "KV cache utilization threshold to scale up decode workers",
            },
        ),
        (
            "--decode-kv-scale-down-threshold",
            {
                "type": float,
                "default": LoadPlannerDefaults.decode_kv_scale_down_threshold,
                "help": "KV cache utilization threshold to scale down decode workers",
            },
        ),
        (
            "--prefill-queue-scale-up-threshold",
            {
                "type": float,
                "default": LoadPlannerDefaults.prefill_queue_scale_up_threshold,
                "help": "Queue utilization threshold to scale up prefill workers, this threshold is per prefill worker",
            },
        ),
        (
            "--prefill-queue-scale-down-threshold",
            {
                "type": float,
                "default": LoadPlannerDefaults.prefill_queue_scale_down_threshold,
                "help": "Queue utilization threshold to scale down prefill workers, this threshold is per prefill worker",
            },
        ),
    ]

    parser = argparse.ArgumentParser()
    for flag, options in cli_options:
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    # Wrap start_planner with the dynamo_worker decorator at call time and run it.
    planner_entrypoint = dynamo_worker()(start_planner)
    asyncio.run(planner_entrypoint(args))
examples/llm/components/planner_service.py
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
import
logging
from
pydantic
import
BaseModel
from
components.planner
import
start_planner
# type: ignore[attr-defined]
from
dynamo.planner.defaults
import
LoadPlannerDefaults
from
dynamo.runtime.logging
import
configure_dynamo_logging
from
dynamo.sdk
import
async_on_start
,
dynamo_context
,
endpoint
,
service
from
dynamo.sdk.core.protocol.interface
import
ComponentType
from
dynamo.sdk.lib.config
import
ServiceConfig
from
dynamo.sdk.lib.image
import
DYNAMO_IMAGE
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class RequestType(BaseModel):
    """Request model accepted by the planner service's placeholder endpoint."""

    # Single text field carried by the request.
    text: str
@service(
    dynamo={
        "namespace": "dynamo",
        "component_type": ComponentType.PLANNER,
    },
    resources={"cpu": "10", "memory": "20Gi"},
    workers=1,
    image=DYNAMO_IMAGE,
)
class Planner:
    """Service wrapper that builds planner options from config and runs the planner."""

    def __init__(self):
        """Read the runtime, namespace and ``Planner`` config section into ``self.args``."""
        configure_dynamo_logging(service_name="Planner")
        logger.info("Starting planner")

        self.runtime = dynamo_context["runtime"]
        service_config = ServiceConfig.get_instance()

        # Get namespace directly from dynamo_context as it contains the active namespace
        self.namespace = dynamo_context["namespace"]
        planner_section = service_config.get("Planner", {})

        # Config keys use dashes; the matching Namespace / LoadPlannerDefaults
        # attribute is the same name with underscores.
        config_keys = (
            "environment",
            "no-operation",
            "log-dir",
            "adjustment-interval",
            "metric-pulling-interval",
            "max-gpu-budget",
            "min-endpoint",
            "decode-kv-scale-up-threshold",
            "decode-kv-scale-down-threshold",
            "prefill-queue-scale-up-threshold",
            "prefill-queue-scale-down-threshold",
            "decode-engine-num-gpu",
            "prefill-engine-num-gpu",
        )
        options = {}
        for key in config_keys:
            attr = key.replace("-", "_")
            options[attr] = planner_section.get(key, getattr(LoadPlannerDefaults, attr))

        self.args = argparse.Namespace(namespace=self.namespace, **options)

    @async_on_start
    async def async_init(self):
        """Wait 30 seconds, then hand control to ``start_planner``."""
        import asyncio

        await asyncio.sleep(30)
        logger.info("Calling start_planner")
        await start_planner(self.runtime, self.args)
        # Logged only after start_planner returns.
        logger.info("Planner started")

    @endpoint()
    async def generate(self, request: RequestType):
        """Placeholder endpoint so that this component exposes an endpoint."""
        yield "mock endpoint"
examples/llm/components/prefill_worker.py
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
logging
import
os
import
signal
import
sys
from
pydantic
import
BaseModel
from
utils.nixl
import
NixlMetadataStore
from
utils.prefill_queue
import
PrefillQueue
from
utils.vllm
import
parse_vllm_args
from
vllm.entrypoints.openai.api_server
import
(
build_async_engine_client_from_engine_args
,
)
from
vllm.inputs.data
import
TokensPrompt
from
vllm.remote_prefill
import
RemotePrefillParams
,
RemotePrefillRequest
from
dynamo.sdk
import
async_on_start
,
dynamo_context
,
endpoint
,
service
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class RequestType(BaseModel):
    """Request model accepted by the prefill worker's ``mock`` endpoint."""

    # Single text field carried by the request.
    text: str
@
service
(
dynamo
=
{
"namespace"
:
"dynamo"
,
},
resources
=
{
"gpu"
:
1
,
"cpu"
:
"10"
,
"memory"
:
"20Gi"
},
workers
=
1
,
)
class
PrefillWorker
:
def
__init__
(
self
):
class_name
=
self
.
__class__
.
__name__
self
.
engine_args
=
parse_vllm_args
(
class_name
,
""
)
self
.
_loaded_metadata
=
set
()
self
.
initialized
=
False
if
self
.
engine_args
.
enable_chunked_prefill
is
not
False
:
logger
.
info
(
"Chunked prefill is not supported yet, setting to False"
)
self
.
engine_args
.
enable_chunked_prefill
=
False
if
self
.
engine_args
.
pipeline_parallel_size
!=
1
:
logger
.
info
(
"Pipeline parallel size is not supported yet, setting to 1"
)
self
.
engine_args
.
pipeline_parallel_size
=
1
if
self
.
engine_args
.
disable_async_output_proc
is
not
True
:
logger
.
info
(
"Async output processing is not supported yet, setting to True"
)
self
.
engine_args
.
disable_async_output_proc
=
True
if
self
.
engine_args
.
enforce_eager
is
not
True
:
logger
.
info
(
"Prefill must be done eagerly, setting to True"
)
self
.
engine_args
.
enforce_eager
=
True
if
self
.
engine_args
.
enable_prefix_caching
is
not
False
:
logger
.
info
(
"Prefix caching is not supported yet in prefill worker, setting to False"
)
self
.
engine_args
.
enable_prefix_caching
=
False
@
async_on_start
async
def
async_init
(
self
):
self
.
_engine_context
=
build_async_engine_client_from_engine_args
(
self
.
engine_args
)
if
self
.
_engine_context
is
not
None
:
self
.
engine_client
=
await
self
.
_engine_context
.
__aenter__
()
else
:
raise
RuntimeError
(
"Failed to initialize engine client"
)
runtime
=
dynamo_context
[
"runtime"
]
metadata
=
self
.
engine_client
.
nixl_metadata
self
.
_metadata_store
=
NixlMetadataStore
(
"dynamo"
,
runtime
)
await
self
.
_metadata_store
.
put
(
metadata
.
engine_id
,
metadata
)
self
.
task
=
asyncio
.
create_task
(
self
.
prefill_queue_handler
())
def
prefill_queue_handler_cb
(
fut
):
try
:
fut
.
result
()
logger
.
info
(
"prefill queue handler exited successfully"
)
except
Exception
as
e
:
logger
.
error
(
f
"[ERROR] prefill queue handler failed:
{
e
!
r
}
"
)
sys
.
exit
(
1
)
self
.
task
.
add_done_callback
(
prefill_queue_handler_cb
)
self
.
shutdown_requested
=
False
# Set up signal handler for graceful shutdown
# TODO: move to dynamo sdk
loop
=
asyncio
.
get_running_loop
()
def
signal_handler
():
# Schedule the shutdown coroutine instead of calling it directly
asyncio
.
create_task
(
self
.
graceful_shutdown
(
runtime
))
for
sig
in
(
signal
.
SIGTERM
,
signal
.
SIGINT
):
loop
.
add_signal_handler
(
sig
,
signal_handler
)
logger
.
info
(
"PrefillWorker initialized"
)
async
def
graceful_shutdown
(
self
,
runtime
):
logger
.
info
(
"Received shutdown signal, shutting down DistributedRuntime"
)
# first shutdown the vllm engine
self
.
shutdown_requested
=
True
await
asyncio
.
wait_for
(
self
.
task
,
timeout
=
None
)
# then shutdown the mock endpoint
runtime
.
shutdown
()
logger
.
info
(
"DistributedRuntime shutdown complete"
)
def
shutdown_vllm_engine
(
self
):
"""Shutdown the background loop"""
logger
.
info
(
"Shutting down vllm engine"
)
loop
=
asyncio
.
get_event_loop
()
try
:
self
.
engine_client
.
close
()
logger
.
info
(
"PrefillWorker shutdown complete"
)
except
Exception
as
e
:
logger
.
error
(
f
"Error during shutdown:
{
e
}
"
)
finally
:
loop
.
stop
()
async
def
prefill_queue_handler
(
self
):
logger
.
info
(
"Prefill queue handler entered"
)
prefill_queue_nats_server
=
os
.
getenv
(
"NATS_SERVER"
,
"nats://localhost:4222"
)
namespace
,
_
=
PrefillWorker
.
dynamo_address
()
# type: ignore
prefill_queue_stream_name
=
f
"
{
namespace
}
_prefill_queue"
logger
.
info
(
f
"Prefill queue:
{
prefill_queue_nats_server
}
:
{
prefill_queue_stream_name
}
"
)
self
.
initialized
=
True
# TODO: integrate prefill_queue to a dynamo endpoint
async
with
PrefillQueue
.
get_instance
(
nats_server
=
prefill_queue_nats_server
,
stream_name
=
prefill_queue_stream_name
,
)
as
prefill_queue
:
logger
.
info
(
"prefill queue handler started"
)
while
True
:
# TODO: this might add a small overhead to pull prefill from nats
# need to test and check how much overhead it is
prefill_request
=
await
prefill_queue
.
dequeue_prefill_request
()
if
prefill_request
is
not
None
:
logger
.
info
(
f
"Dequeued prefill request:
{
prefill_request
.
request_id
}
"
)
async
for
_
in
self
.
generate
(
prefill_request
):
pass
if
self
.
shutdown_requested
:
logger
.
info
(
"Shutdown requested, checking if engine has any pending prefill sending requests"
)
while
True
:
if
not
await
self
.
engine_client
.
has_unfinished_requests
():
break
logger
.
info
(
"Engine has pending prefill sending requests, rechecking in 1 second..."
)
await
asyncio
.
sleep
(
1
)
self
.
shutdown_vllm_engine
()
break
async
def
generate
(
self
,
request
:
RemotePrefillRequest
):
sampling_params
=
request
.
sampling_params
sampling_params
.
max_tokens
=
1
sampling_params
.
min_tokens
=
1
remote_prefill_params
=
RemotePrefillParams
(
is_remote_decode
=
True
,
decode_block_ids
=
request
.
block_ids
,
decode_engine_id
=
request
.
engine_id
,
decode_computed_block_ids
=
request
.
computed_block_ids
,
)
# TODO check if metadata has changed
# and reload - currently only loading once
if
request
.
engine_id
not
in
self
.
_loaded_metadata
:
remote_metadata
=
await
self
.
_metadata_store
.
get
(
request
.
engine_id
)
await
self
.
engine_client
.
add_remote_nixl_metadata
(
remote_metadata
)
logger
.
info
(
f
"Loaded nixl metadata from engine
{
request
.
engine_id
}
into "
f
"engine
{
self
.
engine_client
.
nixl_metadata
.
engine_id
}
"
)
self
.
_loaded_metadata
.
add
(
request
.
engine_id
)
async
for
_
in
self
.
engine_client
.
generate
(
request_id
=
request
.
request_id
,
prompt
=
TokensPrompt
(
prompt_token_ids
=
request
.
prompt_token_ids
),
sampling_params
=
sampling_params
,
remote_prefill_params
=
remote_prefill_params
,
):
yield
@endpoint()
async def mock(self, req: RequestType):
    """Yield a single string that echoes ``req`` after a ``mock_response: `` prefix."""
    reply = f"mock_response: {req}"
    yield reply
examples/llm/components/processor.py
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
logging
import
uuid
from
enum
import
Enum
from
typing
import
Any
,
AsyncIterator
,
Dict
,
List
,
Tuple
,
Union
from
components.kv_router
import
Router
from
components.worker
import
VllmWorker
from
transformers
import
AutoTokenizer
from
utils.chat_processor
import
ChatProcessor
,
CompletionsProcessor
,
ProcessMixIn
from
utils.check_worker
import
check_required_workers
from
utils.protocol
import
LocalBlockHashes
,
MyRequestOutput
,
vLLMGenerateRequest
from
utils.vllm
import
RouterType
,
parse_vllm_args
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
,
CompletionRequest
from
vllm.outputs
import
RequestOutput
from
vllm.transformers_utils.tokenizer
import
AnyTokenizer
from
dynamo.llm
import
KvMetricsAggregator
,
compute_block_hash_for_seq_py
from
dynamo.runtime
import
EtcdKvCache
from
dynamo.sdk
import
async_on_start
,
depends
,
dynamo_context
,
endpoint
,
service
logger
=
logging
.
getLogger
(
__name__
)
class RequestType(Enum):
    """Kinds of request the processor distinguishes between."""

    # Chat request; responses are yielded as plain request outputs.
    CHAT = "chat"
    # Completion request; responses are yielded together with a prompt index.
    COMPLETION = "completion"
@
service
(
dynamo
=
{
"namespace"
:
"dynamo"
,
},
resources
=
{
"cpu"
:
"10"
,
"memory"
:
"20Gi"
},
workers
=
1
,
)
class
Processor
(
ProcessMixIn
):
"""
vLLM pre and post processing
"""
worker
=
depends
(
VllmWorker
)
router
=
depends
(
Router
)
def __init__(self):
    """Parse the engine arguments and build the request-processing state.

    Creates the model config, default sampling parameters, tokenizer and the
    chat / completions processors, plus the request queue, the map of
    per-request futures and the (initially empty) list of queue worker
    tasks. No tasks are started here.
    """
    class_name = self.__class__.__name__
    self.engine_args = parse_vllm_args(class_name, "")
    self.model_config = self.engine_args.create_model_config()
    self.default_sampling_params = self.model_config.get_diff_sampling_param()
    self.tokenizer = self._create_tokenizer(self.engine_args)
    self.chat_processor = ChatProcessor(self.tokenizer, self.model_config)
    self.completions_processor = CompletionsProcessor(
        self.tokenizer, self.model_config
    )
    self.min_workers = 1

    # Requests wait in this queue until a queue worker task picks them up.
    self.request_queue: asyncio.Queue[Dict[str, Any]] = asyncio.Queue()
    # Maps request id -> future that is resolved with the response generator.
    self.request_futures: Dict[str, asyncio.Future] = {}
    # Number of worker tasks to process the queue
    self.num_worker_tasks = self.engine_args.router_num_threads
    self.worker_tasks: List[asyncio.Task] = []

    # Report through the module logger, like the rest of this class,
    # instead of writing to stdout with print().
    logger.info(f"Processor init: {self.engine_args.router}")
def _create_tokenizer(self, engine_args: AsyncEngineArgs) -> AnyTokenizer:
    """Load the tokenizer for the model named in ``engine_args.model``.

    Args:
        engine_args: Engine arguments; only the ``model`` attribute is read.

    Returns:
        The object returned by ``AutoTokenizer.from_pretrained``.
    """
    # NOTE(review): trust_remote_code is always enabled here, regardless of
    # the engine arguments - confirm this is intended.
    tokenizer_options = {
        "trust_remote_code": True,
        "padding_side": "left",
        "truncation_side": "left",
        # VLLM might use the fast tokenizer for efficiency
        "use_fast": True,
    }
    return AutoTokenizer.from_pretrained(engine_args.model, **tokenizer_options)
@async_on_start
async def async_init(self):
    """Create the runtime clients and start the queue worker tasks.

    Builds a client for the worker's ``generate`` endpoint, and one for the
    router's ``generate`` endpoint when the configured router is one of the
    KV-based modes. It then calls ``check_required_workers``, creates the KV
    metrics aggregator and the etcd-backed cache, and finally starts the
    queue workers.
    """
    runtime = dynamo_context["runtime"]

    comp_ns, comp_name = VllmWorker.dynamo_address()  # type: ignore
    worker_endpoint = (
        runtime.namespace(comp_ns).component(comp_name).endpoint("generate")
    )
    self.worker_client = await worker_endpoint.client()

    kv_router_modes = (RouterType.KV, RouterType.KV_LOAD, RouterType.APPROX_KV)
    self.use_router = self.engine_args.router in kv_router_modes
    if self.use_router:
        router_ns, router_name = Router.dynamo_address()  # type: ignore
        router_endpoint = (
            runtime.namespace(router_ns).component(router_name).endpoint("generate")
        )
        self.router_client = await router_endpoint.client()

    await check_required_workers(self.worker_client, self.min_workers)

    # NOTE(review): namespace and component are hard-coded here rather than
    # taken from VllmWorker.dynamo_address() - confirm they always match.
    kv_listener = runtime.namespace("dynamo").component("VllmWorker")
    await kv_listener.create_service()
    self.metrics_aggregator = KvMetricsAggregator(kv_listener)

    etcd_client = runtime.etcd_client()
    cache_prefix = f"/{comp_ns}/processor/"
    cache_entries = {"router": self.engine_args.router}
    self.etcd_kv_cache = await EtcdKvCache.create(
        etcd_client, cache_prefix, cache_entries
    )

    # Start multiple worker tasks to process the queue
    self._start_worker_tasks()
def _start_worker_tasks(self):
    """Cancel unfinished queue workers, then spawn ``num_worker_tasks`` new ones."""
    # Cancel whatever is still running from a previous start.
    for stale_task in self.worker_tasks:
        if stale_task.done():
            continue
        stale_task.cancel()

    # Replace the task list with freshly created workers, one per worker id.
    self.worker_tasks = []
    self.worker_tasks.extend(
        asyncio.create_task(self._process_queue(worker_id=index))
        for index in range(self.num_worker_tasks)
    )

    logger.info(f"Started {self.num_worker_tasks} queue worker tasks")
async def _process_queue(self, worker_id: int):
    """Consume the request queue until this task is cancelled.

    Each queued item is handed to ``_process_request``. Errors from a single
    request are logged and do not stop the worker, and ``task_done`` is
    called for every item taken from the queue.

    Args:
        worker_id: Identifier used only in log messages.
    """
    logger.info(f"Queue worker {worker_id} started")
    while True:
        try:
            # Block until a request is available.
            queued_item = await self.request_queue.get()

            try:
                await self._process_request(queued_item)
            except Exception as exc:
                logger.error(f"Worker {worker_id}: Error processing request: {exc}")
            finally:
                # Always balance the get() above, even on failure.
                self.request_queue.task_done()

        except asyncio.CancelledError:
            logger.info(f"Queue worker {worker_id} was cancelled")
            return
        except Exception as exc:
            logger.error(
                f"Worker {worker_id}: Unexpected error in queue processing: {exc}"
            )
            # Sleep briefly to avoid tight error loops
            await asyncio.sleep(0.1)
async def _get_kv_load(self):
    """Return a dict mapping worker id to its reported ``gpu_cache_usage_perc``.

    Endpoints that lack the attribute are reported as ``0.0``.
    """
    snapshot = await self.metrics_aggregator.get_metrics()
    return {
        end_point.worker_id: getattr(end_point, "gpu_cache_usage_perc", 0.0)
        for end_point in snapshot.endpoints
    }
async def _get_pending_requests(self):
    """Return a dict mapping worker id to its reported ``num_requests_waiting``.

    Endpoints that lack the attribute are reported as ``0``.
    """
    metrics = await self.metrics_aggregator.get_metrics()
    # Read the attribute from the loop variable ``end_point``. The previous
    # code passed the imported ``endpoint`` decorator to getattr(), so every
    # worker was reported with the default of 0.
    return {
        end_point.worker_id: getattr(end_point, "num_requests_waiting", 0)
        for end_point in metrics.endpoints
    }
async def _generate(
    self,
    raw_request: Union[CompletionRequest, ChatCompletionRequest],
    request_type: RequestType,
):
    """Queue a raw request and stream back the responses produced for it.

    A fresh request id and future are registered in ``request_futures``, the
    request is put on ``request_queue``, and the async generator delivered
    through the future is relayed to the caller. The future is removed from
    ``request_futures`` when streaming ends for any reason.

    Args:
        raw_request: The incoming completion or chat completion request.
        request_type: Which kind of request ``raw_request`` is.

    Yields:
        Each response produced by the generator set on the future.
    """
    request_id = str(uuid.uuid4())
    logger.debug(f"Got raw request: {raw_request}")

    # Register the future before queueing so the queue worker can find it.
    pending: asyncio.Future[AsyncIterator[Any]] = asyncio.Future()
    self.request_futures[request_id] = pending

    # Enqueue the request with minimal processing
    queue_entry = {
        "request_id": request_id,
        "raw_request": raw_request,
        "request_type": request_type,
    }
    await self.request_queue.put(queue_entry)

    try:
        # The future resolves to an async generator of responses.
        response_stream = await pending
        async for item in response_stream:
            yield item
    finally:
        # Drop the bookkeeping entry if it is still there.
        self.request_futures.pop(request_id, None)
async def _process_request(self, request_data: Dict[str, Any]):
    """Process a single request from the queue.

    Parses the raw request, builds an async generator that routes the
    request to a worker and streams the responses, and sets that generator
    as the result of the request's future. If parsing or setting the result
    fails, the exception is set on the future instead (when it is still
    registered and not done).

    Args:
        request_data: Queue entry with the keys ``"request_id"``,
            ``"raw_request"`` and ``"request_type"``.
    """
    request_id = request_data["request_id"]
    raw_request = request_data["raw_request"]
    request_type = request_data["request_type"]

    try:
        # Parse the raw request here instead of in _generate
        (
            request,
            conversation,
            _prompt,
            engine_prompt,
            sampling_params,
        ) = await self._parse_raw_request(raw_request)

        # Create an async generator function to process this request
        async def process_and_stream():
            # TODO: queue request at processor when engines are full
            router_mode = (await self.etcd_kv_cache.get("router")).decode()

            # Keep the routing decision in a local variable: this generator
            # awaits between the two places that consult it, and a request
            # running concurrently may overwrite self.use_router meanwhile.
            use_router = router_mode in (
                RouterType.KV,
                RouterType.KV_LOAD,
                RouterType.APPROX_KV,
            )
            # Still mirrored on the instance for code that reads the attribute.
            self.use_router = use_router

            prefix_hit_rate = 0.0  # Default value
            worker_id = ""
            if use_router:
                token_ids = engine_prompt["prompt_token_ids"]
                router_generator = await self.router_client.generate(
                    LocalBlockHashes(
                        hashes=compute_block_hash_for_seq_py(
                            token_ids, self.engine_args.block_size
                        ),
                        tokens=token_ids,
                        num_tokens=len(token_ids),
                    ).model_dump_json()
                )
                decision = await router_generator.__anext__()
                worker_id, prefix_hit_rate = decision.data()
                prefix_hit_rate = float(prefix_hit_rate)

            # Create request object once with default prefix_hit_rate
            request_obj = vLLMGenerateRequest(
                engine_prompt=engine_prompt,
                sampling_params=sampling_params,
                request_id=request_id,
                prefix_hit_rate=prefix_hit_rate,
            ).model_dump_json()

            if use_router:
                if worker_id == "":
                    # The router returned no specific worker.
                    engine_generator = await self.worker_client.generate(request_obj)
                else:
                    engine_generator = await self.worker_client.direct(
                        request_obj, int(worker_id)
                    )
            elif router_mode == RouterType.RANDOM:
                engine_generator = await self.worker_client.generate(request_obj)
            elif router_mode == RouterType.ROUND_ROBIN:
                engine_generator = await self.worker_client.round_robin(request_obj)
            else:
                # Fail with a clear message instead of an UnboundLocalError
                # on engine_generator further down.
                raise ValueError(f"Unsupported router mode: {router_mode!r}")

            output_generator = self._generate_responses(engine_generator, request_type)

            # Stream responses directly to the caller
            async for response in await self._stream_response(
                request, output_generator, request_id, conversation
            ):
                yield response

        # Set the future result to our async generator
        if request_id in self.request_futures:
            self.request_futures[request_id].set_result(process_and_stream())

    except Exception as e:
        logger.error(f"Error processing request {request_id}: {e}")
        # Set exception on the future if it still exists
        if (
            request_id in self.request_futures
            and not self.request_futures[request_id].done()
        ):
            self.request_futures[request_id].set_exception(e)
async def _generate_responses(
    self, engine_generator: AsyncIterator[RequestOutput], request_type: RequestType
) -> AsyncIterator[Union[RequestOutput, Tuple[int, RequestOutput]]]:
    """Convert serialized engine responses into ``RequestOutput`` objects.

    Args:
        engine_generator: Async iterator of engine responses; each item's
            ``data()`` is parsed as JSON into a ``MyRequestOutput``.
        request_type: Selects the shape of the yielded items.

    Yields:
        The ``RequestOutput`` itself for chat requests, or a
        ``(prompt_idx, RequestOutput)`` tuple for completion requests.

    Raises:
        NotImplementedError: For any other request type, raised when the
            first engine response is handled.
    """
    prompt_idx = 0
    # Fields copied one-to-one from the parsed output onto the RequestOutput.
    copied_fields = (
        "request_id",
        "prompt",
        "prompt_token_ids",
        "prompt_logprobs",
        "outputs",
        "finished",
        "metrics",
    )
    async for raw in engine_generator:
        # Deserialize the response from the engine
        # Creates correct vLLM objects for each field
        parsed = MyRequestOutput.model_validate_json(raw.data())

        # OpenAIServingChat.chat_completion_stream_generator() method expects a RequestOutput object
        request_output = RequestOutput(
            **{field: getattr(parsed, field) for field in copied_fields}
        )

        if request_type == RequestType.CHAT:
            # For chat requests, yield the request_output directly.
            yield request_output
            continue
        if request_type == RequestType.COMPLETION:
            # Completion requests can have multiple prompts and stream generator requires the prompt index
            yield (prompt_idx, request_output)
            continue
        raise NotImplementedError(f"Request type {request_type} not implemented")
@endpoint(name="chat/completions")
async def chat_completions(self, raw_request: ChatCompletionRequest):
    """Stream the responses for a chat completion request via ``_generate``."""
    chat_stream = self._generate(raw_request, RequestType.CHAT)
    async for chunk in chat_stream:
        yield chunk
# @endpoint()
# async def completions(self, raw_request: CompletionRequest):
# async for response in self._generate(raw_request, RequestType.COMPLETION):
# yield response
examples/llm/components/worker.py
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
logging
import
os
import
signal
from
components.disagg_router
import
PyDisaggregatedRouter
from
components.prefill_worker
import
PrefillWorker
from
utils.nixl
import
NixlMetadataStore
from
utils.prefill_queue
import
PrefillQueue
from
utils.protocol
import
MyRequestOutput
,
vLLMGenerateRequest
from
utils.vllm
import
RouterType
,
parse_vllm_args
from
vllm.entrypoints.openai.api_server
import
(
build_async_engine_client_from_engine_args
,
)
from
vllm.remote_prefill
import
RemotePrefillParams
,
RemotePrefillRequest
from
vllm.sampling_params
import
RequestOutputKind
from
dynamo.llm
import
ForwardPassMetrics
,
KvStats
,
WorkerMetricsPublisher
,
WorkerStats
from
dynamo.sdk
import
async_on_start
,
depends
,
dynamo_context
,
endpoint
,
service
logger
=
logging
.
getLogger
(
__name__
)
@
service
(
dynamo
=
{
"namespace"
:
"dynamo"
,
},
resources
=
{
"gpu"
:
1
,
"cpu"
:
"10"
,
"memory"
:
"20Gi"
},
workers
=
1
,
)
class
VllmWorker
:
prefill_worker
=
depends
(
PrefillWorker
)
def __init__(self):
    """Parse the engine arguments and prepare the worker's static state.

    Besides storing the parsed arguments this overrides engine settings that
    the log messages below describe as unsupported with remote prefill,
    enables prefix caching for the KV router modes, exports the
    ``VLLM_*`` environment variables for those modes, creates the metrics
    publisher and installs SIGTERM / SIGINT handlers.
    """
    self.client = None
    self.disaggregated_router: PyDisaggregatedRouter = None  # type: ignore

    class_name = self.__class__.__name__
    args = parse_vllm_args(class_name, "")
    self.engine_args = args
    self.do_remote_prefill = args.remote_prefill

    # Location of the prefill queue: NATS server plus a per-namespace stream.
    self._prefill_queue_nats_server = os.environ.get(
        "NATS_SERVER", "nats://localhost:4222"
    )
    self.namespace, _ = VllmWorker.dynamo_address()  # type: ignore
    self._prefill_queue_stream_name = f"{self.namespace}_prefill_queue"
    logger.info(
        f"Prefill queue: {self._prefill_queue_nats_server}:{self._prefill_queue_stream_name}"
    )

    if args.remote_prefill:
        if args.enable_chunked_prefill is not False:
            logger.info("Chunked prefill is not supported yet, setting to False")
            args.enable_chunked_prefill = False

        if args.preemption_mode != "swap":
            logger.info("Preemption mode is not supported yet, setting to swap")
            args.preemption_mode = "swap"

        if args.pipeline_parallel_size != 1:
            logger.info("Pipeline parallel size is not supported yet, setting to 1")
            args.pipeline_parallel_size = 1

    if args.router in (RouterType.KV, RouterType.APPROX_KV):
        if not args.enable_prefix_caching:
            logger.info(
                "When using KV router, prefix caching must be enabled, setting to True"
            )
            args.enable_prefix_caching = True

        # The lease id of the first endpoint is exported as the worker id.
        lease_id = dynamo_context["endpoints"][0].lease_id()
        os.environ["VLLM_WORKER_ID"] = str(lease_id)
        os.environ["VLLM_KV_NAMESPACE"] = "dynamo"
        os.environ["VLLM_KV_COMPONENT"] = class_name

    self.metrics_publisher = WorkerMetricsPublisher()

    for signum in (signal.SIGTERM, signal.SIGINT):
        signal.signal(signum, self.shutdown_vllm_engine)
@async_on_start
async def async_init(self):
    """Start the engine client and wire up metrics, routing and signal handling.

    Enters the engine client context, attaches the metrics publisher and
    publishes an initial set of placeholder metrics, schedules creation of
    the metrics publisher endpoint, stores the engine's nixl metadata when
    remote prefill is enabled, creates the disaggregated router when
    conditional disaggregation is enabled, and registers SIGTERM / SIGINT
    handlers on the running loop that schedule ``graceful_shutdown``.

    Raises:
        RuntimeError: If no engine client context could be built.
    """
    self._engine_context = build_async_engine_client_from_engine_args(
        self.engine_args
    )
    if self._engine_context is None:
        raise RuntimeError("Failed to initialize engine client")
    self.engine_client = await self._engine_context.__aenter__()

    self.engine_client.set_metrics_publisher(self.metrics_publisher)

    # Initially send dummy metrics to kick start,
    # vLLM will not update stat until forward pass is triggered
    worker_stats = WorkerStats(
        0,  # request_active_slots
        1024,  # request_total_slots
        0,  # num_requests_waiting
        None,  # data_parallel_rank
    )
    kv_stats = KvStats(
        0,  # kv_active_blocks
        1024,  # kv_total_blocks
        0.0,  # gpu_cache_usage_perc
        0.0,  # gpu_prefix_cache_hit_rate
    )
    metrics = ForwardPassMetrics(
        worker_stats=worker_stats,
        kv_stats=kv_stats,
        spec_decode_stats=None,
    )
    self.metrics_publisher.publish(metrics)

    def on_metrics_endpoint_done(finished_task):
        # Report the real outcome; the success message used to be logged
        # even when the task had failed or been cancelled.
        if finished_task.cancelled():
            logger.warning("metrics publisher endpoint task was cancelled")
            return
        error = finished_task.exception()
        if error is not None:
            logger.error(f"Failed to create metrics publisher endpoint: {error}")
            return
        logger.info("metrics publisher endpoint created")

    # Keep a reference on the instance: the event loop only holds weak
    # references to tasks, so an unreferenced task may be garbage-collected
    # before it finishes.
    self._metrics_endpoint_task = asyncio.create_task(
        self.create_metrics_publisher_endpoint()
    )
    self._metrics_endpoint_task.add_done_callback(on_metrics_endpoint_done)

    runtime = dynamo_context["runtime"]

    if self.engine_args.remote_prefill:
        metadata = self.engine_client.nixl_metadata
        metadata_store = NixlMetadataStore("dynamo", runtime)
        await metadata_store.put(metadata.engine_id, metadata)

    if self.engine_args.conditional_disagg:
        self.disaggregated_router = PyDisaggregatedRouter(
            runtime,
            self.namespace,
            max_local_prefill_length=self.engine_args.max_local_prefill_length,
            max_prefill_queue_size=self.engine_args.max_prefill_queue_size,
        )
        await self.disaggregated_router.async_init()
    else:
        self.disaggregated_router = None

    # Set up signal handler for graceful shutdown
    # TODO: move to dynamo sdk
    loop = asyncio.get_running_loop()

    def signal_handler():
        # Schedule the shutdown coroutine instead of calling it directly,
        # and keep a reference so the task cannot be collected mid-flight.
        self._shutdown_task = asyncio.create_task(self.graceful_shutdown(runtime))

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, signal_handler)

    logger.info("VllmWorker has been initialized")
async def graceful_shutdown(self, runtime):
    """Call ``runtime.shutdown()`` and log before and after.

    Args:
        runtime: Object whose ``shutdown()`` method is invoked.
    """
    messages = (
        "Received shutdown signal, shutting down DistributedRuntime",
        "DistributedRuntime shutdown complete",
    )
    logger.info(messages[0])
    runtime.shutdown()
    logger.info(messages[1])
def shutdown_vllm_engine(self, signum, frame):
    """Close the engine client and stop the event loop.

    The signature matches what ``signal.signal`` passes to a handler. The
    loop is stopped even when closing the engine client raises.

    Args:
        signum: Signal number, used only in the log message.
        frame: Unused.
    """
    logger.info(f"Received signal {signum}, shutting down")
    event_loop = asyncio.get_event_loop()
    try:
        self.engine_client.close()
    except Exception as exc:
        logger.error(f"Error during shutdown: {exc}")
    else:
        logger.info("VllmWorker shutdown complete")
    finally:
        event_loop.stop()
async def create_metrics_publisher_endpoint(self):
    """Create the metrics publisher endpoint on the current dynamo component."""
    logger.info("Creating metrics publisher endpoint with primary lease")
    current_component = dynamo_context["component"]
    await self.metrics_publisher.create_endpoint(current_component)
def get_remote_prefill_request_callback(self):
    """Return a coroutine function that enqueues a remote prefill request.

    The returned callback opens the prefill queue configured on this worker
    each time it is called and enqueues the given request on it.
    """

    # TODO: integrate prefill_queue to dynamo endpoint
    async def callback(request: RemotePrefillRequest):
        queue_context = PrefillQueue.get_instance(
            nats_server=self._prefill_queue_nats_server,
            stream_name=self._prefill_queue_stream_name,
        )
        async with queue_context as queue:
            await queue.enqueue_prefill_request(request)

    return callback
# TODO: use the same child lease for metrics publisher endpoint and generate endpoint
@endpoint()
async def generate(self, request: vLLMGenerateRequest):
    """Generate responses for a request, prefilling locally or remotely.

    When a disaggregated router is configured it is asked, given the prompt
    length, the request's prefix hit rate and the current prefill queue
    size, whether to prefill remotely; without one the decision defaults to
    remote. Remote prefill is only used when it is also enabled on this
    worker.

    Args:
        request: Request carrying the engine prompt, sampling parameters,
            request id and prefix hit rate. Its sampling parameters are
            switched to delta output in place.

    Yields:
        A JSON string per engine response (``MyRequestOutput.model_dump_json``).
    """
    prompt_length = len(request.engine_prompt["prompt_token_ids"])

    # TODO: consider prefix hit when deciding prefill locally or remotely
    router = self.disaggregated_router
    if router is None:
        # always prefill remotely if no disaggregated router is provided
        wants_remote_prefill = True
    else:
        async with PrefillQueue.get_instance(
            nats_server=self._prefill_queue_nats_server,
            stream_name=self._prefill_queue_stream_name,
        ) as prefill_queue:
            queue_size = await prefill_queue.get_queue_size()
        wants_remote_prefill = await router.prefill_remote(
            prompt_length,
            request.prefix_hit_rate,
            queue_size,
        )

    if self.do_remote_prefill and wants_remote_prefill:
        remote_prefill_params = RemotePrefillParams(
            is_remote_prefill=True,
            remote_prefill_request_callback=self.get_remote_prefill_request_callback(),
        )
        logger.info(
            f"Prefilling remotely for request {request.request_id} with length {prompt_length}"
        )
    else:
        remote_prefill_params = None
        logger.info(
            f"Prefilling locally for request {request.request_id} with length {prompt_length}"
        )

    # rust HTTP requires Delta streaming
    request.sampling_params.output_kind = RequestOutputKind.DELTA

    engine_stream = self.engine_client.generate(
        prompt=request.engine_prompt,
        sampling_params=request.sampling_params,
        request_id=request.request_id,
        remote_prefill_params=remote_prefill_params,
    )
    async for engine_output in engine_stream:
        serializable = MyRequestOutput(
            request_id=engine_output.request_id,
            prompt=engine_output.prompt,
            prompt_token_ids=engine_output.prompt_token_ids,
            prompt_logprobs=engine_output.prompt_logprobs,
            outputs=engine_output.outputs,
            finished=engine_output.finished,
        )
        yield serializable.model_dump_json()
examples/llm/configs/agg.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
router
:
round-robin
router-num-threads
:
4
common-configs
:
[
model
,
block-size
,
max-model-len
]
VllmWorker
:
enforce-eager
:
true
max-num-batched-tokens
:
16384
enable-prefix-caching
:
true
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
]
Planner
:
environment
:
local
no-operation
:
true
\ No newline at end of file
examples/llm/configs/agg_router.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
router
:
kv
block-size
:
64
max-model-len
:
16384
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
common-configs
:
[
model
,
block-size
,
max-model-len
,
router
]
Router
:
min-workers
:
1
softmax-sample
:
true
common-configs
:
[
model
,
block-size
,
router
]
VllmWorker
:
enforce-eager
:
true
max-num-batched-tokens
:
16384
enable-prefix-caching
:
true
tensor-parallel-size
:
1
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
,
router
,
kv-transfer-config
]
Planner
:
environment
:
local
no-operation
:
true
\ No newline at end of file
examples/llm/configs/disagg.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
router
:
round-robin
common-configs
:
[
model
,
block-size
]
VllmWorker
:
remote-prefill
:
true
conditional-disagg
:
true
max-local-prefill-length
:
10
max-prefill-queue-size
:
2
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
]
PrefillWorker
:
max-num-batched-tokens
:
16384
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
]
Planner
:
environment
:
local
no-operation
:
true
examples/llm/configs/disagg_router.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
router
:
kv
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
common-configs
:
[
model
,
block-size
,
max-model-len
,
router
]
Router
:
min-workers
:
1
common-configs
:
[
model
,
block-size
,
router
]
VllmWorker
:
max-num-batched-tokens
:
16384
remote-prefill
:
true
conditional-disagg
:
true
max-local-prefill-length
:
10
max-prefill-queue-size
:
2
tensor-parallel-size
:
1
enable-prefix-caching
:
true
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
,
router
,
kv-transfer-config
]
PrefillWorker
:
max-num-batched-tokens
:
16384
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
]
Planner
:
environment
:
local
no-operation
:
true
\ No newline at end of file
examples/llm/configs/multinode-405b.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This configuration file is used in the multinode-examples.md file
# to start the 405B model on 3 nodes.
Frontend
:
served_model_name
:
nvidia/Llama-3.1-405B-Instruct-FP8
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
model
:
nvidia/Llama-3.1-405B-Instruct-FP8
block-size
:
64
max-model-len
:
8192
router
:
kv
Router
:
model
:
nvidia/Llama-3.1-405B-Instruct-FP8
min-workers
:
1
VllmWorker
:
model
:
nvidia/Llama-3.1-405B-Instruct-FP8
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
block-size
:
64
max-model-len
:
8192
max-num-seqs
:
16
remote-prefill
:
true
conditional-disagg
:
true
max-local-prefill-length
:
10
max-prefill-queue-size
:
2
gpu-memory-utilization
:
0.95
tensor-parallel-size
:
8
router
:
kv
quantization
:
modelopt
enable-prefix-caching
:
true
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
8'
PrefillWorker
:
model
:
nvidia/Llama-3.1-405B-Instruct-FP8
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
block-size
:
64
max-model-len
:
8192
max-num-seqs
:
16
gpu-memory-utilization
:
0.95
tensor-parallel-size
:
8
quantization
:
modelopt
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
8'
\ No newline at end of file
examples/llm/configs/multinode_agg_r1.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1
block-size
:
64
max-model-len
:
16384
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
router
:
round-robin
common-configs
:
[
model
,
block-size
,
max-model-len
]
VllmWorker
:
enforce-eager
:
true
max-num-batched-tokens
:
16384
enable-prefix-caching
:
true
router
:
random
tensor-parallel-size
:
16
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
1'
common-configs
:
[
model
,
block-size
,
max-model-len
]
examples/llm/configs/mutinode_disagg_r1.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Common
:
model
:
deepseek-ai/DeepSeek-R1
block-size
:
64
max-model-len
:
16384
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
tensor-parallel-size
:
16
disable-log-requests
:
true
Frontend
:
served_model_name
:
deepseek-ai/DeepSeek-R1
endpoint
:
dynamo.Processor.chat/completions
port
:
8000
Processor
:
router
:
round-robin
common-configs
:
[
model
,
block-size
]
VllmWorker
:
remote-prefill
:
true
conditional-disagg
:
false
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
16'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
,
tensor-parallel-size
,
disable-log-requests
]
PrefillWorker
:
max-num-batched-tokens
:
16384
ServiceArgs
:
workers
:
1
resources
:
gpu
:
'
16'
common-configs
:
[
model
,
block-size
,
max-model-len
,
kv-transfer-config
,
tensor-parallel-size
,
disable-log-requests
]
examples/llm/deploy/agg.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion
:
nvidia.com/v1alpha1
kind
:
DynamoGraphDeployment
metadata
:
name
:
llm-agg
spec
:
envs
:
-
name
:
DYN_DEPLOYMENT_CONFIG
value
:
'
{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","router-num-threads":4,"common-configs":["model","block-size","max-model-len"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
services
:
Frontend
:
dynamoNamespace
:
llm-agg
componentType
:
main
replicas
:
1
resources
:
requests
:
cpu
:
"
1"
memory
:
"
2Gi"
limits
:
cpu
:
"
1"
memory
:
"
2Gi"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir
:
/workspace/examples/llm
args
:
-
dynamo
-
serve
-
graphs.agg:Frontend
-
--system-app-port
-
"
5000"
-
--enable-system-app
-
--use-default-health-checks
-
--service-name
-
Frontend
Processor
:
dynamoNamespace
:
llm-agg
componentType
:
worker
replicas
:
1
resources
:
requests
:
cpu
:
"
1"
memory
:
"
2Gi"
limits
:
cpu
:
"
1"
memory
:
"
2Gi"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir
:
/workspace/examples/llm
args
:
-
dynamo
-
serve
-
graphs.agg:Processor
-
--system-app-port
-
"
5000"
-
--enable-system-app
-
--use-default-health-checks
-
--service-name
-
Processor
VllmWorker
:
envFromSecret
:
hf-token-secret
dynamoNamespace
:
llm-agg
replicas
:
1
resources
:
requests
:
cpu
:
"
10"
memory
:
"
20Gi"
gpu
:
"
1"
limits
:
cpu
:
"
10"
memory
:
"
20Gi"
gpu
:
"
1"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir
:
/workspace/examples/llm
args
:
-
dynamo
-
serve
-
graphs.agg:VllmWorker
-
--system-app-port
-
"
5000"
-
--enable-system-app
-
--use-default-health-checks
-
--service-name
-
VllmWorker
examples/llm/deploy/agg_router.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion
:
nvidia.com/v1alpha1
kind
:
DynamoGraphDeployment
metadata
:
name
:
agg-router
spec
:
envs
:
-
name
:
DYN_DEPLOYMENT_CONFIG
value
:
'
{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","router":"kv","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"softmax-sample":true,"common-configs":["model","block-size","router"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"tensor-parallel-size":1,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
services
:
Frontend
:
dynamoNamespace
:
llm-agg-router
componentType
:
main
replicas
:
1
resources
:
requests
:
cpu
:
"
1"
memory
:
"
2Gi"
limits
:
cpu
:
"
1"
memory
:
"
2Gi"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir
:
/workspace/examples/llm
args
:
-
dynamo
-
serve
-
graphs.agg_router:Frontend
-
--system-app-port
-
"
5000"
-
--enable-system-app
-
--use-default-health-checks
-
--service-name
-
Frontend
Processor
:
dynamoNamespace
:
llm-agg-router
componentType
:
worker
replicas
:
1
resources
:
requests
:
cpu
:
"
1"
memory
:
"
2Gi"
limits
:
cpu
:
"
1"
memory
:
"
2Gi"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir
:
/workspace/examples/llm
args
:
-
dynamo
-
serve
-
graphs.agg_router:Processor
-
--system-app-port
-
"
5000"
-
--enable-system-app
-
--use-default-health-checks
-
--service-name
-
Processor
Router
:
dynamoNamespace
:
llm-agg-router
componentType
:
worker
replicas
:
1
resources
:
requests
:
cpu
:
"
1"
memory
:
"
2Gi"
limits
:
cpu
:
"
1"
memory
:
"
2Gi"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir
:
/workspace/examples/llm
args
:
-
dynamo
-
serve
-
graphs.agg_router:Router
-
--system-app-port
-
"
5000"
-
--enable-system-app
-
--use-default-health-checks
-
--service-name
-
Router
VllmWorker
:
envFromSecret
:
hf-token-secret
dynamoNamespace
:
llm-agg-router
replicas
:
1
resources
:
requests
:
cpu
:
"
10"
memory
:
"
20Gi"
gpu
:
"
1"
limits
:
cpu
:
"
10"
memory
:
"
20Gi"
gpu
:
"
1"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir
:
/workspace/examples/llm
args
:
-
dynamo
-
serve
-
graphs.agg_router:VllmWorker
-
--system-app-port
-
"
5000"
-
--enable-system-app
-
--use-default-health-checks
-
--service-name
-
VllmWorker
examples/llm/deploy/disagg.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion
:
nvidia.com/v1alpha1
kind
:
DynamoGraphDeployment
metadata
:
name
:
llm-disagg
spec
:
envs
:
-
name
:
DYN_DEPLOYMENT_CONFIG
value
:
'
{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","common-configs":["model","block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
services
:
Frontend
:
dynamoNamespace
:
llm-disagg
componentType
:
main
replicas
:
1
resources
:
requests
:
cpu
:
"
1"
memory
:
"
2Gi"
limits
:
cpu
:
"
1"
memory
:
"
2Gi"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir
:
/workspace/examples/llm
args
:
-
dynamo
-
serve
-
graphs.disagg:Frontend
-
--system-app-port
-
"
5000"
-
--enable-system-app
-
--use-default-health-checks
-
--service-name
-
Frontend
Processor
:
dynamoNamespace
:
llm-disagg
componentType
:
worker
replicas
:
1
resources
:
requests
:
cpu
:
"
1"
memory
:
"
2Gi"
limits
:
cpu
:
"
1"
memory
:
"
2Gi"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir
:
/workspace/examples/llm
args
:
-
dynamo
-
serve
-
graphs.disagg:Processor
-
--system-app-port
-
"
5000"
-
--enable-system-app
-
--use-default-health-checks
-
--service-name
-
Processor
VllmWorker
:
envFromSecret
:
hf-token-secret
dynamoNamespace
:
llm-disagg
replicas
:
1
resources
:
requests
:
cpu
:
"
10"
memory
:
"
20Gi"
gpu
:
"
1"
limits
:
cpu
:
"
10"
memory
:
"
20Gi"
gpu
:
"
1"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir
:
/workspace/examples/llm
args
:
-
dynamo
-
serve
-
graphs.disagg:VllmWorker
-
--system-app-port
-
"
5000"
-
--enable-system-app
-
--use-default-health-checks
-
--service-name
-
VllmWorker
PrefillWorker
:
envFromSecret
:
hf-token-secret
dynamoNamespace
:
llm-disagg
replicas
:
1
resources
:
requests
:
cpu
:
"
10"
memory
:
"
20Gi"
gpu
:
"
1"
limits
:
cpu
:
"
10"
memory
:
"
20Gi"
gpu
:
"
1"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir
:
/workspace/examples/llm
args
:
-
dynamo
-
serve
-
graphs.disagg:PrefillWorker
-
--system-app-port
-
"
5000"
-
--enable-system-app
-
--use-default-health-checks
-
--service-name
-
PrefillWorker
examples/llm/deploy/disagg_router.yaml
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion
:
nvidia.com/v1alpha1
kind
:
DynamoGraphDeployment
metadata
:
name
:
disagg-router
spec
:
envs
:
-
name
:
DYN_DEPLOYMENT_CONFIG
value
:
'
{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"router":"kv","kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"common-configs":["model","block-size","router"]},"VllmWorker":{"max-num-batched-tokens":16384,"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"tensor-parallel-size":1,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
services
:
Frontend
:
dynamoNamespace
:
llm-disagg-router
componentType
:
main
replicas
:
1
resources
:
requests
:
cpu
:
"
1"
memory
:
"
2Gi"
limits
:
cpu
:
"
1"
memory
:
"
2Gi"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir
:
/workspace/examples/llm
args
:
-
dynamo
-
serve
-
graphs.disagg_router:Frontend
-
--system-app-port
-
"
5000"
-
--enable-system-app
-
--use-default-health-checks
-
--service-name
-
Frontend
Processor
:
dynamoNamespace
:
llm-disagg-router
componentType
:
worker
replicas
:
1
resources
:
requests
:
cpu
:
"
1"
memory
:
"
2Gi"
limits
:
cpu
:
"
1"
memory
:
"
2Gi"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir
:
/workspace/examples/llm
args
:
-
dynamo
-
serve
-
graphs.disagg_router:Processor
-
--system-app-port
-
"
5000"
-
--enable-system-app
-
--use-default-health-checks
-
--service-name
-
Processor
Router
:
dynamoNamespace
:
llm-disagg-router
componentType
:
worker
replicas
:
1
resources
:
requests
:
cpu
:
"
1"
memory
:
"
2Gi"
limits
:
cpu
:
"
1"
memory
:
"
2Gi"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir
:
/workspace/examples/llm
args
:
-
dynamo
-
serve
-
graphs.disagg_router:Router
-
--system-app-port
-
"
5000"
-
--enable-system-app
-
--use-default-health-checks
-
--service-name
-
Router
VllmWorker
:
envFromSecret
:
hf-token-secret
dynamoNamespace
:
llm-disagg-router
replicas
:
1
resources
:
requests
:
cpu
:
"
10"
memory
:
"
20Gi"
gpu
:
"
1"
limits
:
cpu
:
"
10"
memory
:
"
20Gi"
gpu
:
"
1"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir
:
/workspace/examples/llm
args
:
-
dynamo
-
serve
-
graphs.disagg_router:VllmWorker
-
--system-app-port
-
"
5000"
-
--enable-system-app
-
--use-default-health-checks
-
--service-name
-
VllmWorker
PrefillWorker
:
envFromSecret
:
hf-token-secret
dynamoNamespace
:
llm-disagg-router
replicas
:
1
resources
:
requests
:
cpu
:
"
10"
memory
:
"
20Gi"
gpu
:
"
1"
limits
:
cpu
:
"
10"
memory
:
"
20Gi"
gpu
:
"
1"
extraPodSpec
:
mainContainer
:
image
:
nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
workingDir
:
/workspace/examples/llm
args
:
-
dynamo
-
serve
-
graphs.disagg_router:PrefillWorker
-
--system-app-port
-
"
5000"
-
--enable-system-app
-
--use-default-health-checks
-
--service-name
-
PrefillWorker
examples/llm/graphs/__init__.py
deleted
100644 → 0
View file @
c7080419
examples/llm/graphs/agg.py
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
components.frontend
import
Frontend
from
components.planner_service
import
Planner
from
components.processor
import
Processor
from
components.worker
import
VllmWorker
Frontend
.
link
(
Processor
).
link
(
VllmWorker
)
Frontend
.
link
(
Planner
)
examples/llm/graphs/agg_router.py
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
components.frontend
import
Frontend
from
components.kv_router
import
Router
from
components.planner_service
import
Planner
from
components.processor
import
Processor
from
components.worker
import
VllmWorker
Frontend
.
link
(
Processor
).
link
(
Router
).
link
(
VllmWorker
)
Frontend
.
link
(
Planner
)
examples/llm/graphs/disagg.py
deleted
100644 → 0
View file @
c7080419
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
components.frontend
import
Frontend
from
components.planner_service
import
Planner
from
components.prefill_worker
import
PrefillWorker
from
components.processor
import
Processor
from
components.worker
import
VllmWorker
Frontend
.
link
(
Processor
).
link
(
VllmWorker
).
link
(
PrefillWorker
)
Frontend
.
link
(
Planner
)
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment