Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
41ec2338
Commit
41ec2338
authored
Mar 17, 2025
by
ptarasiewiczNV
Committed by
GitHub
Mar 17, 2025
Browse files
docs: Update dynamo serve disagg example (#212)
Co-authored-by: ptarasiewicz@nvidia.com <Piotr Tarasiewicz>
parent
d788b63e
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
12 additions
and
17 deletions
+12
-17
examples/llm/components/prefill_worker.py
examples/llm/components/prefill_worker.py
+2
-3
examples/llm/components/worker.py
examples/llm/components/worker.py
+2
-2
examples/llm/configs/disagg.yaml
examples/llm/configs/disagg.yaml
+8
-12
No files found.
examples/llm/components/prefill_worker.py
View file @
41ec2338
...
...
@@ -25,7 +25,6 @@ from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args
,
)
from
vllm.inputs.data
import
TokensPrompt
from
vllm.logger
import
logger
as
vllm_logger
from
vllm.remote_prefill
import
RemotePrefillParams
,
RemotePrefillRequest
from
dynamo.sdk
import
(
...
...
@@ -76,7 +75,6 @@ class PrefillWorker:
if
self
.
engine_args
.
enforce_eager
is
not
True
:
print
(
"Prefill must be done eagerly, setting to True"
)
self
.
engine_args
.
enforce_eager
=
True
print
(
"PrefillWorker initialized"
)
@
async_on_start
async
def
async_init
(
self
):
...
...
@@ -93,6 +91,7 @@ class PrefillWorker:
await
self
.
_metadata_store
.
put
(
metadata
.
engine_id
,
metadata
)
task
=
asyncio
.
create_task
(
self
.
prefill_queue_handler
())
task
.
add_done_callback
(
lambda
_
:
print
(
"prefill queue handler created"
))
print
(
"PrefillWorker initialized"
)
async
def
prefill_queue_handler
(
self
):
print
(
"[DEBUG] prefill queue handler entered"
)
...
...
@@ -115,7 +114,7 @@ class PrefillWorker:
# need to test and check how much overhead it is
prefill_request
=
await
prefill_queue
.
dequeue_prefill_request
()
if
prefill_request
is
not
None
:
vllm_logger
.
info
(
f
"Dequeued prefill request:
{
prefill_request
}
"
)
print
(
f
"Dequeued prefill request:
{
prefill_request
.
request_id
}
"
)
async
for
_
in
self
.
generate
(
prefill_request
):
pass
...
...
examples/llm/components/worker.py
View file @
41ec2338
...
...
@@ -175,12 +175,12 @@ class VllmWorker:
is_remote_prefill
=
True
,
remote_prefill_request_callback
=
self
.
get_remote_prefill_request_callback
(),
)
vllm_logger
.
info
(
print
(
f
"Prefilling remotely for request
{
request
.
request_id
}
with length
{
len
(
request
.
engine_prompt
[
'prompt_token_ids'
])
}
"
)
else
:
remote_prefill_params
=
None
vllm_logger
.
info
(
print
(
f
"Prefilling locally for request
{
request
.
request_id
}
with length
{
len
(
request
.
engine_prompt
[
'prompt_token_ids'
])
}
"
)
...
...
examples/llm/configs/disagg.yaml
View file @
41ec2338
...
...
@@ -21,28 +21,24 @@ Frontend:
Processor
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size
:
64
max-model-len
:
16384
router
:
round-robin
VllmWorker
:
# vllm engine args
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
enforce-eager
:
true
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
block-size
:
64
max-model-len
:
16384
max-num-batched-tokens
:
16384
# dynamo args
remote-prefill
:
true
conditional-disagg
:
true
tensor-parallel-size
:
1
remote_prefill
:
true
max_local_prefill_length
:
10
max-local-prefill-length
:
10
# TODO - set all of these but model as default
PrefillWorker
:
# vllm engine args
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
enforce-eager
:
true
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
block-size
:
64
max-model-len
:
16384
max-num-batched-tokens
:
16384
# dynamo arg for local deployment
cuda-visible-device-offset
:
1
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment