Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
17827e1d
Commit
17827e1d
authored
Mar 26, 2025
by
ptarasiewiczNV
Committed by
GitHub
Mar 26, 2025
Browse files
feat: Decode -> Prefill cached kv transfer (#340)
parent
405222ce
Changes
3
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
408 additions
and
248 deletions
+408
-248
container/deps/vllm/vllm_v0.7.2-dynamo-kv-disagg-patch.patch
container/deps/vllm/vllm_v0.7.2-dynamo-kv-disagg-patch.patch
+396
-246
examples/llm/components/prefill_worker.py
examples/llm/components/prefill_worker.py
+7
-0
examples/llm/configs/disagg_router.yaml
examples/llm/configs/disagg_router.yaml
+5
-2
No files found.
container/deps/vllm/vllm_v0.7.2-dynamo-kv-disagg-patch.patch
View file @
17827e1d
This diff is collapsed.
Click to expand it.
examples/llm/components/prefill_worker.py
View file @
17827e1d
...
@@ -64,6 +64,12 @@ class PrefillWorker:
...
@@ -64,6 +64,12 @@ class PrefillWorker:
print
(
"Prefill must be done eagerly, setting to True"
)
print
(
"Prefill must be done eagerly, setting to True"
)
self
.
engine_args
.
enforce_eager
=
True
self
.
engine_args
.
enforce_eager
=
True
if
self
.
engine_args
.
enable_prefix_caching
is
not
False
:
print
(
"Prefix caching is not supported yet in prefill worker, setting to False"
)
self
.
engine_args
.
enable_prefix_caching
=
False
@
async_on_start
@
async_on_start
async
def
async_init
(
self
):
async
def
async_init
(
self
):
self
.
_engine_context
=
build_async_engine_client_from_engine_args
(
self
.
_engine_context
=
build_async_engine_client_from_engine_args
(
...
@@ -115,6 +121,7 @@ class PrefillWorker:
...
@@ -115,6 +121,7 @@ class PrefillWorker:
is_remote_decode
=
True
,
is_remote_decode
=
True
,
decode_block_ids
=
request
.
block_ids
,
decode_block_ids
=
request
.
block_ids
,
decode_engine_id
=
request
.
engine_id
,
decode_engine_id
=
request
.
engine_id
,
decode_computed_block_ids
=
request
.
computed_block_ids
,
)
)
# TODO check if metadata has changed
# TODO check if metadata has changed
...
...
examples/llm/configs/disagg_router.yaml
View file @
17827e1d
...
@@ -30,22 +30,25 @@ Router:
...
@@ -30,22 +30,25 @@ Router:
VllmWorker
:
VllmWorker
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
enforce-eager
:
true
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
block-size
:
64
block-size
:
64
max-model-len
:
16384
max-model-len
:
16384
max-num-batched-tokens
:
16384
max-num-batched-tokens
:
16384
remote-prefill
:
true
conditional-disagg
:
true
conditional-disagg
:
true
max-local-prefill-length
:
10
max-local-prefill-length
:
10
max-prefill-queue-size
:
2
max-prefill-queue-size
:
2
tensor-parallel-size
:
1
tensor-parallel-size
:
1
router
:
kv
router
:
kv
enable-prefix-caching
:
true
enable-prefix-caching
:
true
ServiceArgs
:
workers
:
1
resources
:
gpu
:
1
# TODO - set all of these but model as default
# TODO - set all of these but model as default
PrefillWorker
:
PrefillWorker
:
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
model
:
deepseek-ai/DeepSeek-R1-Distill-Llama-8B
enforce-eager
:
true
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
kv-transfer-config
:
'
{"kv_connector":"DynamoNixlConnector"}'
block-size
:
64
block-size
:
64
max-model-len
:
16384
max-model-len
:
16384
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment