Commit 17827e1d authored by ptarasiewiczNV's avatar ptarasiewiczNV Committed by GitHub
Browse files

feat: Decode -> Prefill cached kv transfer (#340)

parent 405222ce
@@ -64,6 +64,12 @@ class PrefillWorker:
print("Prefill must be done eagerly, setting to True")
self.engine_args.enforce_eager = True
if self.engine_args.enable_prefix_caching is not False:
print(
"Prefix caching is not supported yet in prefill worker, setting to False"
)
self.engine_args.enable_prefix_caching = False
@async_on_start
async def async_init(self):
self._engine_context = build_async_engine_client_from_engine_args(
@@ -115,6 +121,7 @@ class PrefillWorker:
is_remote_decode=True,
decode_block_ids=request.block_ids,
decode_engine_id=request.engine_id,
decode_computed_block_ids=request.computed_block_ids,
)
# TODO check if metadata has changed
......
@@ -30,22 +30,25 @@ Router:
VllmWorker:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
enforce-eager: true
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
block-size: 64
max-model-len: 16384
max-num-batched-tokens: 16384
remote-prefill: true
conditional-disagg: true
max-local-prefill-length: 10
max-prefill-queue-size: 2
tensor-parallel-size: 1
router: kv
enable-prefix-caching: true
ServiceArgs:
workers: 1
resources:
gpu: 1
# TODO - set all of these but model as default
PrefillWorker:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
enforce-eager: true
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
block-size: 64
max-model-len: 16384
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment