Commit 17827e1d authored by ptarasiewiczNV's avatar ptarasiewiczNV Committed by GitHub
Browse files

feat: Decode -> Prefill cached kv transfer (#340)

parent 405222ce
......@@ -64,6 +64,12 @@ class PrefillWorker:
print("Prefill must be done eagerly, setting to True")
self.engine_args.enforce_eager = True
if self.engine_args.enable_prefix_caching is not False:
print(
"Prefix caching is not supported yet in prefill worker, setting to False"
)
self.engine_args.enable_prefix_caching = False
@async_on_start
async def async_init(self):
self._engine_context = build_async_engine_client_from_engine_args(
......@@ -115,6 +121,7 @@ class PrefillWorker:
is_remote_decode=True,
decode_block_ids=request.block_ids,
decode_engine_id=request.engine_id,
decode_computed_block_ids=request.computed_block_ids,
)
# TODO check if metadata has changed
......
......@@ -30,22 +30,25 @@ Router:
VllmWorker:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
enforce-eager: true
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
block-size: 64
max-model-len: 16384
max-num-batched-tokens: 16384
remote-prefill: true
conditional-disagg: true
max-local-prefill-length: 10
max-prefill-queue-size: 2
tensor-parallel-size: 1
router: kv
enable-prefix-caching: true
ServiceArgs:
workers: 1
resources:
gpu: 1
# TODO - set all of these but model as default
PrefillWorker:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
enforce-eager: true
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
block-size: 64
max-model-len: 16384
......
Markdown is supported
Attach a file by drag &amp; drop or click to upload.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment