"git@developer.sourcefind.cn:OpenDAS/dynamo.git" did not exist on "50aa390b11131a57218c3afb30a2c3aa044d314a"
Commit 41ec2338 authored by ptarasiewiczNV's avatar ptarasiewiczNV Committed by GitHub
Browse files

docs: Update dynamo serve disagg example (#212)

Co-authored-by: Piotr Tarasiewicz <ptarasiewicz@nvidia.com>
parent d788b63e
...@@ -25,7 +25,6 @@ from vllm.entrypoints.openai.api_server import ( ...@@ -25,7 +25,6 @@ from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args, build_async_engine_client_from_engine_args,
) )
from vllm.inputs.data import TokensPrompt from vllm.inputs.data import TokensPrompt
from vllm.logger import logger as vllm_logger
from vllm.remote_prefill import RemotePrefillParams, RemotePrefillRequest from vllm.remote_prefill import RemotePrefillParams, RemotePrefillRequest
from dynamo.sdk import ( from dynamo.sdk import (
...@@ -76,7 +75,6 @@ class PrefillWorker: ...@@ -76,7 +75,6 @@ class PrefillWorker:
if self.engine_args.enforce_eager is not True: if self.engine_args.enforce_eager is not True:
print("Prefill must be done eagerly, setting to True") print("Prefill must be done eagerly, setting to True")
self.engine_args.enforce_eager = True self.engine_args.enforce_eager = True
print("PrefillWorker initialized")
@async_on_start @async_on_start
async def async_init(self): async def async_init(self):
...@@ -93,6 +91,7 @@ class PrefillWorker: ...@@ -93,6 +91,7 @@ class PrefillWorker:
await self._metadata_store.put(metadata.engine_id, metadata) await self._metadata_store.put(metadata.engine_id, metadata)
task = asyncio.create_task(self.prefill_queue_handler()) task = asyncio.create_task(self.prefill_queue_handler())
task.add_done_callback(lambda _: print("prefill queue handler created")) task.add_done_callback(lambda _: print("prefill queue handler created"))
print("PrefillWorker initialized")
async def prefill_queue_handler(self): async def prefill_queue_handler(self):
print("[DEBUG] prefill queue handler entered") print("[DEBUG] prefill queue handler entered")
...@@ -115,7 +114,7 @@ class PrefillWorker: ...@@ -115,7 +114,7 @@ class PrefillWorker:
# need to test and check how much overhead it is # need to test and check how much overhead it is
prefill_request = await prefill_queue.dequeue_prefill_request() prefill_request = await prefill_queue.dequeue_prefill_request()
if prefill_request is not None: if prefill_request is not None:
vllm_logger.info(f"Dequeued prefill request: {prefill_request}") print(f"Dequeued prefill request: {prefill_request.request_id}")
async for _ in self.generate(prefill_request): async for _ in self.generate(prefill_request):
pass pass
......
...@@ -175,12 +175,12 @@ class VllmWorker: ...@@ -175,12 +175,12 @@ class VllmWorker:
is_remote_prefill=True, is_remote_prefill=True,
remote_prefill_request_callback=self.get_remote_prefill_request_callback(), remote_prefill_request_callback=self.get_remote_prefill_request_callback(),
) )
vllm_logger.info( print(
f"Prefilling remotely for request {request.request_id} with length {len(request.engine_prompt['prompt_token_ids'])}" f"Prefilling remotely for request {request.request_id} with length {len(request.engine_prompt['prompt_token_ids'])}"
) )
else: else:
remote_prefill_params = None remote_prefill_params = None
vllm_logger.info( print(
f"Prefilling locally for request {request.request_id} with length {len(request.engine_prompt['prompt_token_ids'])}" f"Prefilling locally for request {request.request_id} with length {len(request.engine_prompt['prompt_token_ids'])}"
) )
......
...@@ -21,28 +21,24 @@ Frontend: ...@@ -21,28 +21,24 @@ Frontend:
Processor: Processor:
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
block-size: 64
max-model-len: 16384
router: round-robin router: round-robin
VllmWorker: VllmWorker:
# vllm engine args
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
enforce-eager: true
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}' kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
block-size: 64
max-model-len: 16384 max-model-len: 16384
max-num-batched-tokens: 16384
# dynamo args
remote-prefill: true
conditional-disagg: true conditional-disagg: true
tensor-parallel-size: 1 max-local-prefill-length: 10
remote_prefill: true
max_local_prefill_length: 10
# TODO - set all of these but model as default
PrefillWorker: PrefillWorker:
# vllm engine args
model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B model: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
enforce-eager: true
kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}' kv-transfer-config: '{"kv_connector":"DynamoNixlConnector"}'
block-size: 64
max-model-len: 16384 max-model-len: 16384
max-num-batched-tokens: 16384
# dynamo arg for local deployment
cuda-visible-device-offset: 1 cuda-visible-device-offset: 1
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment