Commit a092bcf4 authored by ptarasiewiczNV's avatar ptarasiewiczNV Committed by GitHub
Browse files

chore: Reduce conditional prefill logs (#121)

Co-authored-by: ptarasiewicz@nvidia.com <Piotr Tarasiewicz>
parent 1725c02d
...@@ -89,7 +89,7 @@ async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs): ...@@ -89,7 +89,7 @@ async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
else "vllm" else "vllm"
) )
vllm_logger.info( vllm_logger.info(
f"Prefill queue: {prefill_queue_nats_server}:{prefill_queue_stream_name}" "Prefill queue: %s:%s", prefill_queue_nats_server, prefill_queue_stream_name
) )
request_handler = RequestHandler(engine_client, metadata_store) request_handler = RequestHandler(engine_client, metadata_store)
...@@ -104,7 +104,9 @@ async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs): ...@@ -104,7 +104,9 @@ async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
# need to test and check how much overhead it is # need to test and check how much overhead it is
prefill_request = await prefill_queue.dequeue_prefill_request() prefill_request = await prefill_queue.dequeue_prefill_request()
if prefill_request is not None: if prefill_request is not None:
vllm_logger.info(f"Dequeued prefill request: {prefill_request}") vllm_logger.debug(
"Dequeued prefill request: %s", prefill_request.request_id
)
async for _ in request_handler.generate(prefill_request): async for _ in request_handler.generate(prefill_request):
pass pass
......
...@@ -60,7 +60,9 @@ class RequestHandler: ...@@ -60,7 +60,9 @@ class RequestHandler:
) )
self._prefill_queue_stream_name = model_name self._prefill_queue_stream_name = model_name
vllm_logger.info( vllm_logger.info(
f"Prefill queue: {self._prefill_queue_nats_server}:{self._prefill_queue_stream_name}" "Prefill queue: %s:%s",
self._prefill_queue_nats_server,
self._prefill_queue_stream_name,
) )
print("RequestHandler initialized") print("RequestHandler initialized")
...@@ -92,13 +94,17 @@ class RequestHandler: ...@@ -92,13 +94,17 @@ class RequestHandler:
is_remote_prefill=True, is_remote_prefill=True,
remote_prefill_request_callback=self.get_remote_prefill_request_callback(), remote_prefill_request_callback=self.get_remote_prefill_request_callback(),
) )
vllm_logger.info( vllm_logger.debug(
f"Prefilling remotely for request {request.request_id} with length {len(request.engine_prompt['prompt_token_ids'])}" "Prefilling remotely for request %s with length %s",
request.request_id,
len(request.engine_prompt["prompt_token_ids"]),
) )
else: else:
remote_prefill_params = None remote_prefill_params = None
vllm_logger.info( vllm_logger.debug(
f"Prefilling locally for request {request.request_id} with length {len(request.engine_prompt['prompt_token_ids'])}" "Prefilling locally for request %s with length %s",
request.request_id,
len(request.engine_prompt["prompt_token_ids"]),
) )
# rust HTTP requires Delta streaming # rust HTTP requires Delta streaming
...@@ -141,7 +147,7 @@ async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs): ...@@ -141,7 +147,7 @@ async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
# TODO: do we need these env vars? # TODO: do we need these env vars?
VLLM_WORKER_ID = endpoint.lease_id() VLLM_WORKER_ID = endpoint.lease_id()
os.environ["VLLM_WORKER_ID"] = str(VLLM_WORKER_ID) os.environ["VLLM_WORKER_ID"] = str(VLLM_WORKER_ID)
vllm_logger.info(f"Generate endpoint ID: {VLLM_WORKER_ID}") vllm_logger.info("Generate endpoint ID: %s", VLLM_WORKER_ID)
VLLM_KV_NAMESPACE = "dynamo-init" VLLM_KV_NAMESPACE = "dynamo-init"
os.environ["VLLM_KV_NAMESPACE"] = str(VLLM_KV_NAMESPACE) os.environ["VLLM_KV_NAMESPACE"] = str(VLLM_KV_NAMESPACE)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment