Commit a092bcf4 authored by ptarasiewiczNV, committed by GitHub
Browse files

chore: Reduce conditional prefill logs (#121)

Co-authored-by: Piotr Tarasiewicz <ptarasiewicz@nvidia.com>
parent 1725c02d
......@@ -89,7 +89,7 @@ async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
else "vllm"
)
vllm_logger.info(
-        f"Prefill queue: {prefill_queue_nats_server}:{prefill_queue_stream_name}"
+        "Prefill queue: %s:%s", prefill_queue_nats_server, prefill_queue_stream_name
)
request_handler = RequestHandler(engine_client, metadata_store)
......@@ -104,7 +104,9 @@ async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
# need to test and check how much overhead it is
prefill_request = await prefill_queue.dequeue_prefill_request()
if prefill_request is not None:
-                vllm_logger.info(f"Dequeued prefill request: {prefill_request}")
+                vllm_logger.debug(
+                    "Dequeued prefill request: %s", prefill_request.request_id
+                )
async for _ in request_handler.generate(prefill_request):
pass
......
......@@ -60,7 +60,9 @@ class RequestHandler:
)
self._prefill_queue_stream_name = model_name
vllm_logger.info(
-            f"Prefill queue: {self._prefill_queue_nats_server}:{self._prefill_queue_stream_name}"
+            "Prefill queue: %s:%s",
+            self._prefill_queue_nats_server,
+            self._prefill_queue_stream_name,
)
print("RequestHandler initialized")
......@@ -92,13 +94,17 @@ class RequestHandler:
is_remote_prefill=True,
remote_prefill_request_callback=self.get_remote_prefill_request_callback(),
)
-            vllm_logger.info(
-                f"Prefilling remotely for request {request.request_id} with length {len(request.engine_prompt['prompt_token_ids'])}"
+            vllm_logger.debug(
+                "Prefilling remotely for request %s with length %s",
+                request.request_id,
+                len(request.engine_prompt["prompt_token_ids"]),
)
else:
remote_prefill_params = None
-            vllm_logger.info(
-                f"Prefilling locally for request {request.request_id} with length {len(request.engine_prompt['prompt_token_ids'])}"
+            vllm_logger.debug(
+                "Prefilling locally for request %s with length %s",
+                request.request_id,
+                len(request.engine_prompt["prompt_token_ids"]),
)
# rust HTTP requires Delta streaming
......@@ -141,7 +147,7 @@ async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
# TODO: do we need these env vars?
VLLM_WORKER_ID = endpoint.lease_id()
os.environ["VLLM_WORKER_ID"] = str(VLLM_WORKER_ID)
-    vllm_logger.info(f"Generate endpoint ID: {VLLM_WORKER_ID}")
+    vllm_logger.info("Generate endpoint ID: %s", VLLM_WORKER_ID)
VLLM_KV_NAMESPACE = "dynamo-init"
os.environ["VLLM_KV_NAMESPACE"] = str(VLLM_KV_NAMESPACE)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment