Commit a092bcf4 authored by ptarasiewiczNV, committed by GitHub
Browse files

chore: Reduce conditional prefill logs (#121)

Co-authored-by: Piotr Tarasiewicz <ptarasiewicz@nvidia.com>
parent 1725c02d
......@@ -89,7 +89,7 @@ async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
else "vllm"
)
vllm_logger.info(
-        f"Prefill queue: {prefill_queue_nats_server}:{prefill_queue_stream_name}"
+        "Prefill queue: %s:%s", prefill_queue_nats_server, prefill_queue_stream_name
)
request_handler = RequestHandler(engine_client, metadata_store)
......@@ -104,7 +104,9 @@ async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
# need to test and check how much overhead it is
prefill_request = await prefill_queue.dequeue_prefill_request()
if prefill_request is not None:
-                vllm_logger.info(f"Dequeued prefill request: {prefill_request}")
+                vllm_logger.debug(
+                    "Dequeued prefill request: %s", prefill_request.request_id
+                )
async for _ in request_handler.generate(prefill_request):
pass
......
......@@ -60,7 +60,9 @@ class RequestHandler:
)
self._prefill_queue_stream_name = model_name
vllm_logger.info(
-            f"Prefill queue: {self._prefill_queue_nats_server}:{self._prefill_queue_stream_name}"
+            "Prefill queue: %s:%s",
+            self._prefill_queue_nats_server,
+            self._prefill_queue_stream_name,
)
print("RequestHandler initialized")
......@@ -92,13 +94,17 @@ class RequestHandler:
is_remote_prefill=True,
remote_prefill_request_callback=self.get_remote_prefill_request_callback(),
)
-            vllm_logger.info(
-                f"Prefilling remotely for request {request.request_id} with length {len(request.engine_prompt['prompt_token_ids'])}"
+            vllm_logger.debug(
+                "Prefilling remotely for request %s with length %s",
+                request.request_id,
+                len(request.engine_prompt["prompt_token_ids"]),
)
else:
remote_prefill_params = None
-            vllm_logger.info(
-                f"Prefilling locally for request {request.request_id} with length {len(request.engine_prompt['prompt_token_ids'])}"
+            vllm_logger.debug(
+                "Prefilling locally for request %s with length %s",
+                request.request_id,
+                len(request.engine_prompt["prompt_token_ids"]),
)
# rust HTTP requires Delta streaming
......@@ -141,7 +147,7 @@ async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
# TODO: do we need these env vars?
VLLM_WORKER_ID = endpoint.lease_id()
os.environ["VLLM_WORKER_ID"] = str(VLLM_WORKER_ID)
-    vllm_logger.info(f"Generate endpoint ID: {VLLM_WORKER_ID}")
+    vllm_logger.info("Generate endpoint ID: %s", VLLM_WORKER_ID)
VLLM_KV_NAMESPACE = "dynamo-init"
os.environ["VLLM_KV_NAMESPACE"] = str(VLLM_KV_NAMESPACE)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment