[CI] Disable non-lazy string operation on logging (#4326)

Co-authored-by: Danny Guinther <dguinther@neuralmagic.com>

[CI] Disable non-lazy string operation on logging (#4326)
Co-authored-by: Danny Guinther <dguinther@neuralmagic.com>
a88081bf · SangBin Cho · GitHub · 2f30e7c7 · a88081bf · a88081bf
Unverified Commit a88081bf authored Apr 26, 2024 by SangBin Cho Committed by GitHub Apr 26, 2024
20 changed files
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -98,9 +98,10 @@ autodoc_mock_imports = [
 for mock_target in autodoc_mock_imports:
    if mock_target in sys.modules:
        logger.info(
-            f"Potentially problematic mock target ({mock_target}) found; "
+            "Potentially problematic mock target (%s) found; "
            "autodoc_mock_imports cannot mock modules that have already "
-            "been loaded into sys.modules when the sphinx build starts.")
+            "been loaded into sys.modules when the sphinx build starts.",
+            mock_target)


 class MockedClassDocumenter(autodoc.ClassDocumenter):

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,6 +32,7 @@ select = [
    "SIM",
    # isort
    # "I",
+    "G",
 ]
 ignore = [
    # star imports

--- a/setup.py
+++ b/setup.py
@@ -63,7 +63,7 @@ class cmake_build_ext(build_ext):
        num_jobs = os.environ.get("MAX_JOBS", None)
        if num_jobs is not None:
            num_jobs = int(num_jobs)
-            logger.info(f"Using MAX_JOBS={num_jobs} as the number of jobs.")
+            logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs)
        else:
            try:
                # os.sched_getaffinity() isn't universally available, so fall
@@ -81,8 +81,9 @@ class cmake_build_ext(build_ext):
            nvcc_threads = os.getenv("NVCC_THREADS", None)
            if nvcc_threads is not None:
                nvcc_threads = int(nvcc_threads)
-                logger.info(f"Using NVCC_THREADS={nvcc_threads} as the number"
-                            " of nvcc threads.")
+                logger.info(
+                    "Using NVCC_THREADS=%d as the number of nvcc threads.",
+                    nvcc_threads)
            else:
                nvcc_threads = 1
            num_jobs = max(1, num_jobs // nvcc_threads)

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -167,9 +167,9 @@ class ModelConfig:
                    f"supported in ROCm.")
            if self.quantization != "marlin":
                logger.warning(
-                    f"{self.quantization} quantization is not fully "
+                    "%s quantization is not fully "
                    "optimized yet. The speed can be slower than "
-                    "non-quantized models.")
+                    "non-quantized models.", self.quantization)

    def _verify_cuda_graph(self) -> None:
        if self.max_context_len_to_capture is None:
@@ -360,7 +360,7 @@ class CacheConfig:
        if cpu_memory_usage > 0.7 * total_cpu_memory:
            raise ValueError("Too large swap space. " + msg)
        elif cpu_memory_usage > 0.4 * total_cpu_memory:
-            logger.warning("Possibly too large swap space. " + msg)
+            logger.warning("Possibly too large swap space. %s", msg)


 @dataclass
@@ -898,8 +898,8 @@ class LoRAConfig:
                "awq", "gptq"
        ]:
            # TODO support marlin and squeezellm
-            logger.warning(f"{model_config.quantization} quantization is not "
-                           "tested with LoRA yet.")
+            logger.warning("%s quantization is not tested with LoRA yet.",
+                           model_config.quantization)

    def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
        if scheduler_config.max_num_batched_tokens > 65528:
@@ -1008,7 +1008,7 @@ def _get_and_verify_dtype(
            pass
        else:
            # Casting between float16 and bfloat16 is allowed with a warning.
-            logger.warning(f"Casting {config_dtype} to {torch_dtype}.")
+            logger.warning("Casting %s to %s.", config_dtype, torch_dtype)

    return torch_dtype

@@ -1051,8 +1051,8 @@ def _get_and_verify_max_len(
        logger.warning(
            "The model's config.json does not contain any of the following "
            "keys to determine the original maximum length of the model: "
-            f"{possible_keys}. Assuming the model's maximum length is "
-            f"{default_max_len}.")
+            "%d. Assuming the model's maximum length is %d.", possible_keys,
+            default_max_len)
        derived_max_model_len = default_max_len

    rope_scaling = getattr(hf_config, "rope_scaling", None)

--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -617,8 +617,9 @@ class Scheduler:

            if num_new_tokens > self.prompt_limit:
                logger.warning(
-                    f"Input prompt ({num_new_tokens} tokens) is too long"
-                    f" and exceeds limit of {self.prompt_limit}")
+                    "Input prompt (%d tokens) is too long"
+                    " and exceeds limit of %d", num_new_tokens,
+                    self.prompt_limit)
                for seq in waiting_seqs:
                    seq.status = SequenceStatus.FINISHED_IGNORED
                ignored_seq_groups.append(seq_group)
@@ -631,8 +632,9 @@ class Scheduler:
                break
            elif can_allocate == AllocStatus.NEVER:
                logger.warning(
-                    f"Input prompt ({num_new_tokens} tokens) is too long"
-                    f" and exceeds the capacity of block_manager")
+                    "Input prompt (%d tokens) is too long"
+                    " and exceeds the capacity of block_manager",
+                    num_new_tokens)
                for seq in waiting_seqs:
                    seq.status = SequenceStatus.FINISHED_IGNORED
                ignored_seq_groups.append(seq_group)

--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -37,7 +37,7 @@ def init_custom_ar() -> None:
        return

    if world_size not in _SUPPORTED_WORLD_SIZES:
-        logger.warn(
+        logger.warning(
            "Custom allreduce is disabled due to an unsupported world size: "
            "%d. Supported world sizes: %s. To silence this warning, specify"
            " disable_custom_all_reduce=True explicitly.", world_size,
@@ -47,7 +47,7 @@ def init_custom_ar() -> None:
    # note: num dev can be larger than world_size if we're only using
    # first few GPUs
    if num_dev < world_size:
-        logger.warn(
+        logger.warning(
            "Cannot test GPU P2P because not all GPUs are visible to the "
            "current process. This might be the case if 'CUDA_VISIBLE_DEVICES'"
            " is set.")
@@ -62,7 +62,7 @@ def init_custom_ar() -> None:
    # this checks hardware and driver support for NVLink
    full_nvlink = _is_full_nvlink(device_ids)
    if world_size > 2 and not full_nvlink:
-        logger.warn(
+        logger.warning(
            "Custom allreduce is disabled because it's not supported on more"
            " than two PCIe-only GPUs. To silence this warning, specify"
            " disable_custom_all_reduce=True explicitly.")
@@ -71,7 +71,7 @@ def init_custom_ar() -> None:
    # this is expensive to compute at the first time
    # then we cache the result
    if not _can_p2p(rank, world_size):
-        logger.warn(
+        logger.warning(
            "Custom allreduce is disabled because your platform lacks GPU P2P"
            " capability or P2P test failed. To silence this warning, specify"
            " disable_custom_all_reduce=True explicitly.")

--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -43,15 +43,16 @@ try:
    nccl = ctypes.CDLL(so_file)
 except Exception as e:
    logger.error(
-        f"Failed to load NCCL library from {so_file} ."
+        "Failed to load NCCL library from %s ."
        "It is expected if you are not running on NVIDIA/AMD GPUs."
        "Otherwise, the nccl library might not exist, be corrupted "
-        f"or it does not support the current platform {platform.platform()}."
-        f"One solution is to download libnccl2 version 2.18 from "
-        f"https://developer.download.nvidia.com/compute/cuda/repos/ "
-        f"and extract the libnccl.so.2 file. If you already have the "
-        f"library, please set the environment variable VLLM_NCCL_SO_PATH"
-        " to point to the correct nccl library path.")
+        "or it does not support the current platform %s."
+        "One solution is to download libnccl2 version 2.18 from "
+        "https://developer.download.nvidia.com/compute/cuda/repos/ "
+        "and extract the libnccl.so.2 file. If you already have the "
+        "library, please set the environment variable VLLM_NCCL_SO_PATH"
+        " to point to the correct nccl library path.", so_file,
+        platform.platform())
    raise e

 # === export types and functions from nccl to Python ===

--- a/vllm/distributed/device_communicators/pynccl_utils.py
+++ b/vllm/distributed/device_communicators/pynccl_utils.py
@@ -14,7 +14,7 @@ try:
 except Exception as e:
    # in non-NVIDIA environments, we can't import the nccl module
    # e.g. when running on machines with AMD GPUs
-    logger.info(f"Failed to import NCCL library: {e}")
+    logger.info("Failed to import NCCL library: %s", e)
    logger.info("It is expected if you are not running on NVIDIA GPUs.")
    pass

@@ -40,7 +40,7 @@ def set_pynccl_stream(stream: torch.cuda.Stream):
 def init_process_group(group: Optional[ProcessGroup] = None) -> None:
    assert not is_initialized()
    global comm
-    logger.info(f"vLLM is using nccl=={ncclGetVersion()}")
+    logger.info("vLLM is using nccl==%s", ncclGetVersion())
    comm = NCCLCommunicator(group=group)



--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -57,8 +57,10 @@ def init_distributed_environment(
    local_rank: int = -1,
    backend: str = "nccl",
 ):
-    logger.debug(f"{world_size=} {rank=} {local_rank=} "
-                 f"{distributed_init_method=} {backend=}")
+    logger.debug(
+        "world_size=%d rank=%d local_rank=%d "
+        "distributed_init_method=%s backend=%s", world_size, rank, local_rank,
+        distributed_init_method, backend)
    if not torch.distributed.is_initialized():
        assert distributed_init_method is not None, (
            "distributed_init_method must be provided when initializing "

--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -112,7 +112,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
        and (not os.path.exists(path)):
        # only the local master process (with local_rank == 0) can
        #  enter this block to calculate the cache
-        logger.info(f"generating GPU P2P access cache for in {path}")
+        logger.info("generating GPU P2P access cache for in %s", path)
        cache = {}
        for _i in range(num_dev):
            for _j in range(num_dev):
@@ -126,7 +126,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
    if is_distributed:
        cpu_world_group = get_cpu_world_group()
        dist.barrier(cpu_world_group)
-    logger.info(f"reading GPU P2P access cache from {path}")
+    logger.info("reading GPU P2P access cache from %s", path)
    with open(path, "r") as f:
        cache = json.load(f)
    _gpu_p2p_access_cache = cache

--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -117,7 +117,7 @@ class RequestTracker:
        self._request_streams[request_id].put(request_output)
        if request_output.finished:
            if verbose:
-                logger.info(f"Finished request {request_id}.")
+                logger.info("Finished request %s.", request_id)
            self.abort_request(request_id)

    def process_exception(self,
@@ -128,7 +128,7 @@ class RequestTracker:
        """Propagate an exception from the engine."""
        self._request_streams[request_id].put(exception)
        if verbose:
-            logger.info(f"Finished request {request_id}.")
+            logger.info("Finished request %s.", request_id)
        self.abort_request(request_id)

    def add_request(self, request_id: str,
@@ -151,7 +151,7 @@ class RequestTracker:
    def abort_request(self, request_id: str, *, verbose: bool = False) -> None:
        """Abort a request during next background loop iteration."""
        if verbose:
-            logger.info(f"Aborted request {request_id}.")
+            logger.info("Aborted request %s.", request_id)

        self._finished_requests.put_nowait(request_id)

@@ -521,11 +521,11 @@ class AsyncLLMEngine:
                if shortened_token_ids is not None:
                    shortened_token_ids = shortened_token_ids[:self.
                                                              max_log_len]
-            logger.info(f"Received request {request_id}: "
-                        f"prompt: {shortened_prompt!r}, "
-                        f"sampling_params: {sampling_params}, "
-                        f"prompt_token_ids: {shortened_token_ids}, "
-                        f"lora_request: {lora_request}.")
+            logger.info(
+                "Received request %s: prompt: %r, "
+                "sampling_params: %s, prompt_token_ids: %s, "
+                "lora_request: %s.", request_id, shortened_prompt,
+                sampling_params, shortened_token_ids, lora_request)

        if not self.is_running:
            if self.start_engine_loop:
@@ -717,4 +717,4 @@ class AsyncLLMEngine:
                raise RuntimeError("Engine is dead.") from e
        else:
            await self.engine.check_health_async()
-        logger.debug(f"Health check took {time.perf_counter()-t}s")
+        logger.debug("Health check took %fs", time.perf_counter() - t)
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -96,29 +96,38 @@ class LLMEngine:
        usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
    ) -> None:
        logger.info(
-            f"Initializing an LLM engine (v{vllm.__version__}) with config: "
-            f"model={model_config.model!r}, "
-            f"speculative_config={speculative_config!r}, "
-            f"tokenizer={model_config.tokenizer!r}, "
-            f"skip_tokenizer_init={model_config.skip_tokenizer_init}, "
-            f"tokenizer_mode={model_config.tokenizer_mode}, "
-            f"revision={model_config.revision}, "
-            f"tokenizer_revision={model_config.tokenizer_revision}, "
-            f"trust_remote_code={model_config.trust_remote_code}, "
-            f"dtype={model_config.dtype}, "
-            f"max_seq_len={model_config.max_model_len}, "
-            f"download_dir={load_config.download_dir!r}, "
-            f"load_format={load_config.load_format}, "
-            f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
-            f"disable_custom_all_reduce="
-            f"{parallel_config.disable_custom_all_reduce}, "
-            f"quantization={model_config.quantization}, "
-            f"enforce_eager={model_config.enforce_eager}, "
-            f"kv_cache_dtype={cache_config.cache_dtype}, "
-            f"quantization_param_path={model_config.quantization_param_path}, "
-            f"device_config={device_config.device}, "
-            f"decoding_config={decoding_config!r}, "
-            f"seed={model_config.seed})")
+            "Initializing an LLM engine (v%s) with config: "
+            "model=%r, speculative_config=%r, tokenizer=%r, "
+            "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, "
+            "tokenizer_revision=%s, trust_remote_code=%s, dtype=%s, "
+            "max_seq_len=%d, download_dir=%r, load_format=%s, "
+            "tensor_parallel_size=%d, disable_custom_all_reduce=%s"
+            "quantization=%s, enforce_eager=%s, kv_cache_dtype=%s, "
+            "quantization_param_path=%s, device_config=%s, "
+            "decoding_config=%r, seed=%d)",
+            vllm.__version__,
+            model_config.model,
+            speculative_config,
+            model_config.tokenizer,
+            model_config.skip_tokenizer_init,
+            model_config.tokenizer_mode,
+            model_config.revision,
+            model_config.tokenizer_revision,
+            model_config.trust_remote_code,
+            model_config.dtype,
+            model_config.max_model_len,
+            load_config.download_dir,
+            load_config.load_format,
+            parallel_config.tensor_parallel_size,
+            parallel_config.disable_custom_all_reduce,
+            model_config.quantization,
+            model_config.enforce_eager,
+            cache_config.cache_dtype,
+            model_config.quantization_param_path,
+            device_config.device,
+            decoding_config,
+            model_config.seed,
+        )
        # TODO(woosuk): Print more configs in debug mode.

        self.model_config = model_config
@@ -237,8 +246,10 @@ class LLMEngine:

        if self.cache_config.num_gpu_blocks_override is not None:
            num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override
-            logger.info(f"Overriding {num_gpu_blocks=} with "
-                        f"{num_gpu_blocks_override=}")
+            logger.info(
+                "Overriding num_gpu_blocks=%d with "
+                "num_gpu_blocks_override=%d", num_gpu_blocks,
+                num_gpu_blocks_override)
            num_gpu_blocks = num_gpu_blocks_override

        self.cache_config.num_gpu_blocks = num_gpu_blocks

--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -227,14 +227,19 @@ class StatLogger:

            # Log to stdout.
            logger.info(
-                f"Avg prompt throughput: {prompt_throughput:.1f} tokens/s, "
-                f"Avg generation throughput: "
-                f"{generation_throughput:.1f} tokens/s, "
-                f"Running: {stats.num_running} reqs, "
-                f"Swapped: {stats.num_swapped} reqs, "
-                f"Pending: {stats.num_waiting} reqs, "
-                f"GPU KV cache usage: {stats.gpu_cache_usage * 100:.1f}%, "
-                f"CPU KV cache usage: {stats.cpu_cache_usage * 100:.1f}%")
+                "Avg prompt throughput: %.1f tokens/s, "
+                "Avg generation throughput: %.1f tokens/s, "
+                "Running: %d reqs, Swapped: %d reqs, "
+                "Pending: %d reqs, GPU KV cache usage: %.1f%, "
+                "CPU KV cache usage: %.1f%",
+                prompt_throughput,
+                generation_throughput,
+                stats.num_running,
+                stats.num_swapped,
+                stats.num_waiting,
+                stats.gpu_cache_usage * 100,
+                stats.cpu_cache_usage * 100,
+            )

            # Reset tracked stats for next interval.
            self.num_prompt_tokens = []

--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -148,8 +148,8 @@ if __name__ == "__main__":
            raise ValueError(f"Invalid middleware {middleware}. "
                             f"Must be a function or a class.")

-    logger.info(f"vLLM API server version {vllm.__version__}")
-    logger.info(f"args: {args}")
+    logger.info("vLLM API server version %s", vllm.__version__)
+    logger.info("args: %s", args)

    if args.served_model_name is not None:
        served_model_names = args.served_model_name

--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -57,8 +57,7 @@ class OpenAIServingChat(OpenAIServing):
                tokenize=False,
                add_generation_prompt=request.add_generation_prompt)
        except Exception as e:
-            logger.error(
-                f"Error in applying chat template from request: {str(e)}")
+            logger.error("Error in applying chat template from request: %s", e)
            return self.create_error_response(str(e))

        request_id = f"cmpl-{random_uuid()}"
@@ -338,11 +337,11 @@ class OpenAIServingChat(OpenAIServing):
                tokenizer.chat_template = codecs.decode(
                    chat_template, "unicode_escape")

-            logger.info(
-                f"Using supplied chat template:\n{tokenizer.chat_template}")
+            logger.info("Using supplied chat template:\n%s",
+                        tokenizer.chat_template)
        elif tokenizer.chat_template is not None:
-            logger.info(
-                f"Using default chat template:\n{tokenizer.chat_template}")
+            logger.info("Using default chat template:\n%s",
+                        tokenizer.chat_template)
        else:
            logger.warning(
                "No chat template provided. Chat API will not work.")
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -69,7 +69,7 @@ class CPUExecutor(ExecutorBase):
        # NOTE: `cpu block` for CPU backend is located on CPU memory but is
        # referred as `gpu block`. Because we want to reuse the existing block
        # management procedure.
-        logger.info(f"# CPU blocks: {num_gpu_blocks}")
+        logger.info("# CPU blocks: %d", num_gpu_blocks)
        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)

    def execute_model(self,

--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -116,8 +116,8 @@ class GPUExecutor(ExecutorBase):
        # NOTE: This is logged in the executor because there can be >1 worker
        # with other executors. We could log in the engine level, but work
        # remains to abstract away the device for non-GPU configurations.
-        logger.info(f"# GPU blocks: {num_gpu_blocks}, "
-                    f"# CPU blocks: {num_cpu_blocks}")
+        logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
+                    num_cpu_blocks)

        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)


--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -214,8 +214,8 @@ class RayGPUExecutor(ExecutorBase):
        # NOTE: We log here to avoid multiple logs when number of workers is
        # greater than one. We could log in the engine, but not all executors
        # have GPUs.
-        logger.info(f"# GPU blocks: {num_gpu_blocks}, "
-                    f"# CPU blocks: {num_cpu_blocks}")
+        logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
+                    num_cpu_blocks)

        self.cache_config.num_gpu_blocks = num_gpu_blocks
        self.cache_config.num_cpu_blocks = num_cpu_blocks

--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -43,9 +43,9 @@ try:
            return output

 except ImportError as e:
-    logger.warning(f"Failed to import Ray with {e!r}. "
-                   "For distributed inference, please install Ray with "
-                   "`pip install ray`.")
+    logger.warning(
+        "Failed to import Ray with %r. For distributed inference, "
+        "please install Ray with `pip install ray`.", e)
    ray = None  # type: ignore
    RayWorkerWrapper = None  # type: ignore


--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -126,7 +126,7 @@ def enable_trace_function_call(log_file_path: str,
        "VLLM_TRACE_FUNCTION is enabled. It will record every"
        " function executed by Python. This will slow down the code. It "
        "is suggested to be used for debugging hang or crashes only.")
-    logger.info(f"Trace frame log is saved to {log_file_path}")
+    logger.info("Trace frame log is saved to %s", log_file_path)
    if root_dir is None:
        # by default, this is the vllm root directory
        root_dir = os.path.dirname(os.path.dirname(__file__))