[CI] Disable non-lazy string operation on logging (#4326)

Co-authored-by: Danny Guinther <dguinther@neuralmagic.com>

[CI] Disable non-lazy string operation on logging (#4326)
Co-authored-by: Danny Guinther <dguinther@neuralmagic.com>
a88081bf · SangBin Cho · GitHub · 2f30e7c7 · a88081bf · a88081bf
Unverified commit a88081bf, authored Apr 26, 2024 by SangBin Cho and committed by GitHub on Apr 26, 2024.
11 changed files
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -345,8 +345,8 @@ class LoRAModelManager:
        index, _ = first_free_slot
        self._active_loras[lora_id] = None
        lora_model = self._registered_loras[lora_id]
-        logger.debug(
+        logger.debug("Activating LoRA. int id: %d, slot index: %d",
-            f"Activating LoRA. int id: {lora_model.id}, slot index: {index}")
+                     lora_model.id, index)
        self.lora_index_to_id[index] = lora_model.id
        for module_name, module in self.modules.items():
            module_lora = lora_model.get_lora(module_name)
@@ -567,7 +567,7 @@ class LoRALRUCache(LRUCache[LoRAModel]):
        self.deactivate_lora_fn = deactivate_lora_fn
    def _on_remove(self, key: int, value: LoRAModel):
-        logger.debug(f"Removing LoRA. int id: {key}")
+        logger.debug("Removing LoRA. int id: %d", key)
        self.deactivate_lora_fn(key)
        return super()._on_remove(key, value)

--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -296,8 +296,8 @@ def get_moe_configs(E: int, N: int,
        os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name)
    if os.path.exists(config_file_path):
        with open(config_file_path) as f:
-            logger.info(
+            logger.info("Using configuration from %s for MoE layer.",
-                f"Using configuration from {config_file_path} for MoE layer.")
+                        config_file_path)
            # If a configuration has been found, return it
            return {int(key): val for key, val in json.load(f).items()}

--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -334,10 +334,10 @@ class TensorizerAgent:
        per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
        after_mem = get_mem_usage()
        deserializer.close()
-        logger.info(f"Deserialized {total_bytes_str} in "
+        logger.info("Deserialized %s in %0.2fs, %s/s", total_bytes_str,
-                    f"{end - start:0.2f}s, {per_second}/s")
+                    end - start, per_second)
-        logger.info(f"Memory usage before: {before_mem}")
+        logger.info("Memory usage before: %s", before_mem)
-        logger.info(f"Memory usage after: {after_mem}")
+        logger.info("Memory usage after: %s", after_mem)
        self._check_tensors_on_meta_device()
        self._resize_lora_embeddings()

--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -190,7 +190,7 @@ def download_weights_from_hf(model_name_or_path: str,
            allow_patterns = [pattern]
            break
-    logger.info(f"Using model weights format {allow_patterns}")
+    logger.info("Using model weights format %s", allow_patterns)
    # Use file lock to prevent multiple processes from
    # downloading the same model weights at the same time.
    with get_lock(model_name_or_path, cache_dir):
@@ -310,17 +310,17 @@ def kv_cache_scales_loader(
            return layer_scales_map.items()
    except FileNotFoundError:
-        logger.error(f"File or directory '{filename}' not found.")
+        logger.error("File or directory '%s' not found.", filename)
    except json.JSONDecodeError:
-        logger.error(f"Error decoding JSON in file '{filename}'.")
+        logger.error("Error decoding JSON in file '%s'.", filename)
    except Exception as e:
-        logger.error(f"An error occurred while reading '{filename}': {e}")
+        logger.error("An error occurred while reading '%s': %s", filename, e)
    # This section is reached if and only if any of the excepts are hit
    # Return an empty iterable (list) => no KV cache scales are loaded
    # which ultimately defaults to 1.0 scales
-    logger.warning("Defaulting to KV cache scaling factors = 1.0 "
+    logger.warning(
-                   f"for all layers in TP rank {tp_rank} "
+        "Defaulting to KV cache scaling factors = 1.0 for all "
-                   "as an error occurred during loading.")
+        "layers in TP rank %d as an error occurred during loading.", tp_rank)
    return []

--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -91,8 +91,8 @@ class ModelRegistry:
                    "ROCm for now.")
            if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS:
                logger.warning(
-                    f"Model architecture {model_arch} is partially supported "
+                    "Model architecture %s is partially supported by ROCm: %s",
-                    "by ROCm: " + _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch])
+                    model_arch, _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch])
        module_name, model_cls_name = _MODELS[model_arch]
        module = importlib.import_module(
@@ -107,9 +107,9 @@ class ModelRegistry:
    def register_model(model_arch: str, model_cls: Type[nn.Module]):
        if model_arch in _MODELS:
            logger.warning(
-                f"Model architecture {model_arch} is already registered, "
+                "Model architecture %s is already registered, and will be "
-                "and will be overwritten by the new model "
+                "overwritten by the new model class %s.", model_arch,
-                f"class {model_cls.__name__}.")
+                model_cls.__name__)
        global _OOT_MODELS
        _OOT_MODELS[model_arch] = model_cls

--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -55,10 +55,10 @@ def _get_gemma_act_fn(
                "in the config JSON file when it was initially released. "
                "Changing the activation function to approximate GeLU "
                "(`gelu_pytorch_tanh`). If you want to use the legacy "
-                f"`{hidden_act}`, edit the config JSON to set "
+                "`%s`, edit the config JSON to set "
-                f"`hidden_activation={hidden_act}` instead of `hidden_act`. "
+                "`hidden_activation=%s` instead of `hidden_act`. "
                "See https://github.com/huggingface/transformers/pull/29402 "
-                "for more details.")
+                "for more details.", hidden_act, hidden_act)
        return GeluAndMul(approximate="tanh")
    elif hidden_activation == "gelu_pytorch_tanh":
        return GeluAndMul(approximate="tanh")

--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -183,7 +183,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
            "speculative decoding "
            "requires non-None seq_group_metadata_list")
-        logger.info(f"spec_decode_worker.execute_model {num_lookahead_slots=}")
+        logger.info("spec_decode_worker.execute_model num_lookahead_slots=%d",
+                    num_lookahead_slots)
        # If no spec tokens, call the proposer and scorer workers normally.
        # Used for prefill.

--- a/vllm/transformers_utils/configs/dbrx.py
+++ b/vllm/transformers_utils/configs/dbrx.py
@@ -72,9 +72,10 @@ class DbrxAttentionConfig(PretrainedConfig):
            and config_dict["model_type"] != cls.model_type
        ):
            logger.warning(
-                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                "You are using a model of type %s to instantiate a model of "
-                + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+                "type %s. This is not supported for all configurations of "
-            )
+                "models and can yield errors.",
+                config_dict["model_type"], cls.model_type)
        return cls.from_dict(config_dict, **kwargs)
@@ -151,9 +152,9 @@ class DbrxFFNConfig(PretrainedConfig):
            and config_dict["model_type"] != cls.model_type
        ):
            logger.warning(
-                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                "You are using a model of type %s to instantiate a model of "
-                + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+                "type %s. This is not supported for all "
-            )
+                "configurations of models and can yield errors.", config_dict["model_type"], cls.model_type)
        return cls.from_dict(config_dict, **kwargs)

--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -138,9 +138,8 @@ def get_lora_tokenizer(lora_request: LoRARequest, *args,
        # No tokenizer was found in the LoRA folder,
        # use base model tokenizer
        logger.warning(
-            f"No tokenizer found in {lora_request.lora_local_path}, "
+            "No tokenizer found in %s, using base model tokenizer instead. "
-            "using base model tokenizer instead. "
+            "(Exception: %s)", lora_request.lora_local_path, e)
-            f"(Exception: {str(e)})")
        tokenizer = None
    return tokenizer

--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -289,8 +289,9 @@ def get_open_port() -> int:
 def update_environment_variables(envs: Dict[str, str]):
    for k, v in envs.items():
        if k in os.environ and os.environ[k] != v:
-            logger.warning(f"Overwriting environment variable {k} "
+            logger.warning(
-                           f"from '{os.environ[k]}' to '{v}'")
+                "Overwriting environment variable %s "
+                "from '%s' to '%s'", k, os.environ[k], v)
        os.environ[k] = v
@@ -310,11 +311,12 @@ def get_nvcc_cuda_version() -> Optional[Version]:
    if not cuda_home:
        cuda_home = '/usr/local/cuda'
        if os.path.isfile(cuda_home + '/bin/nvcc'):
-            logger.info(f'CUDA_HOME is not found in the environment. '
+            logger.info(
-                        f'Using {cuda_home} as CUDA_HOME.')
+                'CUDA_HOME is not found in the environment. '
+                'Using %s as CUDA_HOME.', cuda_home)
        else:
-            logger.warning(
+            logger.warning('Not found nvcc in %s. Skip cuda version check!',
-                f'Not found nvcc in {cuda_home}. Skip cuda version check!')
+                           cuda_home)
            return None
    nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"],
                                          universal_newlines=True)
@@ -599,8 +601,8 @@ def find_nccl_library():
    # manually load the nccl library
    if so_file:
        logger.info(
-            f"Found nccl from environment variable VLLM_NCCL_SO_PATH={so_file}"
+            "Found nccl from environment variable VLLM_NCCL_SO_PATH=%s",
-        )
+            so_file)
    else:
        if torch.version.cuda is not None:
            so_file = vllm_nccl_path or find_library("libnccl.so.2")
@@ -608,7 +610,7 @@ def find_nccl_library():
            so_file = find_library("librccl.so.1")
        else:
            raise ValueError("NCCL only supports CUDA and ROCm backends.")
-        logger.info(f"Found nccl from library {so_file}")
+        logger.info("Found nccl from library %s", so_file)
    return so_file

--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -170,8 +170,8 @@ class ModelRunner:
            )
        self.model_memory_usage = m.consumed_memory
-        logger.info(f"Loading model weights took "
+        logger.info("Loading model weights took %.4f GB",
-                    f"{self.model_memory_usage / float(2**30):.4f} GB")
+                    self.model_memory_usage / float(2**30))
        if self.lora_config:
            assert hasattr(self.model, "supported_lora_modules"
@@ -196,16 +196,17 @@ class ModelRunner:
                    self.model.load_kv_cache_scales(
                        self.model_config.quantization_param_path)
                else:
-                    raise RuntimeError("Using FP8 KV cache and scaling "
+                    raise RuntimeError(
-                                       "factors provided but model "
+                        "Using FP8 KV cache and scaling factors provided but "
-                                       f"{self.model.__class__} does not "
+                        f"model {self.model.__class__} does not support "
-                                       "support loading scaling factors.")
+                        "loading scaling factors.")
            else:
-                logger.warn("Using FP8 KV cache but no scaling factors "
+                logger.warning(
+                    "Using FP8 KV cache but no scaling factors "
                    "provided. Defaulting to scaling factors of 1.0. "
                    "This may lead to less accurate results!")
        elif self.model_config.quantization_param_path is not None:
-            logger.warn("KV cache scaling factors provided, "
+            logger.warning("KV cache scaling factors provided, "
                           "but the KV cache data type is not FP8. "
                           "KV cache scaling factors will not be used.")
@@ -1054,7 +1055,7 @@ class ModelRunner:
        end_time = time.perf_counter()
        elapsed_time = end_time - start_time
        # This usually takes < 10 seconds.
-        logger.info(f"Graph capturing finished in {elapsed_time:.0f} secs.")
+        logger.info("Graph capturing finished in %.0f secs.", elapsed_time)
    def __del__(self) -> None:
        # Delete the CUDA graphs before deleting the pynccl communicator.