Unverified commit a88081bf, authored by SangBin Cho, committed by GitHub
Browse files

[CI] Disable non-lazy string operation on logging (#4326)


Co-authored-by: Danny Guinther <dguinther@neuralmagic.com>
parent 2f30e7c7
...@@ -345,8 +345,8 @@ class LoRAModelManager: ...@@ -345,8 +345,8 @@ class LoRAModelManager:
index, _ = first_free_slot index, _ = first_free_slot
self._active_loras[lora_id] = None self._active_loras[lora_id] = None
lora_model = self._registered_loras[lora_id] lora_model = self._registered_loras[lora_id]
logger.debug( logger.debug("Activating LoRA. int id: %d, slot index: %d",
f"Activating LoRA. int id: {lora_model.id}, slot index: {index}") lora_model.id, index)
self.lora_index_to_id[index] = lora_model.id self.lora_index_to_id[index] = lora_model.id
for module_name, module in self.modules.items(): for module_name, module in self.modules.items():
module_lora = lora_model.get_lora(module_name) module_lora = lora_model.get_lora(module_name)
...@@ -567,7 +567,7 @@ class LoRALRUCache(LRUCache[LoRAModel]): ...@@ -567,7 +567,7 @@ class LoRALRUCache(LRUCache[LoRAModel]):
self.deactivate_lora_fn = deactivate_lora_fn self.deactivate_lora_fn = deactivate_lora_fn
def _on_remove(self, key: int, value: LoRAModel): def _on_remove(self, key: int, value: LoRAModel):
logger.debug(f"Removing LoRA. int id: {key}") logger.debug("Removing LoRA. int id: %d", key)
self.deactivate_lora_fn(key) self.deactivate_lora_fn(key)
return super()._on_remove(key, value) return super()._on_remove(key, value)
......
...@@ -296,8 +296,8 @@ def get_moe_configs(E: int, N: int, ...@@ -296,8 +296,8 @@ def get_moe_configs(E: int, N: int,
os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name) os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name)
if os.path.exists(config_file_path): if os.path.exists(config_file_path):
with open(config_file_path) as f: with open(config_file_path) as f:
logger.info( logger.info("Using configuration from %s for MoE layer.",
f"Using configuration from {config_file_path} for MoE layer.") config_file_path)
# If a configuration has been found, return it # If a configuration has been found, return it
return {int(key): val for key, val in json.load(f).items()} return {int(key): val for key, val in json.load(f).items()}
......
...@@ -334,10 +334,10 @@ class TensorizerAgent: ...@@ -334,10 +334,10 @@ class TensorizerAgent:
per_second = convert_bytes(deserializer.total_tensor_bytes / duration) per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
after_mem = get_mem_usage() after_mem = get_mem_usage()
deserializer.close() deserializer.close()
logger.info(f"Deserialized {total_bytes_str} in " logger.info("Deserialized %s in %0.2fs, %f/s", total_bytes_str,
f"{end - start:0.2f}s, {per_second}/s") end - start, per_second)
logger.info(f"Memory usage before: {before_mem}") logger.info("Memory usage before: %s", before_mem)
logger.info(f"Memory usage after: {after_mem}") logger.info("Memory usage after: %s", after_mem)
self._check_tensors_on_meta_device() self._check_tensors_on_meta_device()
self._resize_lora_embeddings() self._resize_lora_embeddings()
......
...@@ -190,7 +190,7 @@ def download_weights_from_hf(model_name_or_path: str, ...@@ -190,7 +190,7 @@ def download_weights_from_hf(model_name_or_path: str,
allow_patterns = [pattern] allow_patterns = [pattern]
break break
logger.info(f"Using model weights format {allow_patterns}") logger.info("Using model weights format %s", allow_patterns)
# Use file lock to prevent multiple processes from # Use file lock to prevent multiple processes from
# downloading the same model weights at the same time. # downloading the same model weights at the same time.
with get_lock(model_name_or_path, cache_dir): with get_lock(model_name_or_path, cache_dir):
...@@ -310,17 +310,17 @@ def kv_cache_scales_loader( ...@@ -310,17 +310,17 @@ def kv_cache_scales_loader(
return layer_scales_map.items() return layer_scales_map.items()
except FileNotFoundError: except FileNotFoundError:
logger.error(f"File or directory '{filename}' not found.") logger.error("File or directory '%s' not found.", filename)
except json.JSONDecodeError: except json.JSONDecodeError:
logger.error(f"Error decoding JSON in file '{filename}'.") logger.error("Error decoding JSON in file '%s'.", filename)
except Exception as e: except Exception as e:
logger.error(f"An error occurred while reading '{filename}': {e}") logger.error("An error occurred while reading '%s': %s", filename, e)
# This section is reached if and only if any of the excepts are hit # This section is reached if and only if any of the excepts are hit
# Return an empty iterable (list) => no KV cache scales are loaded # Return an empty iterable (list) => no KV cache scales are loaded
# which ultimately defaults to 1.0 scales # which ultimately defaults to 1.0 scales
logger.warning("Defaulting to KV cache scaling factors = 1.0 " logger.warning(
f"for all layers in TP rank {tp_rank} " "Defaulting to KV cache scaling factors = 1.0 for all "
"as an error occurred during loading.") "layers in TP rank %d as an error occurred during loading.", tp_rank)
return [] return []
......
...@@ -91,8 +91,8 @@ class ModelRegistry: ...@@ -91,8 +91,8 @@ class ModelRegistry:
"ROCm for now.") "ROCm for now.")
if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS:
logger.warning( logger.warning(
f"Model architecture {model_arch} is partially supported " "Model architecture %s is partially supported by ROCm: %s",
"by ROCm: " + _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch]) model_arch, _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch])
module_name, model_cls_name = _MODELS[model_arch] module_name, model_cls_name = _MODELS[model_arch]
module = importlib.import_module( module = importlib.import_module(
...@@ -107,9 +107,9 @@ class ModelRegistry: ...@@ -107,9 +107,9 @@ class ModelRegistry:
def register_model(model_arch: str, model_cls: Type[nn.Module]): def register_model(model_arch: str, model_cls: Type[nn.Module]):
if model_arch in _MODELS: if model_arch in _MODELS:
logger.warning( logger.warning(
f"Model architecture {model_arch} is already registered, " "Model architecture %s is already registered, and will be "
"and will be overwritten by the new model " "overwritten by the new model class %s.", model_arch,
f"class {model_cls.__name__}.") model_cls.__name__)
global _OOT_MODELS global _OOT_MODELS
_OOT_MODELS[model_arch] = model_cls _OOT_MODELS[model_arch] = model_cls
......
...@@ -55,10 +55,10 @@ def _get_gemma_act_fn( ...@@ -55,10 +55,10 @@ def _get_gemma_act_fn(
"in the config JSON file when it was initially released. " "in the config JSON file when it was initially released. "
"Changing the activation function to approximate GeLU " "Changing the activation function to approximate GeLU "
"(`gelu_pytorch_tanh`). If you want to use the legacy " "(`gelu_pytorch_tanh`). If you want to use the legacy "
f"`{hidden_act}`, edit the config JSON to set " "`%s`, edit the config JSON to set "
f"`hidden_activation={hidden_act}` instead of `hidden_act`. " "`hidden_activation=%s` instead of `hidden_act`. "
"See https://github.com/huggingface/transformers/pull/29402 " "See https://github.com/huggingface/transformers/pull/29402 "
"for more details.") "for more details.", hidden_act, hidden_act)
return GeluAndMul(approximate="tanh") return GeluAndMul(approximate="tanh")
elif hidden_activation == "gelu_pytorch_tanh": elif hidden_activation == "gelu_pytorch_tanh":
return GeluAndMul(approximate="tanh") return GeluAndMul(approximate="tanh")
......
...@@ -183,7 +183,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase): ...@@ -183,7 +183,8 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
"speculative decoding " "speculative decoding "
"requires non-None seq_group_metadata_list") "requires non-None seq_group_metadata_list")
logger.info(f"spec_decode_worker.execute_model {num_lookahead_slots=}") logger.info("spec_decode_worker.execute_model num_lookahead_slots=%d",
num_lookahead_slots)
# If no spec tokens, call the proposer and scorer workers normally. # If no spec tokens, call the proposer and scorer workers normally.
# Used for prefill. # Used for prefill.
......
...@@ -72,9 +72,10 @@ class DbrxAttentionConfig(PretrainedConfig): ...@@ -72,9 +72,10 @@ class DbrxAttentionConfig(PretrainedConfig):
and config_dict["model_type"] != cls.model_type and config_dict["model_type"] != cls.model_type
): ):
logger.warning( logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " "You are using a model of type %s to instantiate a model of "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." "type %s. This is not supported for all configurations of "
) "models and can yield errors.",
config_dict["model_type"], cls.model_type)
return cls.from_dict(config_dict, **kwargs) return cls.from_dict(config_dict, **kwargs)
...@@ -151,9 +152,9 @@ class DbrxFFNConfig(PretrainedConfig): ...@@ -151,9 +152,9 @@ class DbrxFFNConfig(PretrainedConfig):
and config_dict["model_type"] != cls.model_type and config_dict["model_type"] != cls.model_type
): ):
logger.warning( logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " "You are using a model of type %s to instantiate a model of "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." "type %s. This is not supported for all "
) "configurations of models and can yield errors.", config_dict["model_type"], cls.model_type)
return cls.from_dict(config_dict, **kwargs) return cls.from_dict(config_dict, **kwargs)
......
...@@ -138,9 +138,8 @@ def get_lora_tokenizer(lora_request: LoRARequest, *args, ...@@ -138,9 +138,8 @@ def get_lora_tokenizer(lora_request: LoRARequest, *args,
# No tokenizer was found in the LoRA folder, # No tokenizer was found in the LoRA folder,
# use base model tokenizer # use base model tokenizer
logger.warning( logger.warning(
f"No tokenizer found in {lora_request.lora_local_path}, " "No tokenizer found in %s, using base model tokenizer instead. "
"using base model tokenizer instead. " "(Exception: %s)", lora_request.lora_local_path, e)
f"(Exception: {str(e)})")
tokenizer = None tokenizer = None
return tokenizer return tokenizer
......
...@@ -289,8 +289,9 @@ def get_open_port() -> int: ...@@ -289,8 +289,9 @@ def get_open_port() -> int:
def update_environment_variables(envs: Dict[str, str]): def update_environment_variables(envs: Dict[str, str]):
for k, v in envs.items(): for k, v in envs.items():
if k in os.environ and os.environ[k] != v: if k in os.environ and os.environ[k] != v:
logger.warning(f"Overwriting environment variable {k} " logger.warning(
f"from '{os.environ[k]}' to '{v}'") "Overwriting environment variable %s "
"from '%s' to '%s'", k, os.environ[k], v)
os.environ[k] = v os.environ[k] = v
...@@ -310,11 +311,12 @@ def get_nvcc_cuda_version() -> Optional[Version]: ...@@ -310,11 +311,12 @@ def get_nvcc_cuda_version() -> Optional[Version]:
if not cuda_home: if not cuda_home:
cuda_home = '/usr/local/cuda' cuda_home = '/usr/local/cuda'
if os.path.isfile(cuda_home + '/bin/nvcc'): if os.path.isfile(cuda_home + '/bin/nvcc'):
logger.info(f'CUDA_HOME is not found in the environment. ' logger.info(
f'Using {cuda_home} as CUDA_HOME.') 'CUDA_HOME is not found in the environment. '
'Using %s as CUDA_HOME.', cuda_home)
else: else:
logger.warning( logger.warning('Not found nvcc in %s. Skip cuda version check!',
f'Not found nvcc in {cuda_home}. Skip cuda version check!') cuda_home)
return None return None
nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"], nvcc_output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"],
universal_newlines=True) universal_newlines=True)
...@@ -599,8 +601,8 @@ def find_nccl_library(): ...@@ -599,8 +601,8 @@ def find_nccl_library():
# manually load the nccl library # manually load the nccl library
if so_file: if so_file:
logger.info( logger.info(
f"Found nccl from environment variable VLLM_NCCL_SO_PATH={so_file}" "Found nccl from environment variable VLLM_NCCL_SO_PATH=%s",
) so_file)
else: else:
if torch.version.cuda is not None: if torch.version.cuda is not None:
so_file = vllm_nccl_path or find_library("libnccl.so.2") so_file = vllm_nccl_path or find_library("libnccl.so.2")
...@@ -608,7 +610,7 @@ def find_nccl_library(): ...@@ -608,7 +610,7 @@ def find_nccl_library():
so_file = find_library("librccl.so.1") so_file = find_library("librccl.so.1")
else: else:
raise ValueError("NCCL only supports CUDA and ROCm backends.") raise ValueError("NCCL only supports CUDA and ROCm backends.")
logger.info(f"Found nccl from library {so_file}") logger.info("Found nccl from library %s", so_file)
return so_file return so_file
......
...@@ -170,8 +170,8 @@ class ModelRunner: ...@@ -170,8 +170,8 @@ class ModelRunner:
) )
self.model_memory_usage = m.consumed_memory self.model_memory_usage = m.consumed_memory
logger.info(f"Loading model weights took " logger.info("Loading model weights took %.4f GB",
f"{self.model_memory_usage / float(2**30):.4f} GB") self.model_memory_usage / float(2**30))
if self.lora_config: if self.lora_config:
assert hasattr(self.model, "supported_lora_modules" assert hasattr(self.model, "supported_lora_modules"
...@@ -196,16 +196,17 @@ class ModelRunner: ...@@ -196,16 +196,17 @@ class ModelRunner:
self.model.load_kv_cache_scales( self.model.load_kv_cache_scales(
self.model_config.quantization_param_path) self.model_config.quantization_param_path)
else: else:
raise RuntimeError("Using FP8 KV cache and scaling " raise RuntimeError(
"factors provided but model " "Using FP8 KV cache and scaling factors provided but "
f"{self.model.__class__} does not " "model %s does not support loading scaling factors.",
"support loading scaling factors.") self.model.__class__)
else: else:
logger.warn("Using FP8 KV cache but no scaling factors " logger.warning(
"Using FP8 KV cache but no scaling factors "
"provided. Defaulting to scaling factors of 1.0. " "provided. Defaulting to scaling factors of 1.0. "
"This may lead to less accurate results!") "This may lead to less accurate results!")
elif self.model_config.quantization_param_path is not None: elif self.model_config.quantization_param_path is not None:
logger.warn("KV cache scaling factors provided, " logger.warning("KV cache scaling factors provided, "
"but the KV cache data type is not FP8. " "but the KV cache data type is not FP8. "
"KV cache scaling factors will not be used.") "KV cache scaling factors will not be used.")
...@@ -1054,7 +1055,7 @@ class ModelRunner: ...@@ -1054,7 +1055,7 @@ class ModelRunner:
end_time = time.perf_counter() end_time = time.perf_counter()
elapsed_time = end_time - start_time elapsed_time = end_time - start_time
# This usually takes < 10 seconds. # This usually takes < 10 seconds.
logger.info(f"Graph capturing finished in {elapsed_time:.0f} secs.") logger.info("Graph capturing finished in %.0f secs.", elapsed_time)
def __del__(self) -> None: def __del__(self) -> None:
# Delete the CUDA graphs before deleting the pynccl communicator. # Delete the CUDA graphs before deleting the pynccl communicator.
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment