Unverified commit 68d535ef, authored by Jun Duan, committed by GitHub
Browse files

[Misc] Capture and log the time of loading weights (#13666)

parent c6ed9386
...@@ -1048,6 +1048,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -1048,6 +1048,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
def load_model(self) -> None: def load_model(self) -> None:
logger.info("Starting to load model %s...", self.model_config.model) logger.info("Starting to load model %s...", self.model_config.model)
with DeviceMemoryProfiler() as m: # noqa: SIM117 with DeviceMemoryProfiler() as m: # noqa: SIM117
time_before_load = time.perf_counter()
self.model = get_model(vllm_config=self.vllm_config) self.model = get_model(vllm_config=self.vllm_config)
if self.lora_config: if self.lora_config:
self.model = self.load_lora_model(self.model, self.model = self.load_lora_model(self.model,
...@@ -1055,10 +1056,11 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -1055,10 +1056,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self.scheduler_config, self.scheduler_config,
self.lora_config, self.lora_config,
self.device) self.device)
time_after_load = time.perf_counter()
self.model_memory_usage = m.consumed_memory self.model_memory_usage = m.consumed_memory
logger.info("Loading model weights took %.4f GB", logger.info("Loading model weights took %.4f GB and %.6f seconds",
self.model_memory_usage / float(2**30)) self.model_memory_usage / float(2**30),
time_after_load - time_before_load)
def _get_prompt_logprobs_dict( def _get_prompt_logprobs_dict(
self, self,
......
...@@ -1109,11 +1109,14 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): ...@@ -1109,11 +1109,14 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
def load_model(self) -> None: def load_model(self) -> None:
logger.info("Starting to load model %s...", self.model_config.model) logger.info("Starting to load model %s...", self.model_config.model)
with DeviceMemoryProfiler(self.device) as m: with DeviceMemoryProfiler(self.device) as m:
time_before_load = time.perf_counter()
self.model = get_model(vllm_config=self.vllm_config) self.model = get_model(vllm_config=self.vllm_config)
time_after_load = time.perf_counter()
self.model_memory_usage = m.consumed_memory self.model_memory_usage = m.consumed_memory
logger.info("Loading model weights took %.4f GB", logger.info("Loading model weights took %.4f GB and %.6f seconds",
self.model_memory_usage / float(2**30)) self.model_memory_usage / float(2**30),
time_after_load - time_before_load)
if self.lora_config: if self.lora_config:
assert supports_lora( assert supports_lora(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment