"vscode:/vscode.git/clone" did not exist on "6e650f56a16618db87147d97f699fa407ed1205d"
Unverified commit 49252cf5, authored by Roger Wang and committed by GitHub
Browse files

[MM] Allow skipping memory profiling for multimodal models. (#22950)


Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
parent 3e6dd400
......@@ -388,6 +388,10 @@ class ModelConfig:
interleave_mm_strings: bool = False
"""Enable fully interleaved support for multimodal prompts, while using
--chat-template-content-format=string. Defaults to False."""
skip_mm_profiling: bool = False
"""When enabled, skips multimodal memory profiling and only profiles with
language backbone model during engine initialization.
"""
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
......@@ -837,7 +841,8 @@ class ModelConfig:
media_io_kwargs=self.media_io_kwargs,
mm_processor_kwargs=self.mm_processor_kwargs,
mm_processor_cache_gb=self.mm_processor_cache_gb,
interleave_mm_strings=self.interleave_mm_strings)
interleave_mm_strings=self.interleave_mm_strings,
skip_mm_profiling=self.skip_mm_profiling)
return None
......@@ -2511,6 +2516,16 @@ class MultiModalConfig:
Enable fully interleaved support for multimodal prompts.
"""
skip_mm_profiling: bool = False
"""
When enabled, skips multimodal memory profiling and only profiles with
language backbone model during engine initialization.
This reduces engine startup time but shifts the responsibility to users for
estimating the peak memory usage of the activation of multimodal encoder and
embedding cache.
"""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
......
......@@ -350,6 +350,7 @@ class EngineArgs:
MultiModalConfig.mm_processor_kwargs
disable_mm_preprocessor_cache: bool = False # DEPRECATED
mm_processor_cache_gb: int = MultiModalConfig.mm_processor_cache_gb
skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
# LoRA fields
enable_lora: bool = False
enable_lora_bias: bool = LoRAConfig.bias_enabled
......@@ -716,6 +717,8 @@ class EngineArgs:
multimodal_group.add_argument(
"--interleave-mm-strings",
**multimodal_kwargs["interleave_mm_strings"])
multimodal_group.add_argument("--skip-mm-profiling",
**multimodal_kwargs["skip_mm_profiling"])
# LoRA related configs
lora_kwargs = get_kwargs(LoRAConfig)
......@@ -918,6 +921,7 @@ class EngineArgs:
limit_mm_per_prompt=self.limit_mm_per_prompt,
interleave_mm_strings=self.interleave_mm_strings,
media_io_kwargs=self.media_io_kwargs,
skip_mm_profiling=self.skip_mm_profiling,
use_async_output_proc=not self.disable_async_output_proc,
config_format=self.config_format,
mm_processor_kwargs=self.mm_processor_kwargs,
......
......@@ -2479,6 +2479,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
def profile_run(self) -> None:
# Profile with multimodal encoder & encoder cache.
if self.supports_mm_inputs:
if self.model_config.multimodal_config.skip_mm_profiling:
logger.info(
"Skipping memory profiling for multimodal encoder and "
"encoder cache.")
else:
mm_budget = self.mm_budget
assert mm_budget is not None
......@@ -2498,8 +2503,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
logger.info(
"Encoder cache will be initialized with a budget of "
"%s tokens, and profiled with %s %s items of the maximum "
"feature size.",
"%s tokens, and profiled with %s %s items of the "
"maximum feature size.",
encoder_budget,
max_mm_items_per_batch,
dummy_modality,
......@@ -2512,7 +2517,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
)
# Run multimodal encoder.
dummy_encoder_outputs = self.model.get_multimodal_embeddings(
dummy_encoder_outputs = \
self.model.get_multimodal_embeddings(
**batched_dummy_mm_inputs)
sanity_check_mm_encoder_outputs(
......
......@@ -1529,6 +1529,11 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
) -> None:
# Profile with multimodal encoder & encoder cache.
if self.supports_mm_inputs:
if self.model_config.multimodal_config.skip_mm_profiling:
logger.info(
"Skipping memory profiling for multimodal encoder and "
"encoder cache.")
else:
mm_budget = self.mm_budget
assert mm_budget is not None
......@@ -1548,8 +1553,8 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
logger.info(
"Encoder cache will be initialized with a budget of "
"%s tokens, and profiled with %s %s items of the maximum "
"feature size.",
"%s tokens, and profiled with %s %s items of the "
"maximum feature size.",
encoder_budget,
max_mm_items_per_batch,
dummy_modality,
......@@ -1566,13 +1571,14 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
# impact of recompilation until it's fixed.
start = time.perf_counter()
xm.mark_step()
dummy_encoder_outputs = self.model.get_multimodal_embeddings(
dummy_encoder_outputs = \
self.model.get_multimodal_embeddings(
**batched_dummy_mm_inputs)
xm.mark_step()
xm.wait_device_ops()
end = time.perf_counter()
logger.info(
"Multimodal Encoder profiling finished in in %.2f [secs].",
"Multimodal Encoder profiling finished in %.2f [secs].",
end - start)
sanity_check_mm_encoder_outputs(
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment