Unverified commit f819265a, authored by roikoren755 and committed by GitHub
Browse files

Default to 'align' mamba cache mode for Mamba-based models when speculative...


Default to 'align' mamba cache mode for Mamba-based models when speculative decoding is enabled (#40454)
Signed-off-by: Roi Koren <roik@nvidia.com>
parent 936e0b79
......@@ -325,15 +325,26 @@ class MambaModelConfig(VerifyAndUpdateConfig):
if cache_config.enable_prefix_caching:
if cache_config.mamba_cache_mode == "none":
cache_config.mamba_cache_mode = (
"all" if model_config.supports_mamba_prefix_caching else "align"
)
logger.warning(
"Mamba cache mode is set to '%s' for %s by default "
"when prefix caching is enabled",
cache_config.mamba_cache_mode,
model_config.architecture,
)
if (
model_config.supports_mamba_prefix_caching
and vllm_config.speculative_config is not None
):
cache_config.mamba_cache_mode = "align"
logger.warning(
"Mamba cache mode is set to 'align' for %s by default "
"when prefix caching and speculative decoding are enabled",
model_config.architecture,
)
else:
cache_config.mamba_cache_mode = (
"all" if model_config.supports_mamba_prefix_caching else "align"
)
logger.warning(
"Mamba cache mode is set to '%s' for %s by default "
"when prefix caching is enabled",
cache_config.mamba_cache_mode,
model_config.architecture,
)
if (
cache_config.mamba_cache_mode == "all"
and not model_config.supports_mamba_prefix_caching
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment