[BugFix] Fix regression caused by mamba state dtype PR (#22998)

Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>

[BugFix] Fix regression caused by mamba state dtype PR (#22998)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
f5d412ba · Thomas Parnell · GitHub · 177e55e3 · f5d412ba · f5d412ba
Unverified commit f5d412ba, authored Aug 16, 2025 by Thomas Parnell; committed by GitHub Aug 15, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 12 additions and 4 deletions

vllm/model_executor/models/phi4flash.py +6 -2

vllm/model_executor/models/plamo2.py +6 -2

No files found.
--- a/vllm/model_executor/models/phi4flash.py
+++ b/vllm/model_executor/models/phi4flash.py
@@ -650,8 +650,12 @@ class Phi4FlashForCausalLM(nn.Module, HasInnerState, IsHybrid, SupportsV0Only):
            num_mamba_layers = self.config.num_hidden_layers \
                // 2 // self.config.mb_per_layer + 1
            self.mamba_cache = MambaCacheManager(
-                self.vllm_config, self.lm_head.weight.dtype, num_mamba_layers,
+                self.vllm_config,
-                *self._get_mamba_cache_shape())
+                num_mamba_layers,
+                *self._get_mamba_cache_shape(),
+                self.lm_head.weight.dtype,
+                self.lm_head.weight.dtype,
+            )
        mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs)
        attn_metadata = get_forward_context().attn_metadata

--- a/vllm/model_executor/models/plamo2.py
+++ b/vllm/model_executor/models/plamo2.py
@@ -767,8 +767,12 @@ class Plamo2ForCausalLM(Plamo2PreTrainedModel, HasInnerState, SupportsPP,
                self.vllm_config.parallel_config, LayerBlockType.mamba)
            self.mamba_cache = MambaCacheManager(
-                self.vllm_config, self.lm_head.weight.dtype, num_mamba_layers,
+                self.vllm_config,
-                *self._get_mamba_cache_shape())
+                num_mamba_layers,
+                *self._get_mamba_cache_shape(),
+                self.lm_head.weight.dtype,
+                self.lm_head.weight.dtype,
+            )
        mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs)