[Model] New model support for microsoft/Phi-4-mini-flash-reasoning (#20702)

Signed-off-by: Congcong Chen <congcongchen@microsoft.com>

[Model] New model support for microsoft/Phi-4-mini-flash-reasoning (#20702)
Signed-off-by: Congcong Chen <congcongchen@microsoft.com>
2c11a738 · Congcong Chen · GitHub · b639327a · 2c11a738 · 2c11a738
Unverified Commit 2c11a738 authored Jul 12, 2025 by Congcong Chen Committed by GitHub Jul 12, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 28 additions and 2 deletions

vllm/worker/model_runner.py vllm/worker/model_runner.py +4 -0

vllm/worker/worker.py vllm/worker/worker.py +24 -2

No files found.
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1112,6 +1112,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
            (self.max_batchsize_to_capture, self.get_max_block_per_batch()),
            dtype=np.int32)
+        self.cross_layer_shared_graph_block_tables = np.zeros(
+            (self.max_batchsize_to_capture, self.get_max_block_per_batch()),
+            dtype=np.int32)
        # Attention-free but stateful models like Mamba need a placeholder attn
        # backend, as the attention metadata is needed to manage internal state.
        # However we must bypass attention selection altogether for some models

--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -9,7 +9,8 @@ import torch
 import torch.distributed
 import vllm.envs as envs
-from vllm.config import VllmConfig
+from vllm.attention.layer import Attention
+from vllm.config import VllmConfig, get_layers_from_vllm_config
 from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.distributed import (ensure_model_parallel_initialized,
                              init_distributed_environment,
@@ -345,8 +346,29 @@ class Worker(LocalOrDistributedWorkerBase):
            self.cache_engine[ve].gpu_cache
            for ve in range(self.parallel_config.pipeline_parallel_size)
        ]
+        # Layer pairings for cross-layer KV sharing.
+        # If an Attention layer `layer_name` is in the keys of this dict, it
+        # means this layer will perform attention using the keys and values
+        # from the KV cache of `shared_kv_cache_layers[layer_name]`.
+        shared_kv_cache_layers: dict[str, str] = {}
+        attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention)
+        for layer_name, attn_module in attn_layers.items():
+            if (kv_tgt_layer :=
+                    attn_module.kv_sharing_target_layer_name) is not None:
+                # The layer doesn't need its own KV cache and will use that of
+                # the target layer. We skip creating a KVCacheSpec for it, so
+                # that KV cache management logic will act as if this layer does
+                # not exist and will not allocate a KV cache for it. This
+                # enables the memory savings of cross-layer KV sharing, letting
+                # a given amount of memory accommodate longer context lengths
+                # or more requests processed simultaneously.
+                shared_kv_cache_layers[layer_name] = kv_tgt_layer
        bind_kv_cache(self.compilation_config.static_forward_context,
-                      self.gpu_cache)
+                      self.gpu_cache, shared_kv_cache_layers)
    def _warm_up_model(self) -> None:
        # warm up sizes that are not in cudagraph capture sizes,