Unverified commit 2c11a738, authored by Congcong Chen, committed by GitHub
Browse files

[Model] New model support for microsoft/Phi-4-mini-flash-reasoning (#20702)


Signed-off-by: Congcong Chen <congcongchen@microsoft.com>
parent b639327a
...@@ -1112,6 +1112,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): ...@@ -1112,6 +1112,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
(self.max_batchsize_to_capture, self.get_max_block_per_batch()), (self.max_batchsize_to_capture, self.get_max_block_per_batch()),
dtype=np.int32) dtype=np.int32)
self.cross_layer_shared_graph_block_tables = np.zeros(
(self.max_batchsize_to_capture, self.get_max_block_per_batch()),
dtype=np.int32)
# Attention-free but stateful models like Mamba need a placeholder attn # Attention-free but stateful models like Mamba need a placeholder attn
# backend, as the attention metadata is needed to manage internal state. # backend, as the attention metadata is needed to manage internal state.
# However we must bypass attention selection altogether for some models # However we must bypass attention selection altogether for some models
......
...@@ -9,7 +9,8 @@ import torch ...@@ -9,7 +9,8 @@ import torch
import torch.distributed import torch.distributed
import vllm.envs as envs import vllm.envs as envs
from vllm.config import VllmConfig from vllm.attention.layer import Attention
from vllm.config import VllmConfig, get_layers_from_vllm_config
from vllm.device_allocator.cumem import CuMemAllocator from vllm.device_allocator.cumem import CuMemAllocator
from vllm.distributed import (ensure_model_parallel_initialized, from vllm.distributed import (ensure_model_parallel_initialized,
init_distributed_environment, init_distributed_environment,
...@@ -345,8 +346,29 @@ class Worker(LocalOrDistributedWorkerBase): ...@@ -345,8 +346,29 @@ class Worker(LocalOrDistributedWorkerBase):
self.cache_engine[ve].gpu_cache self.cache_engine[ve].gpu_cache
for ve in range(self.parallel_config.pipeline_parallel_size) for ve in range(self.parallel_config.pipeline_parallel_size)
] ]
# Layer pairings for cross-layer KV sharing.
# If an Attention layer `layer_name` is in the keys of this dict, it
# means this layer will perform attention using the keys and values
# from the KV cache of `shared_kv_cache_layers[layer_name]`.
shared_kv_cache_layers: dict[str, str] = {}
attn_layers = get_layers_from_vllm_config(self.vllm_config, Attention)
for layer_name, attn_module in attn_layers.items():
if (kv_tgt_layer :=
attn_module.kv_sharing_target_layer_name) is not None:
# The layer doesn't need its own KV cache and will use that of
# the target layer. We skip creating a KVCacheSpec for it, so
# that KV cache management logic will act as if this layer does
# not exist, and won't allocate KV cache for the layer. This
# enables the memory saving of cross-layer kv sharing, allowing
# a given amount of memory to accommodate longer context lengths
# or enable more requests to be processed simultaneously.
shared_kv_cache_layers[layer_name] = kv_tgt_layer
bind_kv_cache(self.compilation_config.static_forward_context, bind_kv_cache(self.compilation_config.static_forward_context,
self.gpu_cache) self.gpu_cache, shared_kv_cache_layers)
def _warm_up_model(self) -> None: def _warm_up_model(self) -> None:
# warm up sizes that are not in cudagraph capture sizes, # warm up sizes that are not in cudagraph capture sizes,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment