sync v0.15.1

de889cb6 · zhuwenwen · c721b814 · de889cb6 · de889cb6 · de889cb6
Commit de889cb6 authored Feb 05, 2026 by zhuwenwen
4 changed files
--- a/vllm/model_executor/models/mistral.py
+++ b/vllm/model_executor/models/mistral.py
@@ -156,8 +156,16 @@ class MistralDecoderLayer(LlamaDecoderLayer):
        )
        self.layer_idx = int(prefix.split(sep=".")[-1])
+        quant_config = self.get_quant_config(vllm_config)
        config = config or vllm_config.model_config.hf_config
+        do_fusion = getattr(
+            quant_config, "enable_quantization_scaling_fusion", False
+        ) and vllm_config.cache_config.cache_dtype.startswith("fp8")
+        if do_fusion:
+            self.input_layernorm.quant_scaling_from = self.self_attn.qkv_proj
+            self.post_attention_layernorm.quant_scaling_from = self.mlp.gate_up_proj
        if getattr(config, "ada_rms_norm_t_cond", False):
            self.ada_rms_norm_t_cond = nn.Sequential(
                ColumnParallelLinear(
@@ -339,4 +347,4 @@ class MistralForCausalLM(LlamaForCausalLM):
            elif item in mapping and mapping[item] not in name:
                name = name.replace(item, mapping[item])
        return name, loaded_weight
\ No newline at end of file
--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -284,6 +284,7 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
            num_splits = torch.zeros((B + 1,), dtype=dtype, device=device)
            scheduler_metadata.tile_scheduler_metadata = tile_scheduler_metadata
            scheduler_metadata.num_splits = num_splits
        if self.kv_cache_dtype.startswith("fp8"):
            o, lse = flash_mla_with_kvcache_fp8(
                q=q,

--- a/vllm/v1/attention/ops/chunked_prefill_paged_decode.py
+++ b/vllm/v1/attention/ops/chunked_prefill_paged_decode.py
@@ -302,7 +302,6 @@ def chunked_prefill_paged_decode(
    block_size = value_cache.shape[3]
    num_seqs = len(seq_lens)
    num_query_heads = query.shape[1]
-    # key may be None in cross-attention decode (already cached from encoder)
    num_kv_heads = key.shape[1]
    num_queries_per_kv = query.shape[1] // key.shape[1]
    head_size = query.shape[2]

--- a/vllm/v1/attention/ops/flashmla.py
+++ b/vllm/v1/attention/ops/flashmla.py
@@ -22,7 +22,7 @@ else:
 if current_platform.is_cuda():
    try:
        import vllm._flashmla_extension_C  # noqa: F401
        _flashmla_extension_C_AVAILABLE = True
    except ImportError:
        _flashmla_extension_C_AVAILABLE = False