Commit fc7980db authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.15.1' into v0.15.1-ori

parents 3eab7fef 1892993b
...@@ -26,7 +26,6 @@ from vllm.model_executor.layers.fused_moe import ( ...@@ -26,7 +26,6 @@ from vllm.model_executor.layers.fused_moe import (
) )
from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.config import (
FusedMoEQuantConfig, FusedMoEQuantConfig,
RoutingMethodType,
) )
from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod from vllm.model_executor.layers.fused_moe.layer import UnquantizedFusedMoEMethod
from vllm.model_executor.layers.fused_moe.oracle.fp8 import ( from vllm.model_executor.layers.fused_moe.oracle.fp8 import (
...@@ -964,17 +963,9 @@ class Fp8MoEMethod(FusedMoEMethodBase): ...@@ -964,17 +963,9 @@ class Fp8MoEMethod(FusedMoEMethodBase):
if self.block_quant: if self.block_quant:
import vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe # noqa: E501, F401 import vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe # noqa: E501, F401
e_score_correction_bias = (
layer.e_score_correction_bias.to(x.dtype)
if layer.e_score_correction_bias is not None
else None
)
routing_method_type = layer.routing_method_type
return torch.ops.vllm.flashinfer_fused_moe_blockscale_fp8( return torch.ops.vllm.flashinfer_fused_moe_blockscale_fp8(
routing_logits=router_logits.to(torch.float32) routing_logits=router_logits,
if routing_method_type == RoutingMethodType.DeepSeekV3 routing_bias=layer.e_score_correction_bias,
else router_logits,
routing_bias=e_score_correction_bias,
x=x, x=x,
w13_weight=layer.w13_weight, w13_weight=layer.w13_weight,
w13_weight_scale_inv=layer.w13_weight_scale_inv, w13_weight_scale_inv=layer.w13_weight_scale_inv,
...@@ -988,7 +979,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): ...@@ -988,7 +979,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
expert_offset=layer.ep_rank * layer.local_num_experts, expert_offset=layer.ep_rank * layer.local_num_experts,
local_num_experts=layer.local_num_experts, local_num_experts=layer.local_num_experts,
block_shape=self.weight_block_size, block_shape=self.weight_block_size,
routing_method_type=routing_method_type, routing_method_type=layer.routing_method_type,
routed_scaling=layer.routed_scaling_factor, routed_scaling=layer.routed_scaling_factor,
) )
else: else:
......
...@@ -28,6 +28,7 @@ def rocm_per_tensor_float_w8a8_scaled_mm_impl( ...@@ -28,6 +28,7 @@ def rocm_per_tensor_float_w8a8_scaled_mm_impl(
A.shape[0] == 1 A.shape[0] == 1
and B.shape[1] % 16 == 0 and B.shape[1] % 16 == 0
and ((bias is None) or (bias.dtype == out_dtype)) and ((bias is None) or (bias.dtype == out_dtype))
and A.is_contiguous()
): ):
output = ops.wvSplitKQ( output = ops.wvSplitKQ(
B.t(), B.t(),
......
...@@ -6,7 +6,6 @@ from typing import TYPE_CHECKING ...@@ -6,7 +6,6 @@ from typing import TYPE_CHECKING
import torch import torch
import vllm.envs as envs
import vllm.model_executor.layers.fused_moe.modular_kernel as mk import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.logger import init_logger from vllm.logger import init_logger
...@@ -22,10 +21,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( ...@@ -22,10 +21,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
swizzle_blockscale, swizzle_blockscale,
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.flashinfer import (
has_flashinfer_cutedsl_grouped_gemm_nt_masked,
has_flashinfer_cutlass_fused_moe,
)
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.model_executor.layers.fused_moe.oracle.nvfp4 import ( from vllm.model_executor.layers.fused_moe.oracle.nvfp4 import (
...@@ -36,8 +31,6 @@ logger = init_logger(__name__) ...@@ -36,8 +31,6 @@ logger = init_logger(__name__)
__all__ = [ __all__ = [
"is_flashinfer_fp4_cutlass_moe_available",
"is_flashinfer_fp4_cutedsl_moe_available",
"reorder_w1w3_to_w3w1", "reorder_w1w3_to_w3w1",
] ]
...@@ -122,26 +115,6 @@ def is_supported_config_trtllm( ...@@ -122,26 +115,6 @@ def is_supported_config_trtllm(
return True, None return True, None
def is_flashinfer_fp4_cutlass_moe_available() -> bool:
"""Return `True` when FlashInfer CUTLASS NV-FP4 kernels can be used."""
return (
envs.VLLM_USE_FLASHINFER_MOE_FP4
and has_flashinfer_cutlass_fused_moe()
and current_platform.is_cuda()
and current_platform.has_device_capability(100)
)
def is_flashinfer_fp4_cutedsl_moe_available() -> bool:
"""Return ``True`` when FlashInfer CUTEDSL NV-FP4 kernels can be used."""
return (
envs.VLLM_USE_FLASHINFER_MOE_FP4
and has_flashinfer_cutedsl_grouped_gemm_nt_masked()
and current_platform.is_cuda()
and current_platform.is_device_capability_family(100)
)
def reorder_w1w3_to_w3w1( def reorder_w1w3_to_w3w1(
weight: torch.Tensor, scale: torch.Tensor, dim: int = -2 weight: torch.Tensor, scale: torch.Tensor, dim: int = -2
) -> tuple[torch.Tensor, torch.Tensor]: ) -> tuple[torch.Tensor, torch.Tensor]:
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
import vllm.envs as envs
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization.utils.flashinfer_fp4_moe import (
is_flashinfer_fp4_cutedsl_moe_available,
is_flashinfer_fp4_cutlass_moe_available,
)
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
is_fp4_marlin_supported,
)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
cutlass_fp4_supported,
)
__all__ = ["detect_nvfp4_moe_support", "NvFp4Support"]
_logger = init_logger(__name__)
@dataclass(frozen=True)
class NvFp4Support:
"""Result container for NV-FP4 capability probing."""
cutlass_supported: bool
allow_flashinfer: bool
use_marlin: bool
def detect_nvfp4_moe_support(class_name: str = "") -> NvFp4Support:
"""Detect platform support for NV-FP4 fused-MoE path"""
cutlass_supported = cutlass_fp4_supported()
allow_flashinfer = cutlass_supported and (
is_flashinfer_fp4_cutlass_moe_available()
or is_flashinfer_fp4_cutedsl_moe_available()
)
if allow_flashinfer:
_logger.info_once(
"Using FlashInfer kernels for %s.", class_name or "NVFP4 path"
)
else:
if envs.VLLM_USE_FLASHINFER_MOE_FP4:
_logger.warning_once(
"FlashInfer kernels unavailable for %s on current platform.",
class_name or "NVFP4 path",
)
use_marlin = False
if not cutlass_supported:
if is_fp4_marlin_supported():
use_marlin = True
_logger.info_once("Falling back to Marlin FP4 MoE kernel.")
else:
raise ValueError(
"Current platform does not support NVFP4 quantization. "
"Please use Blackwell GPUs or enable FlashInfer."
)
return NvFp4Support(
cutlass_supported=cutlass_supported,
allow_flashinfer=allow_flashinfer,
use_marlin=use_marlin,
)
...@@ -146,6 +146,7 @@ def rocm_unquantized_gemm_impl( ...@@ -146,6 +146,7 @@ def rocm_unquantized_gemm_impl(
and n <= 128 and n <= 128
and k > 512 and k > 512
and math.ceil(k / 512) * math.ceil(m / 16) < get_cu_count() and math.ceil(k / 512) * math.ceil(m / 16) < get_cu_count()
and x.is_contiguous()
) )
# k == 2880 and (m == 640 or m == 128)) # k == 2880 and (m == 640 or m == 128))
) )
...@@ -165,6 +166,7 @@ def rocm_unquantized_gemm_impl( ...@@ -165,6 +166,7 @@ def rocm_unquantized_gemm_impl(
and on_gfx9() and on_gfx9()
and x.dtype in [torch.float16, torch.bfloat16] and x.dtype in [torch.float16, torch.bfloat16]
and k % 8 == 0 and k % 8 == 0
and x.is_contiguous()
) )
if use_skinny is not True: if use_skinny is not True:
......
...@@ -466,6 +466,7 @@ def load_weights_using_from_2_way_softmax( ...@@ -466,6 +466,7 @@ def load_weights_using_from_2_way_softmax(
language_model = _get_language_model_for_seq_cls(model) language_model = _get_language_model_for_seq_cls(model)
is_vlm = language_model is not model is_vlm = language_model is not model
using_vlm_head = is_vlm and hasattr(language_model, "score")
language_model.lm_head = ParallelLMHead( language_model.lm_head = ParallelLMHead(
text_config.vocab_size, text_config.hidden_size, quant_config=quant_config text_config.vocab_size, text_config.hidden_size, quant_config=quant_config
...@@ -506,14 +507,16 @@ def load_weights_using_from_2_way_softmax( ...@@ -506,14 +507,16 @@ def load_weights_using_from_2_way_softmax(
torch.float32 torch.float32
) - lm_head_weight.data[[false_id]].to(torch.float32) ) - lm_head_weight.data[[false_id]].to(torch.float32)
score_layer = language_model.score if is_vlm else model.score score_layer = language_model.score if using_vlm_head else model.score
param = score_layer.weight param = score_layer.weight
weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, score_weight) weight_loader(param, score_weight)
del language_model.lm_head del language_model.lm_head
score_weight_name = "language_model.score.weight" if is_vlm else "score.weight" score_weight_name = (
"language_model.score.weight" if using_vlm_head else "score.weight"
)
loaded_weights.add(score_weight_name) loaded_weights.add(score_weight_name)
lm_head_name = "lm_head.weight" lm_head_name = "lm_head.weight"
...@@ -537,6 +540,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te ...@@ -537,6 +540,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
language_model = _get_language_model_for_seq_cls(model) language_model = _get_language_model_for_seq_cls(model)
is_vlm = language_model is not model is_vlm = language_model is not model
using_vlm_head = is_vlm and hasattr(language_model, "score")
language_model.lm_head = ParallelLMHead( language_model.lm_head = ParallelLMHead(
text_config.vocab_size, text_config.hidden_size, quant_config=quant_config text_config.vocab_size, text_config.hidden_size, quant_config=quant_config
...@@ -572,14 +576,16 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te ...@@ -572,14 +576,16 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens] token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
score_weight = language_model.lm_head.weight.data[token_ids] score_weight = language_model.lm_head.weight.data[token_ids]
score_layer = language_model.score if is_vlm else model.score score_layer = language_model.score if using_vlm_head else model.score
param = score_layer.weight param = score_layer.weight
weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, score_weight) weight_loader(param, score_weight)
del language_model.lm_head del language_model.lm_head
score_weight_name = "language_model.score.weight" if is_vlm else "score.weight" score_weight_name = (
"language_model.score.weight" if using_vlm_head else "score.weight"
)
loaded_weights.add(score_weight_name) loaded_weights.add(score_weight_name)
lm_head_name = "lm_head.weight" lm_head_name = "lm_head.weight"
......
...@@ -107,6 +107,7 @@ class MiniMaxM2MoE(nn.Module): ...@@ -107,6 +107,7 @@ class MiniMaxM2MoE(nn.Module):
renormalize=True, renormalize=True,
quant_config=quant_config, quant_config=quant_config,
prefix=f"{prefix}.experts", prefix=f"{prefix}.experts",
router_logits_dtype=torch.float32,
) )
self.gate = ReplicatedLinear( self.gate = ReplicatedLinear(
......
...@@ -11,7 +11,6 @@ import math ...@@ -11,7 +11,6 @@ import math
from collections.abc import Iterable, Mapping, Sequence from collections.abc import Iterable, Mapping, Sequence
from typing import Annotated, Literal from typing import Annotated, Literal
import cv2
import numpy as np import numpy as np
import torch import torch
import torch.nn as nn import torch.nn as nn
...@@ -416,6 +415,8 @@ class NemotronParseImageProcessor: ...@@ -416,6 +415,8 @@ class NemotronParseImageProcessor:
else: else:
self.target_height = self.target_width = int(self.final_size) self.target_height = self.target_width = int(self.final_size)
import cv2
self.transform = A.Compose( self.transform = A.Compose(
[ [
A.PadIfNeeded( A.PadIfNeeded(
...@@ -457,6 +458,8 @@ class NemotronParseImageProcessor: ...@@ -457,6 +458,8 @@ class NemotronParseImageProcessor:
new_height = int(new_width / aspect_ratio) new_height = int(new_width / aspect_ratio)
# Use cv2.INTER_LINEAR like the original # Use cv2.INTER_LINEAR like the original
import cv2
return cv2.resize( return cv2.resize(
image, (new_width, new_height), interpolation=cv2.INTER_LINEAR image, (new_width, new_height), interpolation=cv2.INTER_LINEAR
) )
......
...@@ -188,6 +188,7 @@ _TEXT_GENERATION_MODELS = { ...@@ -188,6 +188,7 @@ _TEXT_GENERATION_MODELS = {
"SeedOssForCausalLM": ("seed_oss", "SeedOssForCausalLM"), "SeedOssForCausalLM": ("seed_oss", "SeedOssForCausalLM"),
"Step1ForCausalLM": ("step1", "Step1ForCausalLM"), "Step1ForCausalLM": ("step1", "Step1ForCausalLM"),
"Step3TextForCausalLM": ("step3_text", "Step3TextForCausalLM"), "Step3TextForCausalLM": ("step3_text", "Step3TextForCausalLM"),
"Step3p5ForCausalLM": ("step3p5", "Step3p5ForCausalLM"),
"StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"), "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"),
"StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"), "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"),
"Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"), "Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"),
...@@ -478,6 +479,7 @@ _SPECULATIVE_DECODING_MODELS = { ...@@ -478,6 +479,7 @@ _SPECULATIVE_DECODING_MODELS = {
"MedusaModel": ("medusa", "Medusa"), "MedusaModel": ("medusa", "Medusa"),
"OpenPanguMTPModel": ("openpangu_mtp", "OpenPanguMTP"), "OpenPanguMTPModel": ("openpangu_mtp", "OpenPanguMTP"),
"Qwen3NextMTP": ("qwen3_next_mtp", "Qwen3NextMTP"), "Qwen3NextMTP": ("qwen3_next_mtp", "Qwen3NextMTP"),
"Step3p5MTP": ("step3p5_mtp", "Step3p5MTP"),
# Temporarily disabled. # Temporarily disabled.
# # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1. # # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1.
# "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"), # "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
......
This diff is collapsed.
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Iterable
import torch
import torch.nn as nn
from transformers import PretrainedConfig
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.model_executor.layers.layernorm import GemmaRMSNorm
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead,
VocabParallelEmbedding,
)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.sequence import IntermediateTensors
from .step3p5 import Step3p5DecoderLayer, get_spec_layer_idx_from_weight_name
from .utils import maybe_prefix
logger = init_logger(__name__)
class SharedHead(nn.Module):
def __init__(
self,
config: PretrainedConfig,
quant_config: QuantizationConfig | None = None,
) -> None:
super().__init__()
self.norm = GemmaRMSNorm(config.hidden_size, config.rms_norm_eps)
self.head = ParallelLMHead(
config.vocab_size, config.hidden_size, quant_config=quant_config
)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
return self.norm(hidden_states)
class Step3p5AMultiTokenPredictorLayer(nn.Module):
def __init__(
self,
vllm_config: VllmConfig,
prefix: str,
) -> None:
super().__init__()
config = vllm_config.model_config.hf_config
quant_config = vllm_config.quant_config
self.enorm = GemmaRMSNorm(config.hidden_size, config.rms_norm_eps)
self.hnorm = GemmaRMSNorm(config.hidden_size, config.rms_norm_eps)
self.eh_proj = nn.Linear(config.hidden_size * 2, config.hidden_size, bias=False)
self.shared_head = SharedHead(config=config, quant_config=quant_config)
self.mtp_block = Step3p5DecoderLayer(
vllm_config,
prefix=f"{prefix}.mtp_block",
)
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
previous_hidden_states: torch.Tensor,
inputs_embeds: torch.Tensor | None = None,
spec_step_index: int = 0,
) -> torch.Tensor:
assert inputs_embeds is not None
inputs_embeds = self.enorm(inputs_embeds)
previous_hidden_states = self.hnorm(previous_hidden_states)
hidden_states = self.eh_proj(
torch.cat([inputs_embeds, previous_hidden_states], dim=-1)
)
hidden_states = self.mtp_block(positions=positions, hidden_states=hidden_states)
return hidden_states
class Step3p5AMultiTokenPredictor(nn.Module):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
self.embed_tokens = VocabParallelEmbedding(
config.vocab_size,
config.hidden_size,
)
self.mtp_start_layer_idx = config.num_hidden_layers
self.num_mtp_layers = config.num_nextn_predict_layers
# to map the exact layer index from weights
self.layers = torch.nn.ModuleDict(
{
str(idx): Step3p5AMultiTokenPredictorLayer(
vllm_config,
f"{prefix}.layers.{idx}",
)
for idx in range(
self.mtp_start_layer_idx,
self.mtp_start_layer_idx + self.num_mtp_layers,
)
}
)
self.logits_processor = LogitsProcessor(config.vocab_size)
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
previous_hidden_states: torch.Tensor,
inputs_embeds: torch.Tensor | None = None,
spec_step_idx: int = 0,
) -> torch.Tensor:
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
current_step_idx = spec_step_idx % self.num_mtp_layers
return self.layers[str(self.mtp_start_layer_idx + current_step_idx)](
input_ids,
positions,
previous_hidden_states,
inputs_embeds,
current_step_idx,
)
def compute_logits(
self,
hidden_states: torch.Tensor,
spec_step_idx: int = 0,
) -> torch.Tensor:
current_step_idx = spec_step_idx % self.num_mtp_layers
mtp_layer = self.layers[str(self.mtp_start_layer_idx + current_step_idx)]
logits = self.logits_processor(
mtp_layer.shared_head.head, mtp_layer.shared_head(hidden_states)
)
return logits
def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
return self.embed_tokens(input_ids)
class Step3p5MTP(nn.Module):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
self.config = vllm_config.model_config.hf_config
self.vllm_config = vllm_config
self.model = Step3p5AMultiTokenPredictor(
vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")
)
def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
return self.model.embed_input_ids(input_ids)
def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
hidden_states: torch.Tensor,
intermediate_tensors: IntermediateTensors | None = None,
inputs_embeds: torch.Tensor | None = None,
spec_step_idx: int = 0,
) -> torch.Tensor:
hidden_states = self.model(
input_ids, positions, hidden_states, inputs_embeds, spec_step_idx
)
return hidden_states
def compute_logits(
self,
hidden_states: torch.Tensor,
spec_step_idx: int = 0,
) -> torch.Tensor | None:
return self.model.compute_logits(hidden_states, spec_step_idx)
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
("qkv_proj", "k_proj", "k"),
("qkv_proj", "v_proj", "v"),
("gate_up_proj", "gate_proj", 0),
("gate_up_proj", "up_proj", 1),
]
expert_params_mapping = [
(".moe.experts.w13_weight", ".moe.gate_proj.weight", "w1"),
(".moe.experts.w13_weight", ".moe.up_proj.weight", "w3"),
(".moe.experts.w2_weight", ".moe.down_proj.weight", "w2"),
]
params_dict = dict(self.named_parameters())
loaded_params: set[str] = set()
for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
spec_layer = get_spec_layer_idx_from_weight_name(self.config, name)
if "embed_tokens" not in name and spec_layer is None:
continue
name = self._rewrite_spec_layer_name(spec_layer, name)
for param_name, weight_name, shard_id in stacked_params_mapping:
# Skip non-stacked layers and experts (experts handled below).
if weight_name not in name:
continue
# We have mlp.experts[0].gate_proj in the checkpoint.
# Since we handle the experts below in expert_params_mapping,
# we need to skip here BEFORE we update the name, otherwise
# name will be updated to mlp.experts[0].gate_up_proj, which
# will then be updated below in expert_params_mapping
# for mlp.experts[0].gate_gate_up_proj, which breaks load.
if ("mlp.experts." in name) and name not in params_dict:
continue
if "experts" in name or "moe" in name:
continue
name = name.replace(weight_name, param_name)
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
param = params_dict[name]
weight_loader = param.weight_loader
weight_loader(param, loaded_weight, shard_id)
break
else:
for mapping in expert_params_mapping:
param_name, weight_name, shard_id = mapping
if weight_name not in name:
continue
name = name.replace(weight_name, param_name)
# Skip loading extra bias for GPTQ models.
if (
name.endswith(".bias") or name.endswith("_bias")
) and name not in params_dict:
continue
param = params_dict[name]
weight_loader = param.weight_loader
for expert_id in range(loaded_weight.shape[0]):
loaded_weight_expert = loaded_weight[expert_id]
weight_loader(
param,
loaded_weight_expert,
name,
shard_id=shard_id,
expert_id=expert_id,
)
loaded_params.add(name)
break
else:
# Skip loading extra bias for GPTQ models.
if (
name.endswith(".bias")
and name not in params_dict
or "tok_embeddings" in name
):
continue
if spec_layer is not None and ".transformer." in name:
name = name.replace(".transformer.", ".")
if "shared_head" in name:
name = name.replace("shared_head.output", "shared_head.head")
if "embed_tokens" in name:
assert (
hasattr(self.config, "num_nextn_predict_layers")
and self.config.num_nextn_predict_layers > 0
)
name = "model.embed_tokens.weight"
param = params_dict[name]
weight_loader = getattr(
param, "weight_loader", default_weight_loader
)
weight_loader(param, loaded_weight)
loaded_params.add(name)
params_need_to_load = set(params_dict.keys())
# Some KV cache scales are optional: checkpoints may omit them and vLLM
# will fall back to default scales during initialization.
optional_params = {
name
for name, param in params_dict.items()
if name.endswith((".k_scale", ".v_scale", ".q_scale", ".prob_scale"))
and getattr(param, "numel", lambda: 0)() == 1
and getattr(param, "requires_grad", False) is False
}
params_need_to_load -= optional_params
if params_need_to_load != loaded_params:
missing_params = list(params_need_to_load - loaded_params)
param_name_example = missing_params[0]
raise RuntimeError(
"Some parameters like "
f"{param_name_example} are not in the checkpoint and will falsely "
"use random initialization"
)
return loaded_params
def _rewrite_spec_layer_name(self, spec_layer: int, name: str) -> str:
"""
Rewrite the weight name to match the format of the original model.
Add .mtp_block for modules in transformer layer block for spec layer
"""
spec_layer_weight_names = [
"embed_tokens",
"enorm",
"hnorm",
"eh_proj",
"shared_head",
]
spec_layer_weight = False
for weight_name in spec_layer_weight_names:
if weight_name in name:
spec_layer_weight = True
break
if not spec_layer_weight:
# treat rest weights as weights for transformer layer block
name = name.replace(
f"model.layers.{spec_layer}.", f"model.layers.{spec_layer}.mtp_block."
)
return name
...@@ -84,6 +84,10 @@ _REASONING_PARSERS_TO_REGISTER = { ...@@ -84,6 +84,10 @@ _REASONING_PARSERS_TO_REGISTER = {
"step3_reasoning_parser", "step3_reasoning_parser",
"Step3ReasoningParser", "Step3ReasoningParser",
), ),
"step3p5": (
"step3p5_reasoning_parser",
"Step3p5ReasoningParser",
),
} }
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.entrypoints.openai.responses.protocol import (
ResponsesRequest,
)
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
from vllm.tokenizers import TokenizerLike
class Step3p5ReasoningParser(BaseThinkingReasoningParser):
"""
Reasoning parser for Step3p5 model.
Step3p5 uses the <think>...</think> format, but it tends to emit an extra
newline immediately before and/or after the </think> token. This parser trims:
- the newline right before </think>
- the newline right after </think>
"""
@property
def start_token(self) -> str:
return "<think>"
@property
def end_token(self) -> str:
return "</think>"
def __init__(self, tokenizer: TokenizerLike, *args, **kwargs):
super().__init__(tokenizer, *args, **kwargs)
# Used to hold a trailing "\n" from reasoning content so we can decide
# whether it is immediately before </think>.
self._pending_reasoning_newline = False
# Used to delay the reasoning end detection.
# This is necessary to remove the newline appears immediately after </think>,
# which may cause the end detection to be delayed by one round.
self.end_offset = 1
def is_reasoning_end(self, input_ids: Sequence[int]) -> bool:
if self.end_token_id in input_ids and self.end_offset > 0:
self.end_offset -= 1
return False
return self.end_offset < 1
def is_reasoning_end_streaming(
self, input_ids: Sequence[int], delta_ids: Sequence[int]
) -> bool:
if self.end_token_id in input_ids and self.end_offset > 0:
self.end_offset -= 1
return False
return self.end_offset < 1
def extract_reasoning(
self,
model_output: str,
request: ChatCompletionRequest | ResponsesRequest,
) -> tuple[str | None, str | None]:
reasoning, content = super().extract_reasoning(model_output, request)
if reasoning is not None:
reasoning = reasoning.removesuffix("\n")
if content is not None:
content = content.removeprefix("\n")
return reasoning or None, content or None
def extract_reasoning_streaming(
self,
previous_text: str,
current_text: str,
delta_text: str,
previous_token_ids: Sequence[int],
current_token_ids: Sequence[int],
delta_token_ids: Sequence[int],
) -> DeltaMessage | None:
# Drop the immediate newline that models often emit after </think>.
if previous_text.endswith(self.end_token) and delta_text:
if delta_text == "\n":
return None
elif delta_text.startswith("\n"):
remaining = delta_text.removeprefix("\n")
return DeltaMessage(content=remaining) if remaining else None
ret = super().extract_reasoning_streaming(
previous_text,
current_text,
delta_text,
previous_token_ids,
current_token_ids,
delta_token_ids,
)
if ret is None:
return None
# Compatibility path for models that don't generate the start token:
# treat everything before </think> as reasoning and everything after
# as content.
if (
self.start_token_id not in previous_token_ids
and self.start_token_id not in delta_token_ids
):
if self.end_token_id in delta_token_ids:
end_index = delta_text.find(self.end_token)
reasoning = delta_text[:end_index]
content = delta_text[end_index + len(self.end_token) :]
ret = DeltaMessage(reasoning=reasoning, content=content or None)
elif self.end_token_id in previous_token_ids:
ret = DeltaMessage(content=delta_text)
else:
ret = DeltaMessage(reasoning=delta_text)
reasoning_to_output = ret.reasoning
content_to_output = ret.content
# Reasoning: handle the newline immediately before </think>.
if reasoning_to_output is not None:
if self._pending_reasoning_newline:
reasoning_to_output = "\n" + reasoning_to_output
self._pending_reasoning_newline = False
if reasoning_to_output.endswith("\n"):
reasoning_to_output = reasoning_to_output.removesuffix("\n")
if self.end_token in delta_text:
# Trailing "\n" is right before </think>, drop it.
self._pending_reasoning_newline = False
else:
# Hold the trailing "\n" until we know whether </think> follows.
self._pending_reasoning_newline = True
# Content: handle the newline immediately after </think>.
if content_to_output is not None:
# No need to get into parser again to remove newline after </think>.
self.end_offset -= 1
# If we have content, reasoning must have ended.
self._pending_reasoning_newline = False
if self.end_token in delta_text and content_to_output.startswith("\n"):
content_to_output = content_to_output.removeprefix("\n")
reasoning_to_output = reasoning_to_output or None
content_to_output = content_to_output or None
if reasoning_to_output is None and content_to_output is None:
return None
return DeltaMessage(reasoning=reasoning_to_output, content=content_to_output)
...@@ -134,6 +134,10 @@ _TOOL_PARSERS_TO_REGISTER = { ...@@ -134,6 +134,10 @@ _TOOL_PARSERS_TO_REGISTER = {
"step3_tool_parser", "step3_tool_parser",
"Step3ToolParser", "Step3ToolParser",
), ),
"step3p5": (
"step3p5_tool_parser",
"Step3p5ToolParser",
),
"xlam": ( "xlam": (
"xlam_tool_parser", "xlam_tool_parser",
"xLAMToolParser", "xLAMToolParser",
......
This diff is collapsed.
...@@ -96,6 +96,8 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( ...@@ -96,6 +96,8 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
ultravox="UltravoxConfig", ultravox="UltravoxConfig",
step3_vl="Step3VLConfig", step3_vl="Step3VLConfig",
step3_text="Step3TextConfig", step3_text="Step3TextConfig",
step3p5="Step3p5Config",
qwen3_asr="Qwen3ASRConfig",
qwen3_next="Qwen3NextConfig", qwen3_next="Qwen3NextConfig",
lfm2_moe="Lfm2MoeConfig", lfm2_moe="Lfm2MoeConfig",
tarsier2="Tarsier2Config", tarsier2="Tarsier2Config",
......
...@@ -50,6 +50,8 @@ _CLASS_TO_MODULE: dict[str, str] = { ...@@ -50,6 +50,8 @@ _CLASS_TO_MODULE: dict[str, str] = {
"Step3VLConfig": "vllm.transformers_utils.configs.step3_vl", "Step3VLConfig": "vllm.transformers_utils.configs.step3_vl",
"Step3VisionEncoderConfig": "vllm.transformers_utils.configs.step3_vl", "Step3VisionEncoderConfig": "vllm.transformers_utils.configs.step3_vl",
"Step3TextConfig": "vllm.transformers_utils.configs.step3_vl", "Step3TextConfig": "vllm.transformers_utils.configs.step3_vl",
"Step3p5Config": "vllm.transformers_utils.configs.step3p5",
"Qwen3ASRConfig": "vllm.transformers_utils.configs.qwen3_asr",
"Qwen3NextConfig": "vllm.transformers_utils.configs.qwen3_next", "Qwen3NextConfig": "vllm.transformers_utils.configs.qwen3_next",
"Tarsier2Config": "vllm.transformers_utils.configs.tarsier2", "Tarsier2Config": "vllm.transformers_utils.configs.tarsier2",
# Special case: DeepseekV3Config is from HuggingFace Transformers # Special case: DeepseekV3Config is from HuggingFace Transformers
...@@ -90,6 +92,8 @@ __all__ = [ ...@@ -90,6 +92,8 @@ __all__ = [
"Step3VLConfig", "Step3VLConfig",
"Step3VisionEncoderConfig", "Step3VisionEncoderConfig",
"Step3TextConfig", "Step3TextConfig",
"Step3p5Config",
"Qwen3ASRConfig",
"Qwen3NextConfig", "Qwen3NextConfig",
"Tarsier2Config", "Tarsier2Config",
] ]
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
from transformers.configuration_utils import PretrainedConfig
class Step3p5Config(PretrainedConfig):
model_type = "step3p5"
def __init__(
self,
hidden_size: int = 5120,
intermediate_size: int = 13312,
num_attention_heads: int = 40,
num_attention_groups: int = 8,
num_hidden_layers: int = 48,
max_seq_len: int = 4096,
vocab_size: int = 65536,
rms_norm_eps: float = 1e-5,
moe_every_n_layer: int = 2,
use_moe: bool = False,
moe_intermediate_size: int = 10240,
moe_num_experts: int = 16,
moe_top_k: int = 4,
moe_layer_offset: int = 0,
rope_theta: float | list[float] | None = 500000,
rope_scaling: dict[str, Any] | None = None,
head_dim: int | None = None,
share_expert_dim: int | None = None,
norm_expert_weight: bool = True,
bos_token_id: list[int] | int | None = None,
eos_token_id: list[int] | int | None = None,
moe_router_activation: str = "softmax",
moe_router_scaling_factor: float = 1.0,
att_impl_type: str = "GQA",
use_head_wise_attn_gate: bool = False,
use_moe_router_bias: bool = True,
need_fp32_gate: bool = True,
layer_types: list[str] | None = None,
use_rope_layers: list[bool] | None = None,
yarn_only_types: list[str] | None = None,
attention_other_setting: dict[str, Any] | None = None,
num_nextn_predict_layers: int = 0,
swiglu_limits: list[float] | None = None,
swiglu_limits_shared: list[float] | None = None,
max_position_embeddings: int | None = None,
**kwargs,
):
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_attention_heads = num_attention_heads
self.num_attention_groups = num_attention_groups
self.num_hidden_layers = num_hidden_layers
self.max_seq_len = max_seq_len
self.vocab_size = vocab_size
self.rms_norm_eps = rms_norm_eps
self.use_moe = use_moe
self.moe_intermediate_size = moe_intermediate_size
self.moe_every_n_layer = moe_every_n_layer
self.moe_num_experts = moe_num_experts
self.num_experts_per_tok = moe_top_k
self.moe_top_k = moe_top_k
self.moe_layer_offset = moe_layer_offset
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.head_dim = head_dim
if share_expert_dim is None:
self.share_expert_dim = self.moe_intermediate_size * self.moe_top_k
else:
self.share_expert_dim = share_expert_dim
self.norm_expert_weight = norm_expert_weight
self.max_position_embeddings = max_position_embeddings
self.moe_router_activation = moe_router_activation
self.moe_router_scaling_factor = moe_router_scaling_factor
self.use_moe_router_bias = use_moe_router_bias
self.need_fp32_gate = need_fp32_gate
self.att_impl_type = att_impl_type
self.use_head_wise_attn_gate = use_head_wise_attn_gate
self.layer_types = layer_types
self.use_rope_layers = use_rope_layers
self.yarn_only_types = yarn_only_types
self.attention_other_setting = attention_other_setting
self.num_nextn_predict_layers = num_nextn_predict_layers
self.swiglu_limits = swiglu_limits
self.swiglu_limits_shared = swiglu_limits_shared
resolved_bos_token_id = 1 if bos_token_id is None else bos_token_id
resolved_eos_token_id = [2, 3] if eos_token_id is None else eos_token_id
self.bos_token_id = resolved_bos_token_id
self.eos_token_id = resolved_eos_token_id
super().__init__(
bos_token_id=resolved_bos_token_id,
eos_token_id=resolved_eos_token_id,
**kwargs,
)
...@@ -263,18 +263,6 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad ...@@ -263,18 +263,6 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
vllm_config: "VllmConfig", vllm_config: "VllmConfig",
kv_cache_spec: "AttentionSpec", kv_cache_spec: "AttentionSpec",
) -> AttentionCGSupport: ) -> AttentionCGSupport:
# FA2 does not support CUDA graphs with encoder-decoder models due to
# accuracy issues reported in https://github.com/vllm-project/vllm/issues/33091
if (
vllm_config.model_config.is_encoder_decoder
and get_flash_attn_version() == 2
):
logger.warning_once(
"FlashAttention2 does not support CUDA graphs with "
"encoder-decoder models due to accuracy issues reported in #33091. "
"Disabling CUDA graph."
)
return AttentionCGSupport.NEVER
return cls._cudagraph_support return cls._cudagraph_support
def __init__( def __init__(
......
...@@ -479,6 +479,16 @@ class HybridKVCacheCoordinator(KVCacheCoordinator): ...@@ -479,6 +479,16 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
hit_length = max_cache_hit_length hit_length = max_cache_hit_length
hit_blocks_by_group: list[list[KVCacheBlock] | None] = [None] * num_groups hit_blocks_by_group: list[list[KVCacheBlock] | None] = [None] * num_groups
# Simple hybrid (1 full attn + 1 other): one iteration suffices.
# Full attn is always first if it exists. This avoids EAGLE drops
# being applied multiple times to non-full-attn groups.
# FIXME (yifan): However, for complex hybrid models with multiple attn
# groups, we still have the EAGLE spiral block dropping problem. See
# discussion in issue https://github.com/vllm-project/vllm/issues/32802.
is_simple_hybrid = len(self.attention_groups) == 2 and isinstance(
self.attention_groups[0][0], FullAttentionSpec
)
while True: while True:
curr_hit_length = hit_length curr_hit_length = hit_length
...@@ -495,10 +505,6 @@ class HybridKVCacheCoordinator(KVCacheCoordinator): ...@@ -495,10 +505,6 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
# the last iteration. # the last iteration.
num_blocks = curr_hit_length // spec.block_size num_blocks = curr_hit_length // spec.block_size
curr_hit_length = num_blocks * spec.block_size curr_hit_length = num_blocks * spec.block_size
for group_id in group_ids:
blocks = hit_blocks_by_group[group_id]
assert blocks is not None
del blocks[num_blocks:]
else: else:
hit_blocks = manager_cls.find_longest_cache_hit( hit_blocks = manager_cls.find_longest_cache_hit(
block_hashes=_get_block_hashes(spec), block_hashes=_get_block_hashes(spec),
...@@ -513,10 +519,20 @@ class HybridKVCacheCoordinator(KVCacheCoordinator): ...@@ -513,10 +519,20 @@ class HybridKVCacheCoordinator(KVCacheCoordinator):
for group_id, blocks in zip(group_ids, hit_blocks): for group_id, blocks in zip(group_ids, hit_blocks):
hit_blocks_by_group[group_id] = blocks hit_blocks_by_group[group_id] = blocks
if curr_hit_length < hit_length: if curr_hit_length >= hit_length:
hit_length = curr_hit_length
else:
break break
hit_length = curr_hit_length
# Simple hybrid: exit after one iteration
if is_simple_hybrid:
break
# Truncate full attention blocks to final hit_length (if present)
spec, group_ids, _ = self.attention_groups[0]
if isinstance(spec, FullAttentionSpec):
num_blocks = hit_length // spec.block_size
for group_id in group_ids:
if (blks := hit_blocks_by_group[group_id]) is not None:
del blks[num_blocks:]
return tuple( return tuple(
blocks if blocks is not None else [] for blocks in hit_blocks_by_group blocks if blocks is not None else [] for blocks in hit_blocks_by_group
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment