[models] support step3v

583034f1 · zhuwenwen · 0adf9cda · 583034f1 · 583034f1 · 583034f1
Commit 583034f1 authored Oct 20, 2025 by zhuwenwen
20 changed files
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -3418,6 +3418,8 @@ def _get_and_verify_max_len(
    possible_keys = [
        # OPT
        "max_position_embeddings",
+        # step3
+        "max_position_embedding",
        # GPT-2
        "n_positions",
        # MPT
@@ -3491,7 +3493,13 @@ def _get_and_verify_max_len(
        # loading HF config
        rope_type = rope_scaling["rope_type"]
        
-        if rope_type not in ("su", "longrope", "llama3"):
+        if rope_type == "ntk_bypart":
+            derived_max_model_len = min(
+                derived_max_model_len,
+                rope_scaling["real_length"] * rope_scaling["scaling_factor"]
+            ) if "real_length" in rope_scaling and "scaling_factor" in rope_scaling else derived_max_model_len
+
+        elif rope_type not in ("su", "longrope", "llama3"):
            if disable_sliding_window:
                # TODO(robertgshaw): Find a model that supports rope_scaling
                # with sliding window to see if this case should be allowed.
@@ -3548,6 +3556,8 @@ def _get_and_verify_max_len(
                logger.warning(
                    "%s Make sure the value is correct and within the "
                    "model context size.", msg)
+                if getattr(hf_config, "max_position_embedding", None) is not None:  # step3/3v
+                    hf_config.max_position_embedding = max_model_len
            else:
                raise ValueError(
                    f"{msg} To allow overriding this maximum, set "

--- a/vllm/entrypoints/openai/tool_parsers/__init__.py
+++ b/vllm/entrypoints/openai/tool_parsers/__init__.py
@@ -36,4 +36,7 @@ __all__ = [
    "xLAMToolParser", 
    "MinimaxToolParser",
    "Glm4MoeModelToolParser",
+    "Step1p5vMini2ToolParser", 
+    "Step1p5vMini2MsToolParser", 
+    "Step3ToolParser",
 ]
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -3,6 +3,7 @@
 """Custom activation functions."""
 import math
 from typing import Optional
+import optimus

 import torch
 import torch.nn as nn
@@ -53,6 +54,14 @@ class FatreluAndMul(CustomOp):
        return out


+class OptimusSiluAndMul(nn.Module):
+    """Activation module that delegates to the Optimus ``SiluDot_forward`` op.
+
+    NOTE(review): the class name suggests a fused SiLU-and-multiply
+    activation; the kernel's exact semantics are not visible here.
+    """
+
+    def forward(self,
+                x: torch.Tensor,
+                output: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Run the Optimus kernel on ``x``, forwarding ``output`` as ``out``."""
+        silu_dot = torch.ops.Optimus.SiluDot_forward
+        return silu_dot(x, out=output)
+    
+    
 @CustomOp.register("silu_and_mul")
 class SiluAndMul(CustomOp):
    """An activation function for SwiGLU.

--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Custom normalization layers."""
 from typing import Optional, Union, Tuple
+import optimus  # noqa F401

 import torch
 import torch.nn as nn
@@ -298,6 +299,49 @@ class RMSNorm(CustomOp):
        return s


+class OptimusRMSNorm(nn.Module):
+    """RMS normalization layer with a learnable per-channel weight.
+
+    With ``residual`` the fused ``fused_add_rms_norm`` custom op is used;
+    otherwise the result comes from ``torch.nn.functional.rms_norm``.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+        output: Optional[torch.Tensor] = None,
+        fp16_out: bool = False,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """Normalize ``x`` over its trailing ``hidden_size`` dimension.
+
+        Args:
+            x: Input tensor whose last dimension matches ``weight``.
+            residual: Optional residual. When given, ``x`` and ``residual``
+                are handed to the fused op (which is expected to update them
+                in place) and ``(x, residual)`` is returned. ``output`` and
+                ``fp16_out`` are not supported on this path.
+            output: Optional destination tensor for the non-residual path.
+            fp16_out: Return the non-residual result as ``torch.float16``.
+
+        Returns:
+            The normalized tensor, or ``(x, residual)`` on the fused path.
+        """
+        if residual is not None:
+            assert output is None
+            from vllm import _custom_ops as ops
+
+            assert not fp16_out
+            ops.fused_add_rms_norm(
+                x,
+                residual,
+                self.weight.data,
+                self.variance_epsilon,
+            )
+            return x, residual
+
+        # F.rms_norm takes (input, normalized_shape, weight, eps) and has no
+        # ``out=`` argument, so an explicit destination is filled afterwards.
+        normed = torch.nn.functional.rms_norm(
+            x,
+            self.weight.shape,
+            self.weight,
+            self.variance_epsilon,
+        )
+        if fp16_out:
+            normed = normed.half()
+            if output is not None:
+                output = output.half()
+        if output is None:
+            return normed
+        output.copy_(normed)
+        return output
+
 @CustomOp.register("gemma_rms_norm")
 class GemmaRMSNorm(CustomOp):
    """RMS normalization for Gemma.
@@ -363,3 +407,35 @@ class GemmaRMSNorm(CustomOp):
                self.forward_static)
            self._is_compiled = True
        return self.forward_native(x, residual)
+    
+    
+class OptimusLayerNorm(nn.Module):
+    """Layer normalization with learnable weight and bias.
+
+    A residual input is not supported; ``forward`` asserts it is ``None``.
+    """
+
+    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.bias = nn.Parameter(torch.zeros(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self,
+                x: torch.Tensor,
+                residual: Optional[torch.Tensor] = None,
+                output: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Normalize ``x`` over its trailing ``hidden_size`` dimension.
+
+        Args:
+            x: Input tensor whose last dimension matches ``weight``.
+            residual: Must be ``None``.
+            output: Optional destination tensor; when given it receives the
+                result and is returned.
+        """
+        assert residual is None
+        normed = torch.nn.functional.layer_norm(
+            x,
+            self.weight.shape,  # normalized_shape must be the weight's shape
+            self.weight,
+            self.bias,
+            eps=self.variance_epsilon,
+        )
+        if output is None:
+            return normed
+        # F.layer_norm has no ``out=`` argument; fill the caller's buffer so
+        # the ``output`` parameter is honoured instead of silently ignored.
+        output.copy_(normed)
+        return output
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -3,7 +3,7 @@

 import itertools
 from abc import abstractmethod
-from typing import Any, Literal, Optional, Union
+from typing import Any, Literal, Optional, Union, List
 import vllm.envs as envs
 import torch
 import torch.nn as nn
@@ -269,6 +269,40 @@ class UnquantizedLinearMethod(LinearMethodBase):
                return dispatch_unquantized_gemm()(x, layer.weight, bias)


+class UnquantizedMoELinearMethod(LinearMethodBase):
+    """Linear method for MoE layers whose expert weights are not quantized."""
+
+    def __init__(self):
+        # There is no quantization config for this method.
+        self.quant_config = None
+
+    def create_weights(self,
+                       layer: torch.nn.Module,
+                       input_size_per_partition: int,
+                       output_partition_sizes: List[int],
+                       input_size: int,
+                       output_size: int,
+                       params_dtype: torch.dtype,
+                       num_experts: Optional[int] = None,
+                       **extra_weight_attrs):
+        """Allocate the stacked expert weight and register it on ``layer``.
+
+        The parameter is created uninitialized on the current CUDA device
+        with shape ``(num_experts, sum(output_partition_sizes),
+        input_size_per_partition)`` and registered under the name ``weight``.
+        ``extra_weight_attrs`` are attached to it as weight attributes.
+        """
+        stacked_shape = (
+            num_experts,
+            sum(output_partition_sizes),
+            input_size_per_partition,
+        )
+        storage = torch.empty(stacked_shape,
+                              device=torch.cuda.current_device(),
+                              dtype=params_dtype)
+        expert_weight = Parameter(storage, requires_grad=False)
+        set_weight_attrs(expert_weight, {"input_dim": 2, "output_dim": 1})
+        layer.register_parameter("weight", expert_weight)
+        set_weight_attrs(expert_weight, extra_weight_attrs)
+
+    def apply(self,
+              layer: torch.nn.Module,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Not supported: this method only creates weights."""
+        raise NotImplementedError
+    
+    
 class LinearBase(torch.nn.Module):
    """Base linear layer.

@@ -783,6 +817,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                param.shard_id.append(loaded_shard_id)
                param.shard_id_map[loaded_shard_id] = len(param.data_container)
                param.data_container.append(loaded_weight)
+                if len(param.data_container) == 2:
+                    self.qweight = param.materialize_nested()
                return

        param_data = param.data
@@ -986,6 +1022,175 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                                        shard_offset=shard_offset,
                                        shard_size=shard_size)

+
+class MergedColumnParallelMoELinear(MergedColumnParallelLinear):
+    """Merged column-parallel linear layer holding one weight per expert.
+
+    Each entry of ``output_sizes`` is split evenly across the tensor-parallel
+    group. Without a quantization config the layer only stores weights and
+    ``forward`` performs no computation.
+    """
+
+    def __init__(self,
+                 num_experts: int,
+                 input_size: int,
+                 output_sizes: List[int],
+                 params_dtype: Optional[torch.dtype] = None,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = ""):
+        # Only nn.Module state is initialised here; the parent constructors
+        # are bypassed and the layer attributes are set up by hand below.
+        torch.nn.Module.__init__(self)
+        self.num_experts = num_experts
+        self.output_sizes = output_sizes
+        self.input_size = input_size
+        self.output_size = sum(output_sizes)
+        tp_size = get_tensor_model_parallel_world_size()
+        assert all(size % tp_size == 0 for size in output_sizes)
+        self.output_size_per_partition = divide(self.output_size, tp_size)
+        self.output_partition_sizes = [
+            divide(size, tp_size) for size in self.output_sizes
+        ]
+        self.gather_output = False
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.params_dtype = params_dtype
+        if quant_config is None:
+            self.quant_method = UnquantizedMoELinearMethod()
+        else:
+            self.quant_method = quant_config.get_quant_method(self,
+                                                              prefix=prefix)
+            # FIXME(ys): hack for moe
+            if isinstance(self.quant_method, UnquantizedLinearMethod):
+                self.quant_method = UnquantizedMoELinearMethod()
+
+        assert self.quant_method is not None
+        self.quant_method.create_weights(self,
+                                         self.input_size,
+                                         self.output_partition_sizes,
+                                         self.input_size,
+                                         self.output_size,
+                                         self.params_dtype,
+                                         self.num_experts,
+                                         weight_loader=self.weight_loader)
+        self.register_parameter("bias", None)
+
+    def forward(self,
+                input_,
+                output: Optional[torch.Tensor] = None,
+                expert_idx: int = -1):
+        """Apply the quantized method for one expert.
+
+        Returns ``None`` when the layer is unquantized; in that case the
+        computation is done outside this module by the Optimus ``moe_ffn``
+        path. Otherwise returns whatever ``quant_method.apply`` produces.
+        """
+        if isinstance(self.quant_method, UnquantizedMoELinearMethod):
+            # use optimus moe_ffn outside
+            return
+        bias = None
+        assert self.quant_method is not None
+
+        output = self.quant_method.apply(self,
+                                         input_,
+                                         bias,
+                                         expert_idx=expert_idx,
+                                         output=output)
+        return output
+
+
+class QKVReplicatedLinear(ReplicatedLinear):
+    """Replicated (non-sharded) linear layer producing fused Q, K and V.
+
+    The output is laid out as ``[q | k | v]`` with widths
+    ``num_heads * head_size``, ``num_kv_heads * head_size`` and
+    ``num_kv_heads * head_size``.
+    """
+
+    def __init__(self,
+                 hidden_size: int,
+                 head_size: int,
+                 total_num_heads: int,
+                 total_num_kv_heads: Optional[int] = None,
+                 bias: bool = True,
+                 skip_bias_add: bool = False,
+                 params_dtype: Optional[torch.dtype] = None,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = "",
+                 return_bias: bool = True):
+
+        # Only nn.Module state is initialised; parent constructors are
+        # bypassed and the attributes are set by hand below.
+        nn.Module.__init__(self)
+        self.hidden_size = hidden_size
+        self.head_size = head_size
+        self.num_heads = total_num_heads
+        # A missing (or zero) KV head count falls back to the Q head count.
+        self.num_kv_heads = total_num_kv_heads if total_num_kv_heads else total_num_heads
+        self.input_size = self.hidden_size
+        self.output_size = (self.num_heads +
+                            2 * self.num_kv_heads) * self.head_size
+        self.skip_bias_add = skip_bias_add
+        self.return_bias = return_bias
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
+        self.params_dtype = params_dtype
+        if quant_config is None:
+            self.quant_method: Optional[
+                QuantizeMethodBase] = UnquantizedLinearMethod()
+        else:
+            self.quant_method = quant_config.get_quant_method(self,
+                                                              prefix=prefix)
+
+        assert self.quant_method is not None
+        self.quant_method.create_weights(self,
+                                         self.input_size, [self.output_size],
+                                         self.input_size,
+                                         self.output_size,
+                                         self.params_dtype,
+                                         weight_loader=self.weight_loader)
+        if bias:
+            self.bias = Parameter(
+                torch.empty(self.output_size, dtype=self.params_dtype))
+            set_weight_attrs(self.bias, {
+                "output_dim": 0,
+                "weight_loader": self.weight_loader
+            })
+        else:
+            self.register_parameter("bias", None)
+
+    def weight_loader(self,
+                      param: Parameter,
+                      loaded_weight: torch.Tensor,
+                      loaded_shard_id: Optional[str] = None):
+        """Copy a checkpoint tensor into ``param``.
+
+        Args:
+            param: Destination parameter (weight or bias).
+            loaded_weight: Tensor read from the checkpoint.
+            loaded_shard_id: ``"q"``, ``"k"`` or ``"v"`` to fill only that
+                slice of the fused parameter, or ``None`` when the
+                checkpoint tensor is already fused.
+        """
+        param_data = param.data
+        output_dim = getattr(param, "output_dim", None)
+        is_quantization = not isinstance(self.quant_method,
+                                         UnquantizedLinearMethod)
+        if loaded_shard_id is None:
+            # Loaded weight is already packed.
+            assert param_data.shape == loaded_weight.shape
+            param_data.copy_(loaded_weight)
+            return
+
+        assert loaded_shard_id in ["q", "k", "v"]
+        if output_dim is not None:
+            q_width = self.num_heads * self.head_size
+            kv_width = self.num_kv_heads * self.head_size
+            # (offset, size) of each projection inside the fused output.
+            shard_layout = {
+                "q": (0, q_width),
+                "k": (q_width, kv_width),
+                "v": (q_width + kv_width, kv_width),
+            }
+            shard_offset, shard_size = shard_layout[loaded_shard_id]
+            # If quantized, we need to adjust the offset and size to account
+            # for the packing.
+            packed_dim = getattr(param, "packed_dim", None)
+            if packed_dim == output_dim:
+                shard_size = shard_size // param.pack_factor
+                shard_offset = shard_offset // param.pack_factor
+
+            # With VLLM_USE_NN and no quantization the shard is narrowed
+            # along the other axis of the 2-D weight (presumably because it
+            # is stored transposed - TODO confirm). A 1-D parameter such as
+            # the bias has no other axis, so it always uses ``output_dim``;
+            # flipping it would index a non-existent dimension.
+            flip_axis = (envs.VLLM_USE_NN and not is_quantization
+                         and param_data.dim() > 1)
+            narrow_dim = int(not output_dim) if flip_axis else output_dim
+            param_data = param_data.narrow(narrow_dim, shard_offset,
+                                           shard_size)
+        else:
+            ignore_warning = getattr(param, "ignore_warning", False)
+            if not ignore_warning:
+                logger.warning(
+                    "Loading a weight without `output_dim` attribute in "
+                    "QKVReplicatedLinear, assume the weight is the same "
+                    "for all partitions.")
+
+        if envs.VLLM_USE_NN and not is_quantization:
+            # ``t()`` leaves tensors with fewer than two dimensions unchanged.
+            loaded_weight = loaded_weight.t()
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+        
+        
 class QKVParallelLinear(ColumnParallelLinear):
    """Linear layers for the attention's QKV transformation.

@@ -1185,6 +1390,8 @@ class QKVParallelLinear(ColumnParallelLinear):
                param.shard_id.append(loaded_shard_id)
                param.shard_id_map[loaded_shard_id] = len(param.data_container)
                param.data_container.append(loaded_weight)
+                if len(param.data_container) == 3:
+                    self.qweight = param.materialize_nested()
                return

        param_data = param.data
@@ -1495,7 +1702,7 @@ class RowParallelLinear(LinearBase):

    def forward(
        self, input_,
-        use_fused_silu_mul_quant: Optional[bool] = False
+        use_fused_silu_mul_quant: Optional[bool] = False,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
        if self.input_is_parallel:
            input_parallel = input_
@@ -1758,3 +1965,62 @@ class QKVCrossParallelLinear(LinearBase):
        s += f", tp_size={get_tensor_model_parallel_world_size()}"
        s += ", gather_output=False"
        return s
+    
+    
+class RowParallelMoELinear(RowParallelLinear):
+    """Row-parallel linear layer holding one weight per expert.
+
+    The input dimension is split across the tensor-parallel group and
+    ``reduce_results`` is disabled. Without a quantization config the layer
+    only stores weights and ``forward`` performs no computation.
+    """
+
+    def __init__(self,
+                 num_experts: int,
+                 input_size: int,
+                 output_size: int,
+                 params_dtype: Optional[torch.dtype] = None,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = ""):
+        # Only nn.Module state is initialised; parent constructors are
+        # bypassed and the attributes are set by hand below.
+        torch.nn.Module.__init__(self)
+        self.num_experts = num_experts
+        self.input_size = input_size
+        self.output_size = output_size
+        self.reduce_results = False
+        self.params_dtype = (torch.get_default_dtype()
+                             if params_dtype is None else params_dtype)
+        if quant_config is not None:
+            self.quant_method = quant_config.get_quant_method(self,
+                                                              prefix=prefix)
+            # FIXME(ys): hack for moe
+            if isinstance(self.quant_method, UnquantizedLinearMethod):
+                self.quant_method = UnquantizedMoELinearMethod()
+        else:
+            self.quant_method: Optional[
+                QuantizeMethodBase] = UnquantizedMoELinearMethod()
+
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.input_size_per_partition = divide(input_size, self.tp_size)
+        assert self.quant_method is not None
+        self.quant_method.create_weights(
+            self,
+            self.input_size_per_partition,
+            [self.output_size],
+            self.input_size,
+            self.output_size,
+            self.params_dtype,
+            self.num_experts,
+            weight_loader=self.weight_loader,
+        )
+        self.register_parameter("bias", None)
+
+    def forward(  # type: ignore[override]
+            self,
+            input_,
+            residual=None,
+            expert_idx: int = -1,
+            output: Optional[torch.Tensor] = None):
+        """Apply the quantized method for one expert.
+
+        Returns ``None`` when the layer is unquantized; the computation is
+        then done outside this module by the Optimus ``moe_ffn`` path.
+        ``residual`` is accepted but not used here.
+        """
+        if isinstance(self.quant_method, UnquantizedMoELinearMethod):
+            # use optimus moe_ffn outside
+            return None
+        assert self.quant_method is not None
+        return self.quant_method.apply(self,
+                                       input_,
+                                       None,
+                                       expert_idx=expert_idx,
+                                       output=output)
\ No newline at end of file
--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -36,7 +36,8 @@ class LogitsProcessor(nn.Module):
                 org_vocab_size: Optional[int] = None,
                 scale: float = 1.0,
                 logits_as_input: bool = False,
-                 soft_cap: Optional[float] = None) -> None:
+                 soft_cap: Optional[float] = None,
+                 need_fp32_logits: bool = False) -> None:
        """
        Args:
            scale: A scaling factor to apply to the logits.
@@ -52,6 +53,7 @@ class LogitsProcessor(nn.Module):
        self.soft_cap = soft_cap
        # Whether to use gather or all-gather to gather the logits.
        self.use_all_gather = current_platform.use_all_gather()
+        self.need_fp32_logits = need_fp32_logits

    def forward(
        self,
@@ -106,6 +108,10 @@ class LogitsProcessor(nn.Module):
        embedding_bias: Optional[torch.Tensor],
    ) -> Optional[torch.Tensor]:
        # Get the logits for the next tokens.
+        if self.need_fp32_logits:
+            logits = torch.ops.OptimusMoe.matmul_fp32(hidden_states,
+                                                      lm_head.weight.t())
+        else:
            logits = lm_head.quant_method.apply(lm_head,
                                                hidden_states,
                                                bias=embedding_bias)

--- a/vllm/model_executor/layers/quantization/groupwise_quant.py
+++ b/vllm/model_executor/layers/quantization/groupwise_quant.py
--- a/vllm/model_executor/layers/quantization/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/quant_utils.py
+import torch
+
+
+@torch.jit.script
+def cal_scale(amax, fp_max, scale):
+    """Derive a power-of-two scale from ``amax`` and return it with its inverse.
+
+    The scale is ``2 ** floor(log2(fp_max / amax))``. Where ``amax`` is not
+    strictly positive or not finite, the incoming ``scale`` is used in place
+    of the power-of-two factor.
+
+    Returns:
+        ``(scale, 1 / scale)``.
+    """
+    margin = 0
+    exponent = torch.floor(torch.log2(fp_max / amax)) - margin
+    pow2 = torch.round(torch.pow(2, torch.abs(exponent)))
+    usable = torch.logical_and(amax > 0.0, torch.isfinite(amax))
+    factor = torch.where(usable, pow2, scale)
+    # A negative exponent means amax exceeds fp_max, so the scale shrinks.
+    new_scale = torch.where(exponent < 0, 1 / factor, factor)
+    return new_scale, torch.reciprocal(new_scale)
+
+
+# Registry of the single instance created for each decorated class.
+instances = {}
+
+
+def singleton(cls):
+    """Class decorator that makes ``cls`` construct at most one instance.
+
+    The first call builds the instance with the given arguments; every later
+    call returns that same object and ignores its arguments. The returned
+    factory keeps the class's name and docstring and exposes the class
+    itself as ``__wrapped__``.
+    """
+    import functools
+
+    # ``updated=()`` copies only the metadata, not the class ``__dict__``.
+    @functools.wraps(cls, updated=())
+    def get_instance(*args, **kwargs):
+        # ``instances`` is resolved at call time, so a registry rebound by
+        # reset_singleton() is picked up here.
+        if cls not in instances:
+            instances[cls] = cls(*args, **kwargs)
+        return instances[cls]
+
+    return get_instance
+
+
+def reset_singleton():
+    """Discard every cached singleton by rebinding the registry to a new dict."""
+    global instances
+    instances = dict()
+
+
+@singleton
+class QuantFp8:
+    """FP8 (e4m3) quantization helper built once via ``@singleton``.
+
+    NOTE(review): because of the decorator, the first ``device`` passed wins
+    for the lifetime of the registry.
+    """
+
+    def __init__(self, device):
+        self.fp_max = torch.tensor([448.0], device=device)
+        self.device = device
+        self.scale = torch.tensor([1.0], device=self.device)
+
+    @staticmethod
+    def quantize_v1(weight, bits):
+        """Quantize ``weight`` to ``float8_e4m3fn`` with pure PyTorch ops.
+
+        Returns ``(qweight, inverse_scale)``. Only ``bits == 8`` is
+        supported; anything else raises ``ValueError``.
+        """
+        if bits != 8:
+            raise ValueError(f"Unsupported bit width: {bits}")
+
+        amax = weight.abs().max()
+        fp_max = torch.tensor([448.0]).to(weight.device)
+        margin = 0
+        unit = torch.tensor([1.0]).to(weight.device)
+
+        exponent = torch.floor(torch.log2(fp_max / amax)) - margin
+        factor = torch.round(torch.pow(2, torch.abs(exponent)))
+        factor = torch.where(amax > 0.0, factor, unit)
+        factor = torch.where(torch.isfinite(amax), factor, unit)
+        scale = torch.where(exponent < 0, 1 / factor, factor)
+
+        as_fp32 = weight.to(torch.float32)
+        qweight = (as_fp32 * scale).to(torch.float8_e4m3fn)
+        return qweight, torch.reciprocal(scale)
+
+    def quantize(self, weight, bits, weight_scale, use_offline_input_scales):
+        """Quantize ``weight`` to ``float8_e4m3fn`` using the OptimusFp8 ops.
+
+        ``weight_scale`` is used as the scale only when it is given and
+        ``use_offline_input_scales`` is true; otherwise the scale is derived
+        from the tensor's absolute maximum. Returns
+        ``(qweight, inverse_scale)``; raises ``ValueError`` unless
+        ``bits == 8``.
+        """
+        if bits != 8:
+            raise ValueError(f"Unsupported bit width: {bits}")
+
+        # abs_max_nan_to_inf is expected to write its result into ``amax``.
+        amax = torch.empty(1, dtype=torch.float32, device=self.device)
+        unit = torch.tensor([1.0], device=self.device)
+        torch.ops.OptimusFp8.abs_max_nan_to_inf(weight, amax)
+        if weight_scale is not None and use_offline_input_scales:
+            scale, scale_inv = weight_scale, torch.reciprocal(weight_scale)
+        else:
+            scale, scale_inv = cal_scale(amax, self.fp_max, unit)
+
+        qweight = torch.ops.OptimusFp8.quantize(weight, scale, None,
+                                                torch.float8_e4m3fn)
+        return qweight, scale_inv
+
+    def get_quant_scale(self, tensor):
+        """Return the scale ``cal_scale`` derives from ``tensor``'s abs-max."""
+        amax = torch.empty(1, dtype=torch.float32, device=tensor.device)
+        torch.ops.OptimusFp8.abs_max_nan_to_inf(tensor, amax)
+        derived_scale, _unused_inverse = cal_scale(amax, self.fp_max,
+                                                   self.scale)
+        return derived_scale
+
+
+def quantize(weight, bits, weight_scale=None, use_offline_input_scales=True):
+    """Quantize ``weight`` through the shared ``QuantFp8`` instance.
+
+    Returns whatever ``QuantFp8.quantize`` returns for the same arguments.
+    """
+    quantizer = QuantFp8(weight.device)
+    return quantizer.quantize(
+        weight,
+        bits,
+        weight_scale,
+        use_offline_input_scales,
+    )
+
+
+def dequant(weight, weight_scales):
+    """Dequantize ``weight`` to bfloat16 via the ``OptimusFp8.dequantize`` op."""
+    target_dtype = torch.bfloat16
+    return torch.ops.OptimusFp8.dequantize(weight, weight_scales,
+                                           target_dtype)
+
+
+def experts_dequant(weights, weight_scales):
+    """Dequantize each slice along dim 0 of ``weights`` to bfloat16.
+
+    Slice ``i`` is dequantized with ``weight_scales[i]`` and written into a
+    new bfloat16 tensor of the same shape and device as ``weights``.
+    """
+    num_experts = weights.shape[0]
+    dequantized = torch.empty(*weights.shape,
+                              device=weights.device,
+                              dtype=torch.bfloat16)
+    expert = 0
+    while expert < num_experts:
+        dequantized[expert] = dequant(weights[expert], weight_scales[expert])
+        expert += 1
+    return dequantized
+
+
+def experts_quantize(weight, bits):
+    """Quantize each slice along dim 0 of ``weight`` to ``float8_e4m3fn``.
+
+    Returns ``(qweight_experts, scales)``: the quantized tensor with the
+    shape of ``weight`` and a float32 vector holding one scale per slice, as
+    returned by ``quantize``. Raises ``ValueError`` unless ``bits == 8``.
+    """
+    if bits != 8:
+        raise ValueError(f"Unsupported bit width: {bits}")
+
+    num_experts = weight.shape[0]
+    qweight_experts = torch.empty(*weight.shape,
+                                  dtype=torch.float8_e4m3fn,
+                                  device=weight.device)
+    scales = torch.empty(num_experts,
+                         dtype=torch.float32,
+                         device=weight.device)
+    for expert in range(num_experts):
+        qweight_experts[expert], scales[expert] = quantize(
+            weight[expert], bits)
+    return qweight_experts, scales
+
+
+def dynamic_fp8_pertensor_quantize(tensor):
+    """Return the FP8 scale the shared ``QuantFp8`` instance derives for ``tensor``.
+
+    The scale comes from ``QuantFp8.get_quant_scale``; the tensor itself is
+    not quantized here.
+    """
+    return QuantFp8(tensor.device).get_quant_scale(tensor)
\ No newline at end of file
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -797,3 +797,10 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:

    # If there were no matches, return the untouched param name
    return name
+
+
+def fp8_input_scales_loader(path: str):
+    """Lazily yield ``(name, slice)`` for every entry in the file at ``path``.
+
+    The file is opened with ``safe_open`` and stays open while the generator
+    is being consumed; each slice comes from ``get_slice``.
+    """
+    with safe_open(path, framework="pt") as scales_file:
+        entry_names = scales_file.keys()
+        for entry_name in entry_names:
+            yield entry_name, scales_file.get_slice(entry_name)
--- a/vllm/model_executor/models/mm_step1o.py
+++ b/vllm/model_executor/models/mm_step1o.py
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -134,6 +134,11 @@ _TEXT_GENERATION_MODELS = {
    # [Encoder-decoder]
    "BartModel": ("bart", "BartForConditionalGeneration"),
    "BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"),
+    # step model
+    "Step1ForCausalLM": ("step1", "Step1ForCausalLM"),
+    "Step2ForCausalLM": ("step1", "Step1ForCausalLM"),
+    "Step1MoEForCausalLM": ("step1", "Step1ForCausalLM"),
+    "Step2MiniForCausalLM": ("step2_mini", "Step2MiniForCausalLM"),
 }

 _EMBEDDING_MODELS = {
@@ -174,6 +179,19 @@ _EMBEDDING_MODELS = {
    # input and output. I am adding it here because it piggy-backs on embedding
    # models for the time being.
    "PrithviGeoSpatialMAE": ("prithvi_geospatial_mae", "PrithviGeoSpatialMAE"),
+    # step model
+    "Step1ForSequenceClassification": ("step1",
+                                       "Step1ForSequenceClassification"),
+    "Step2ForClassification": ("step1", "Step1ForSequenceClassification"),
+    "Step2ForSequenceClassification": ("step2",
+                                       "Step2ForSequenceClassification"),
+    "Step2MiniForClassification": ("step2_mini",
+                                   "Step2MiniForSequenceClassification"),
+    "MMGPTQwen2RewardModel": ("mm_step1o", "MMGPTStep1oRewardModel"),
+    # Technically PrithviGeoSpatialMAE is a model that works on images, both in
+    # input and output. I am adding it here because it piggy-backs on embedding
+    # models for the time being.
+    "PrithviGeoSpatialMAE": ("prithvi_geospatial_mae", "PrithviGeoSpatialMAE"),
 }

 _CROSS_ENCODER_MODELS = {
@@ -251,6 +269,15 @@ _SPECULATIVE_DECODING_MODELS = {
    "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"),
    "MedusaModel": ("medusa", "Medusa"),
    "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
+    # step model
+    "MMGPTStep1ForCausalLMV2": ("mm_step1p5c_1u", "MMGPTStep1ForCausalLMV2"),
+    "MMGPTStep1ForCausalLMV3": ("mm_step1p5c_1u", "MMGPTStep1ForCausalLMV3"),
+    "MMGPTStep1ForCausalLMV4": ("mm_step1o", "MMGPTStep1oForCausalLM"),
+    "MMGPTQwen2ForCausalLM": ("mm_step1p5c_1u", "MMGPTStep1ForCausalLMV3"),
+    "MMGPTQwen2ForCausalLMV2": ("mm_step1o", "MMGPTStep1oForCausalLM"),
+    "MMGPTStep3vForCausalLM": ("mm_step1o", "MMGPTStep1oForCausalLM"),
+    "Step1AudioForCausalLM": ("mm_step_audio", "MMGPTStep1fForCausalLM"),
+    "StepAudioForCausalLMV2": ("mm_step_audio", "MMGPTStep1fForCausalLM"),
 }

 _TRANSFORMERS_MODELS = {

--- a/vllm/model_executor/models/step1.py
+++ b/vllm/model_executor/models/step1.py
--- a/vllm/model_executor/models/step2_mini.py
+++ b/vllm/model_executor/models/step2_mini.py
--- a/vllm/model_executor/models/step_encoder.py
+++ b/vllm/model_executor/models/step_encoder.py
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -41,6 +41,15 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config,
                                             OvisConfig, RWConfig,
                                             Step3TextConfig, Step3VLConfig, 
                                             SkyworkR1VChatConfig, SolarConfig,
+                                             MMGPTStep1Config,
+                                             MMGPTStep1ConfigV2, MPTConfig,
+                                             NemotronConfig, NVLM_D_Config,
+                                             RWConfig, SkyworkR1VChatConfig,
+                                             SolarConfig, Step1AudioConfig,
+                                             Step1Config, Step1oConfig,
+                                             Step2Config, Step2MiniConfig,
+                                             Step3vConfig,
+                                             StepAudioQwen2Config,
                                             Telechat2Config, UltravoxConfig)
 # yapf: enable
 from vllm.transformers_utils.utils import check_gguf_file
@@ -75,6 +84,20 @@ _CONFIG_REGISTRY_OVERRIDE_HF: dict[str, type[PretrainedConfig]] = {
    "mllama": MllamaConfig
 }

+# Config classes for the Step model family, keyed by the registry name used
+# for lookup; merged into _CONFIG_REGISTRY below.
+_CUSTOM_CONFIG_STEP: dict[str, type[PretrainedConfig]] = {
+    "step1": Step1Config,
+    "step2": Step2Config,
+    "step2_mini": Step2MiniConfig,
+    "mmgpt_step1": MMGPTStep1Config,
+    "mmgpt_step1_v2": MMGPTStep1ConfigV2,
+    # NOTE(review): the two Qwen2 entries are disabled; the reason is not
+    # visible here - confirm before re-enabling.
+    #"mmgpt_qwen2": MMGPTQwen2Config,
+    #"mmgpt_qwen2_v2": MMGPTQwen2ConfigV2,
+    "step1o": Step1oConfig,
+    "step1_audio": Step1AudioConfig,
+    "step_audio_qwen2": StepAudioQwen2Config,
+    "step3v": Step3vConfig,
+}
+
 _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = {
    "chatglm": ChatGLMConfig,
    "cohere2": Cohere2Config,
@@ -100,7 +123,8 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = {
    "ultravox": UltravoxConfig,
    "step3_vl": Step3VLConfig,
    "step3_text": Step3TextConfig,
-    **_CONFIG_REGISTRY_OVERRIDE_HF
+    **_CONFIG_REGISTRY_OVERRIDE_HF,
+    **_CUSTOM_CONFIG_STEP
 }

 _CONFIG_ATTRS_MAPPING: dict[str, str] = {

--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -32,6 +32,18 @@ from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig,
                                                      Step3VisionEncoderConfig,
                                                      Step3VLConfig)
 from vllm.transformers_utils.configs.ultravox import UltravoxConfig
+from vllm.transformers_utils.configs.mmgpt import (CLIPVisionConfig,
+                                                   MMGPTQwen2Config,
+                                                   MMGPTQwen2ConfigV2,
+                                                   MMGPTStep1Config,
+                                                   MMGPTStep1ConfigV2,
+                                                   SamViTConfig, Step1oConfig,
+                                                   Step3vConfig)
+from vllm.transformers_utils.configs.step import (Step1Config, Step2Config,
+                                                  Step2MiniConfig)
+from vllm.transformers_utils.configs.step1f import (Step1AudioConfig,
+                                                    Step1fAudioEncoderConfig,
+                                                    StepAudioQwen2Config)

 __all__ = [
    "ChatGLMConfig",
@@ -62,4 +74,19 @@ __all__ = [
    "Step3VLConfig",
    "Step3VisionEncoderConfig",
    "Step3TextConfig",
+    # Step-family configs. Only names imported at the top of this module may
+    # be listed; an unimported name breaks `from ... import *`.
+    "Step1Config",
+    "Step2Config",
+    "Step2MiniConfig",
+    "CLIPVisionConfig",
+    "MMGPTQwen2Config",
+    "MMGPTQwen2ConfigV2",
+    "MMGPTStep1Config",
+    "MMGPTStep1ConfigV2",
+    "Step3vConfig",
+    "SamViTConfig",
+    "Step1oConfig",
+    "Step1AudioConfig",
+    "Step1fAudioEncoderConfig",
+    "StepAudioQwen2Config",
 ]
--- a/vllm/transformers_utils/configs/mmgpt.py
+++ b/vllm/transformers_utils/configs/mmgpt.py
--- a/vllm/transformers_utils/configs/step.py
+++ b/vllm/transformers_utils/configs/step.py
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any, Dict, List, Optional, Union
+
+from transformers import PretrainedConfig
+
+
+class StepConfig(PretrainedConfig):
+    """Base configuration shared by the Step model configs in this module.
+
+    Each named hyperparameter is stored as an attribute of the same name.
+    ``share_expert_dim`` falls back to ``moe_intermediate_size * moe_top_k``
+    when omitted. ``bos_token_id`` (default ``1``), ``eos_token_id``
+    (default ``[2, 3]``) and any extra keyword arguments are forwarded to
+    ``PretrainedConfig.__init__``.
+
+    Raises:
+        ValueError: If ``alibi_slopes`` is given and its length differs from
+            ``num_attention_heads``.
+    """
+
+    model_type = "step"
+
+    def __init__(
+        self,
+        hidden_size: int = 5120,
+        intermediate_size: int = 13312,
+        num_attention_heads: int = 40,
+        num_attention_groups: int = 8,
+        num_hidden_layers: int = 48,
+        max_seq_len: int = 4096,
+        vocab_size: int = 65536,
+        rms_norm_eps: float = 1e-5,
+        # 2 means 50% of the layers use MoE, interleaved with non-MoE layers.
+        moe_every_n_layer: int = 2,
+        use_moe: bool = False,
+        moe_intermediate_size: int = 10240,
+        moe_num_experts: int = 16,
+        moe_top_k: int = 4,
+        max_pos_interp_ratio: float = 1,
+        alibi_slopes: Optional[List[float]] = None,
+        moe_layer_offset: int = 0,
+        moe_dynamic_exp_p: float = 1.0,
+        rope_theta: float = 500000,
+        rope_scaling: Optional[Dict[str, Any]] = None,
+        head_dim: Optional[int] = None,
+        max_position_embedding: int = 16384,
+        share_expert_dim: Optional[int] = None,
+        allgather_dtype: Optional[str] = None,
+        share_q_dim: Optional[int] = None,
+        norm_expert_weight: bool = True,
+        bos_token_id: Optional[Union[List[int], int]] = None,
+        eos_token_id: Optional[Union[List[int], int]] = None,
+        **kwargs,
+    ) -> None:
+        # Core transformer dimensions.
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_attention_groups = num_attention_groups
+        self.num_hidden_layers = num_hidden_layers
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.rms_norm_eps = rms_norm_eps
+
+        # Mixture-of-experts and positional settings.
+        self.use_moe = use_moe
+        self.moe_intermediate_size = moe_intermediate_size
+        self.moe_every_n_layer = moe_every_n_layer
+        self.moe_num_experts = moe_num_experts
+        self.moe_top_k = moe_top_k
+        self.max_pos_interp_ratio = max_pos_interp_ratio
+        self.alibi_slopes = alibi_slopes
+        self.moe_layer_offset = moe_layer_offset
+        self.moe_dynamic_exp_p = moe_dynamic_exp_p
+
+        # Fields added for step2 mini.
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.head_dim = head_dim
+        self.max_position_embedding = max_position_embedding
+        # Without an explicit value the shared expert is sized as the sum of
+        # the top-k routed experts.
+        self.share_expert_dim = (moe_intermediate_size * moe_top_k
+                                 if share_expert_dim is None else
+                                 share_expert_dim)
+        self.share_q_dim = share_q_dim
+        self.norm_expert_weight = norm_expert_weight
+
+        self.allgather_dtype = allgather_dtype
+        self._verify_slopes()
+
+        # Resolve the special-token defaults before delegating to the base.
+        if bos_token_id is None:
+            bos_token_id = 1
+        if eos_token_id is None:
+            eos_token_id = [2, 3]
+        super().__init__(bos_token_id=bos_token_id,
+                         eos_token_id=eos_token_id,
+                         **kwargs)
+
+    def _verify_slopes(self) -> None:
+        """Check that ``alibi_slopes``, when set, has one entry per head."""
+        slopes = self.alibi_slopes
+        if slopes is not None and len(slopes) != self.num_attention_heads:
+            raise ValueError(
+                f"Number of alibi_slopes ({len(slopes)}) does not match "
+                f"num_attention_heads ({self.num_attention_heads})")
+
+
+class Step1Config(StepConfig):
+    """``StepConfig`` with ``model_type`` set to ``"step1"``; adds no fields."""
+    model_type = "step1"
+
+
+class Step2Config(StepConfig):
+    """``StepConfig`` with ``model_type`` set to ``"step2"``.
+
+    Adds one field, ``use_offline_input_scales`` (default ``True``); every
+    other keyword argument is passed through to ``StepConfig.__init__``.
+    """
+    model_type = "step2"
+
+    def __init__(self, use_offline_input_scales: bool = True, **kwargs):
+        # Stored before delegating, matching how StepConfig sets its own
+        # attributes ahead of the PretrainedConfig constructor.
+        self.use_offline_input_scales = use_offline_input_scales
+        super().__init__(**kwargs)
+
+
+class Step2MiniConfig(StepConfig):
+    """``StepConfig`` with ``model_type`` set to ``"step2_mini"``; adds no fields."""
+    model_type = "step2_mini"
\ No newline at end of file
--- a/vllm/transformers_utils/configs/step1f.py
+++ b/vllm/transformers_utils/configs/step1f.py
--- a/vllm/transformers_utils/detokenizer_utils.py
+++ b/vllm/transformers_utils/detokenizer_utils.py
@@ -4,6 +4,8 @@
 from typing import Optional

 from .tokenizer import AnyTokenizer
+# from vllm.transformers_utils.tokenizers.sentencepiece_tokenizer import (
+#     SentencePieceTokenizer)


 def _replace_none_with_empty(tokens: list[Optional[str]]):
@@ -171,6 +173,13 @@ def detokenize_incrementally(
    # The prefix text is necessary only to defeat cleanup algorithms in
    # the decode which decide to add a space or not depending on the
    # surrounding ids.
+    
+    # FIXME(ys): for step1 sentencepiece tokenizer, we need to handle the special tokens in convert_tokens_to_string
+    # if isinstance(tokenizer, SentencePieceTokenizer):
+    #     prefix_text = tokenizer.convert_tokens_to_string(
+    #         output_tokens[prefix_offset:read_offset], skip_special_tokens=skip_special_tokens)
+    #     new_text = tokenizer.convert_tokens_to_string(
+    #         output_tokens[prefix_offset:], skip_special_tokens=skip_special_tokens)
    if tokenizer.is_fast or not tokenizer.get_added_vocab():
        prefix_text = tokenizer.convert_tokens_to_string(
            output_tokens[prefix_offset:read_offset])