Commit 583034f1 authored by zhuwenwen
Browse files

[models] support step3v

parent 0adf9cda
......@@ -3418,6 +3418,8 @@ def _get_and_verify_max_len(
possible_keys = [
# OPT
"max_position_embeddings",
# step3
"max_position_embedding",
# GPT-2
"n_positions",
# MPT
......@@ -3491,7 +3493,13 @@ def _get_and_verify_max_len(
# loading HF config
rope_type = rope_scaling["rope_type"]
if rope_type not in ("su", "longrope", "llama3"):
if rope_type == "ntk_bypart":
derived_max_model_len = min(
derived_max_model_len,
rope_scaling["real_length"] * rope_scaling["scaling_factor"]
) if "real_length" in rope_scaling and "scaling_factor" in rope_scaling else derived_max_model_len
elif rope_type not in ("su", "longrope", "llama3"):
if disable_sliding_window:
# TODO(robertgshaw): Find a model that supports rope_scaling
# with sliding window to see if this case should be allowed.
......@@ -3548,6 +3556,8 @@ def _get_and_verify_max_len(
logger.warning(
"%s Make sure the value is correct and within the "
"model context size.", msg)
if getattr(hf_config, "max_position_embedding", None) is not None: # step3/3v
hf_config.max_position_embedding = max_model_len
else:
raise ValueError(
f"{msg} To allow overriding this maximum, set "
......
......@@ -36,4 +36,7 @@ __all__ = [
"xLAMToolParser",
"MinimaxToolParser",
"Glm4MoeModelToolParser",
"Step1p5vMini2ToolParser",
"Step1p5vMini2MsToolParser",
"Step3ToolParser",
]
......@@ -3,6 +3,7 @@
"""Custom activation functions."""
import math
from typing import Optional
import optimus
import torch
import torch.nn as nn
......@@ -53,6 +54,14 @@ class FatreluAndMul(CustomOp):
return out
class OptimusSiluAndMul(nn.Module):
    """Module wrapper around the ``Optimus.SiluDot_forward`` custom op.

    NOTE(review): the class name suggests a fused SiLU-and-multiply
    activation; the actual math lives in the ``optimus`` extension --
    confirm there.
    """

    def forward(self,
                x: torch.Tensor,
                output: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Run the custom op on ``x``.

        Args:
            x: Input tensor handed to the op unchanged.
            output: Optional tensor forwarded to the op as its ``out``
                argument.

        Returns:
            Whatever ``torch.ops.Optimus.SiluDot_forward`` returns.
        """
        silu_dot = torch.ops.Optimus.SiluDot_forward
        return silu_dot(x, out=output)
@CustomOp.register("silu_and_mul")
class SiluAndMul(CustomOp):
"""An activation function for SwiGLU.
......
......@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Custom normalization layers."""
from typing import Optional, Union, Tuple
import optimus # noqa F401
import torch
import torch.nn as nn
......@@ -298,6 +299,49 @@ class RMSNorm(CustomOp):
return s
class OptimusRMSNorm(nn.Module):
def __init__(
self,
hidden_size: int,
eps: float = 1e-6,
) -> None:
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.variance_epsilon = eps
def forward(self,
x: torch.Tensor,
residual: Optional[torch.Tensor] = None,
output: Optional[torch.Tensor] = None,
fp16_out: bool = False) -> torch.Tensor:
if residual is not None:
assert output is None
from vllm import _custom_ops as ops
assert not fp16_out
ops.fused_add_rms_norm(
x,
residual,
self.weight.data,
self.variance_epsilon,
)
return x, residual
else:
if fp16_out:
if output is None:
output = torch.empty_like(x).half()
else:
output = output.half()
# return torch.ops.Optimus.rms_norm(x,
# self.weight,
# self.variance_epsilon,
# out=output)
return torch.nn.functional.rms_norm(x,
self.weight,
self.variance_epsilon,
out=output)
@CustomOp.register("gemma_rms_norm")
class GemmaRMSNorm(CustomOp):
"""RMS normalization for Gemma.
......@@ -363,3 +407,35 @@ class GemmaRMSNorm(CustomOp):
self.forward_static)
self._is_compiled = True
return self.forward_native(x, residual)
class OptimusLayerNorm(nn.Module):
    """Layer normalization with learnable ``weight`` and ``bias`` vectors.

    Args:
        hidden_size: Length of ``weight``/``bias``, i.e. the size of the
            trailing dimension that is normalized.
        eps: Epsilon passed to ``torch.nn.functional.layer_norm``.
    """

    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self,
                x: torch.Tensor,
                residual: Optional[torch.Tensor] = None,
                output: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Apply layer normalization to ``x``.

        Args:
            x: Input tensor.
            residual: Must be ``None``; a fused residual add is not
                supported by this layer.
            output: Optional destination tensor. When given, the result is
                copied into it and it is returned.

        Returns:
            The normalized tensor (``output`` itself when supplied).
        """
        assert residual is None
        # ``normalized_shape`` must be the shape of ``weight``.
        # F.layer_norm has no ``out=`` keyword, so ``output`` is filled with
        # an explicit copy instead of being ignored.
        normed = torch.nn.functional.layer_norm(
            x,
            self.weight.shape,
            self.weight,
            self.bias,
            eps=self.variance_epsilon,
        )
        if output is None:
            return normed
        output.copy_(normed)
        return output
......@@ -3,7 +3,7 @@
import itertools
from abc import abstractmethod
from typing import Any, Literal, Optional, Union
from typing import Any, Literal, Optional, Union, List
import vllm.envs as envs
import torch
import torch.nn as nn
......@@ -269,6 +269,40 @@ class UnquantizedLinearMethod(LinearMethodBase):
return dispatch_unquantized_gemm()(x, layer.weight, bias)
class UnquantizedMoELinearMethod(LinearMethodBase):
    """MoE Linear method without quantization.

    Only allocates the stacked expert weight; ``apply`` is intentionally
    not implemented here.
    """

    def __init__(self):
        self.quant_config = None

    def create_weights(self,
                       layer: torch.nn.Module,
                       input_size_per_partition: int,
                       output_partition_sizes: List[int],
                       input_size: int,
                       output_size: int,
                       params_dtype: torch.dtype,
                       num_experts: Optional[int] = None,
                       **extra_weight_attrs):
        """Register a ``weight`` parameter holding all experts on ``layer``.

        The parameter has shape ``(num_experts, sum(output_partition_sizes),
        input_size_per_partition)`` and is allocated on the current CUDA
        device with ``requires_grad=False``.

        Args:
            layer: Module that receives the ``weight`` parameter.
            input_size_per_partition: Size of the last weight dimension.
            output_partition_sizes: Sizes summed to form the middle weight
                dimension.
            input_size: Unused here; kept for interface compatibility.
            output_size: Unused here; kept for interface compatibility.
            params_dtype: dtype of the allocated weight.
            num_experts: Number of experts (leading weight dimension).
            **extra_weight_attrs: Extra attributes attached to the weight.

        Raises:
            ValueError: If ``num_experts`` is not provided.
        """
        if num_experts is None:
            # Fail early with a clear message instead of the opaque
            # TypeError torch.empty would raise for a ``None`` dimension.
            raise ValueError(
                "UnquantizedMoELinearMethod.create_weights requires "
                "`num_experts`.")
        weight = Parameter(torch.empty(num_experts,
                                       sum(output_partition_sizes),
                                       input_size_per_partition,
                                       device=torch.cuda.current_device(),
                                       dtype=params_dtype),
                           requires_grad=False)
        set_weight_attrs(weight, {"input_dim": 2, "output_dim": 1})
        layer.register_parameter("weight", weight)
        set_weight_attrs(weight, extra_weight_attrs)

    def apply(self,
              layer: torch.nn.Module,
              x: torch.Tensor,
              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Apply the weights to the input tensor.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError
class LinearBase(torch.nn.Module):
"""Base linear layer.
......@@ -783,6 +817,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
param.shard_id.append(loaded_shard_id)
param.shard_id_map[loaded_shard_id] = len(param.data_container)
param.data_container.append(loaded_weight)
if len(param.data_container) == 2:
self.qweight = param.materialize_nested()
return
param_data = param.data
......@@ -986,6 +1022,175 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
shard_offset=shard_offset,
shard_size=shard_size)
class MergedColumnParallelMoELinear(MergedColumnParallelLinear):
    """Merged column-parallel linear layer holding weights for many experts.

    Args:
        num_experts: Number of experts, forwarded to ``create_weights``.
        input_size: Input dimension of the layer.
        output_sizes: Output dimensions of the merged sub-matrices; each
            must be divisible by the tensor-parallel world size.
        params_dtype: Parameter dtype; defaults to torch's default dtype.
        quant_config: Quantization config, or ``None`` for unquantized.
        prefix: Name prefix handed to ``quant_config.get_quant_method``.
    """

    def __init__(self,
                 num_experts: int,
                 input_size: int,
                 output_sizes: List[int],
                 params_dtype: Optional[torch.dtype] = None,
                 quant_config: Optional[QuantizationConfig] = None,
                 prefix: str = ""):
        # Only nn.Module is initialised; the parent linear-layer
        # constructors are not run.
        torch.nn.Module.__init__(self)

        self.num_experts = num_experts
        self.output_sizes = output_sizes
        self.input_size = input_size
        self.output_size = sum(output_sizes)

        tp_size = get_tensor_model_parallel_world_size()
        assert all(output_size % tp_size == 0 for output_size in output_sizes)
        self.output_size_per_partition = divide(self.output_size, tp_size)
        self.output_partition_sizes = [
            divide(output_size, tp_size) for output_size in self.output_sizes
        ]
        self.gather_output = False

        if params_dtype is None:
            params_dtype = torch.get_default_dtype()
        self.params_dtype = params_dtype

        if quant_config is None:
            self.quant_method = UnquantizedMoELinearMethod()
        else:
            self.quant_method = quant_config.get_quant_method(self,
                                                              prefix=prefix)
        # FIXME(ys): hack for moe
        # A plain unquantized method cannot allocate per-expert weights, so
        # it is swapped for the MoE variant.
        if isinstance(self.quant_method, UnquantizedLinearMethod):
            self.quant_method = UnquantizedMoELinearMethod()
        assert self.quant_method is not None

        self.quant_method.create_weights(self,
                                         self.input_size,
                                         self.output_partition_sizes,
                                         self.input_size,
                                         self.output_size,
                                         self.params_dtype,
                                         self.num_experts,
                                         weight_loader=self.weight_loader)
        self.register_parameter("bias", None)

    def forward(self,
                input_,
                output: Optional[torch.Tensor] = None,
                expert_idx: int = -1):
        """Apply the quantized expert projection.

        Args:
            input_: Input tensor.
            output: Optional tensor forwarded to the quant method.
            expert_idx: Expert index forwarded to the quant method.

        Returns:
            The quant method's result, or ``None`` when the layer is
            unquantized (the MoE FFN is then computed outside this module).
        """
        if isinstance(self.quant_method, UnquantizedMoELinearMethod):
            # use optimus moe_ffn outside
            return None

        bias = None
        assert self.quant_method is not None
        output = self.quant_method.apply(self,
                                         input_,
                                         bias,
                                         expert_idx=expert_idx,
                                         output=output)
        return output
class QKVReplicatedLinear(ReplicatedLinear):
    """Fused Q/K/V projection sized from the total head counts.

    The output dimension is ``(num_heads + 2 * num_kv_heads) * head_size``
    with no division by the tensor-parallel size.

    Args:
        hidden_size: Input dimension.
        head_size: Dimension of one attention head.
        total_num_heads: Number of query heads.
        total_num_kv_heads: Number of key/value heads; falls back to
            ``total_num_heads`` when falsy.
        bias: Whether to allocate a bias vector.
        skip_bias_add: Stored on the module for the parent's forward pass.
        params_dtype: Parameter dtype; defaults to torch's default dtype.
        quant_config: Quantization config, or ``None`` for unquantized.
        prefix: Name prefix handed to ``quant_config.get_quant_method``.
        return_bias: Stored on the module for the parent's forward pass.
    """

    def __init__(self,
                 hidden_size: int,
                 head_size: int,
                 total_num_heads: int,
                 total_num_kv_heads: Optional[int] = None,
                 bias: bool = True,
                 skip_bias_add: bool = False,
                 params_dtype: Optional[torch.dtype] = None,
                 quant_config: Optional[QuantizationConfig] = None,
                 prefix: str = "",
                 return_bias: bool = True):
        # Only nn.Module is initialised; ReplicatedLinear.__init__ is not
        # run, so every attribute it would set is assigned here.
        nn.Module.__init__(self)

        self.hidden_size = hidden_size
        self.head_size = head_size
        self.num_heads = total_num_heads
        self.num_kv_heads = (total_num_kv_heads
                             if total_num_kv_heads else total_num_heads)
        self.input_size = self.hidden_size
        self.output_size = (self.num_heads +
                            2 * self.num_kv_heads) * self.head_size
        self.skip_bias_add = skip_bias_add
        self.return_bias = return_bias

        if params_dtype is None:
            params_dtype = torch.get_default_dtype()
        self.params_dtype = params_dtype

        if quant_config is None:
            self.quant_method: Optional[
                QuantizeMethodBase] = UnquantizedLinearMethod()
        else:
            self.quant_method = quant_config.get_quant_method(self,
                                                              prefix=prefix)
        assert self.quant_method is not None
        self.quant_method.create_weights(self,
                                         self.input_size, [self.output_size],
                                         self.input_size,
                                         self.output_size,
                                         self.params_dtype,
                                         weight_loader=self.weight_loader)

        if bias:
            self.bias = Parameter(
                torch.empty(self.output_size, dtype=self.params_dtype))
            set_weight_attrs(self.bias, {
                "output_dim": 0,
                "weight_loader": self.weight_loader
            })
        else:
            self.register_parameter("bias", None)

    def weight_loader(self,
                      param: Parameter,
                      loaded_weight: torch.Tensor,
                      loaded_shard_id: Optional[str] = None):
        """Copy a checkpoint tensor into the fused QKV parameter.

        Args:
            param: Destination parameter (weight or bias).
            loaded_weight: Tensor read from the checkpoint.
            loaded_shard_id: ``"q"``, ``"k"`` or ``"v"`` to load one slice
                of the fused parameter, or ``None`` when ``loaded_weight``
                is already the fully packed tensor.
        """
        param_data = param.data
        output_dim = getattr(param, "output_dim", None)
        is_quantization = not isinstance(self.quant_method,
                                         UnquantizedLinearMethod)

        if loaded_shard_id is None:
            # Loaded weight is already packed.
            assert param_data.shape == loaded_weight.shape
            param_data.copy_(loaded_weight)
            return

        assert loaded_shard_id in ["q", "k", "v"]
        if output_dim is not None:
            q_size = self.num_heads * self.head_size
            kv_size = self.num_kv_heads * self.head_size
            if loaded_shard_id == "q":
                shard_offset = 0
                shard_size = q_size
            elif loaded_shard_id == "k":
                shard_offset = q_size
                shard_size = kv_size
            elif loaded_shard_id == "v":
                shard_offset = q_size + kv_size
                shard_size = kv_size

            # If quantized, we need to adjust the offset and size to account
            # for the packing.
            packed_dim = getattr(param, "packed_dim", None)
            if packed_dim == output_dim:
                shard_size = shard_size // param.pack_factor
                shard_offset = shard_offset // param.pack_factor

            # With VLLM_USE_NN (unquantized) the checkpoint tensor is
            # transposed below, so the shard lives on the other axis of the
            # parameter. A 1-D parameter such as the bias has no other axis:
            # it must be narrowed along ``output_dim`` or narrow() raises.
            swap_axes = (envs.VLLM_USE_NN and not is_quantization
                         and param_data.dim() > 1)
            narrow_dim = int(not output_dim) if swap_axes else output_dim
            param_data = param_data.narrow(narrow_dim, shard_offset,
                                           shard_size)
        else:
            ignore_warning = getattr(param, "ignore_warning", False)
            if not ignore_warning:
                logger.warning(
                    "Loading a weight without `output_dim` attribute in "
                    "QKVReplicatedLinear, assume the weight is the same "
                    "for all partitions.")

        if envs.VLLM_USE_NN and not is_quantization:
            # ``Tensor.t()`` is a no-op for 1-D tensors, so biases pass
            # through unchanged.
            loaded_weight = loaded_weight.t()
        assert param_data.shape == loaded_weight.shape
        param_data.copy_(loaded_weight)
class QKVParallelLinear(ColumnParallelLinear):
"""Linear layers for the attention's QKV transformation.
......@@ -1185,6 +1390,8 @@ class QKVParallelLinear(ColumnParallelLinear):
param.shard_id.append(loaded_shard_id)
param.shard_id_map[loaded_shard_id] = len(param.data_container)
param.data_container.append(loaded_weight)
if len(param.data_container) == 3:
self.qweight = param.materialize_nested()
return
param_data = param.data
......@@ -1495,7 +1702,7 @@ class RowParallelLinear(LinearBase):
def forward(
self, input_,
use_fused_silu_mul_quant: Optional[bool] = False
use_fused_silu_mul_quant: Optional[bool] = False,
) -> Union[torch.Tensor, tuple[torch.Tensor, Optional[Parameter]]]:
if self.input_is_parallel:
input_parallel = input_
......@@ -1758,3 +1965,62 @@ class QKVCrossParallelLinear(LinearBase):
s += f", tp_size={get_tensor_model_parallel_world_size()}"
s += ", gather_output=False"
return s
class RowParallelMoELinear(RowParallelLinear):
    """Row-parallel linear layer holding weights for many experts.

    The input dimension is split across the tensor-parallel group and
    ``reduce_results`` is fixed to ``False``.

    Args:
        num_experts: Number of experts, forwarded to ``create_weights``.
        input_size: Full (unsharded) input dimension.
        output_size: Output dimension.
        params_dtype: Parameter dtype; defaults to torch's default dtype.
        quant_config: Quantization config, or ``None`` for unquantized.
        prefix: Name prefix handed to ``quant_config.get_quant_method``.
    """

    def __init__(self,
                 num_experts: int,
                 input_size: int,
                 output_size: int,
                 params_dtype: Optional[torch.dtype] = None,
                 quant_config: Optional[QuantizationConfig] = None,
                 prefix: str = ""):
        # Only nn.Module is initialised; the parent constructors are not run.
        torch.nn.Module.__init__(self)

        self.num_experts = num_experts
        self.input_size = input_size
        self.output_size = output_size
        self.reduce_results = False
        self.params_dtype = (torch.get_default_dtype()
                             if params_dtype is None else params_dtype)

        if quant_config is not None:
            self.quant_method = quant_config.get_quant_method(self,
                                                              prefix=prefix)
        else:
            self.quant_method: Optional[
                QuantizeMethodBase] = UnquantizedMoELinearMethod()
        # FIXME(ys): hack for moe
        if isinstance(self.quant_method, UnquantizedLinearMethod):
            self.quant_method = UnquantizedMoELinearMethod()

        self.tp_size = get_tensor_model_parallel_world_size()
        self.input_size_per_partition = divide(input_size, self.tp_size)

        assert self.quant_method is not None
        self.quant_method.create_weights(
            self,
            self.input_size_per_partition,
            [self.output_size],
            self.input_size,
            self.output_size,
            self.params_dtype,
            self.num_experts,
            weight_loader=self.weight_loader,
        )
        self.register_parameter("bias", None)

    def forward(  # type: ignore[override]
            self,
            input_,
            residual=None,
            expert_idx: int = -1,
            output: Optional[torch.Tensor] = None):
        """Apply the quantized expert projection.

        Args:
            input_: Input tensor.
            residual: Accepted but not used by this method.
            expert_idx: Expert index forwarded to the quant method.
            output: Optional tensor forwarded to the quant method.

        Returns:
            The quant method's result, or ``None`` when the layer is
            unquantized (the MoE FFN is then computed outside this module).
        """
        unquantized = isinstance(self.quant_method,
                                 UnquantizedMoELinearMethod)
        if unquantized:
            # use optimus moe_ffn outside
            return

        assert self.quant_method is not None
        return self.quant_method.apply(self,
                                       input_,
                                       None,
                                       expert_idx=expert_idx,
                                       output=output)
\ No newline at end of file
......@@ -36,7 +36,8 @@ class LogitsProcessor(nn.Module):
org_vocab_size: Optional[int] = None,
scale: float = 1.0,
logits_as_input: bool = False,
soft_cap: Optional[float] = None) -> None:
soft_cap: Optional[float] = None,
need_fp32_logits: bool = False) -> None:
"""
Args:
scale: A scaling factor to apply to the logits.
......@@ -52,6 +53,7 @@ class LogitsProcessor(nn.Module):
self.soft_cap = soft_cap
# Whether to use gather or all-gather to gather the logits.
self.use_all_gather = current_platform.use_all_gather()
self.need_fp32_logits = need_fp32_logits
def forward(
self,
......@@ -106,6 +108,10 @@ class LogitsProcessor(nn.Module):
embedding_bias: Optional[torch.Tensor],
) -> Optional[torch.Tensor]:
# Get the logits for the next tokens.
if self.need_fp32_logits:
logits = torch.ops.OptimusMoe.matmul_fp32(hidden_states,
lm_head.weight.t())
else:
logits = lm_head.quant_method.apply(lm_head,
hidden_states,
bias=embedding_bias)
......
This diff is collapsed.
import torch
@torch.jit.script
def cal_scale(amax, fp_max, scale):
    """Derive a power-of-two quantization scale from an absolute maximum.

    The scale is ``2 ** floor(log2(fp_max / amax))``. Where ``amax`` is not
    strictly positive or not finite, the incoming ``scale`` is used instead.

    Returns:
        A ``(scale, scale_inv)`` tuple, ``scale_inv`` being ``1 / scale``.
    """
    margin = 0
    exponent = torch.floor(torch.log2(fp_max / amax)) - margin
    magnitude = torch.round(torch.pow(2, torch.abs(exponent)))
    # Keep the fallback wherever amax is zero, negative, NaN or infinite.
    usable = torch.logical_and(amax > 0.0, torch.isfinite(amax))
    magnitude = torch.where(usable, magnitude, scale)
    # A negative exponent means the data exceeds fp_max: scale down.
    new_scale = torch.where(exponent < 0, 1 / magnitude, magnitude)
    return new_scale, torch.reciprocal(new_scale)
# Cache of objects created through ``singleton``-decorated classes.
instances = {}


def singleton(cls):
    """Class decorator that caches one instance per constructor arguments.

    Repeated calls with equal arguments return the same object. Calls with
    different arguments (for example a different device) get their own
    instance instead of silently receiving the first one ever built.
    When the arguments are not hashable, the cache falls back to a single
    instance per class.

    Args:
        cls: The class to wrap.

    Returns:
        A factory function that replaces ``cls``.
    """

    def get_instance(*args, **kwargs):
        key = (cls, args, tuple(sorted(kwargs.items())))
        try:
            hash(key)
        except TypeError:
            # Unhashable arguments: keep the one-per-class behaviour.
            key = cls
        if key not in instances:
            instances[key] = cls(*args, **kwargs)
        return instances[key]

    return get_instance


def reset_singleton():
    """Drop every cached instance so the next call constructs a new one."""
    instances.clear()
@singleton
class QuantFp8:
    """Per-tensor FP8 (``float8_e4m3fn``) quantization helper.

    Holds the constants used to compute scales: ``fp_max`` (448.0) and a
    default ``scale`` of 1.0, both on ``device``.
    """

    def __init__(self, device):
        self.device = device
        self.fp_max = torch.tensor([448.0], device=device)
        self.scale = torch.tensor([1.0], device=device)

    @staticmethod
    def quantize_v1(weight, bits):
        """Quantize ``weight`` with plain torch ops (no custom kernels).

        Returns:
            ``(qweight, scale_inv)``: the ``float8_e4m3fn`` tensor and the
            reciprocal of the scale that was applied.

        Raises:
            ValueError: If ``bits`` is not 8.
        """
        if bits != 8:
            raise ValueError(f"Unsupported bit width: {bits}")

        target = weight.device
        amax = weight.abs().max()
        fp_max = torch.tensor([448.0]).to(target)
        margin = 0
        fallback = torch.tensor([1.0]).to(target)

        exp = torch.floor(torch.log2(fp_max / amax)) - margin
        sf = torch.round(torch.pow(2, torch.abs(exp)))
        # Fall back to 1.0 when amax is non-positive or non-finite.
        sf = torch.where(amax > 0.0, sf, fallback)
        sf = torch.where(torch.isfinite(amax), sf, fallback)
        applied = torch.where(exp < 0, 1 / sf, sf)

        scaled = weight.to(torch.float32) * applied
        qweight = scaled.to(torch.float8_e4m3fn)
        return qweight, torch.reciprocal(applied)

    def quantize(self, weight, bits, weight_scale, use_offline_input_scales):
        """Quantize ``weight`` using the ``OptimusFp8`` custom ops.

        Args:
            weight: Tensor to quantize.
            bits: Target bit width; only 8 is supported.
            weight_scale: Precomputed scale, or ``None``.
            use_offline_input_scales: Use ``weight_scale`` when it is given;
                otherwise the scale is derived from the tensor's abs-max.

        Returns:
            ``(qweight, scale_inv)``.

        Raises:
            ValueError: If ``bits`` is not 8.
        """
        if bits != 8:
            raise ValueError(f"Unsupported bit width: {bits}")

        amax = torch.empty(1, dtype=torch.float32, device=self.device)
        torch.ops.OptimusFp8.abs_max_nan_to_inf(weight, amax)

        if weight_scale is not None and use_offline_input_scales:
            scale = weight_scale
            scale_inv = torch.reciprocal(weight_scale)
        else:
            default_scale = torch.tensor([1.0], device=self.device)
            scale, scale_inv = cal_scale(amax, self.fp_max, default_scale)

        qweight = torch.ops.OptimusFp8.quantize(weight, scale, None,
                                                torch.float8_e4m3fn)
        return qweight, scale_inv

    def get_quant_scale(self, tensor):
        """Return the dynamic per-tensor scale computed from ``tensor``."""
        amax = torch.empty(1, dtype=torch.float32, device=tensor.device)
        torch.ops.OptimusFp8.abs_max_nan_to_inf(tensor, amax)
        dynamic_scale, _ = cal_scale(amax, self.fp_max, self.scale)
        return dynamic_scale
def quantize(weight, bits, weight_scale=None, use_offline_input_scales=True):
    """Quantize ``weight`` with the ``QuantFp8`` helper for its device.

    Returns:
        Whatever ``QuantFp8.quantize`` returns: ``(qweight, scale_inv)``.
    """
    quantizer = QuantFp8(weight.device)
    return quantizer.quantize(weight, bits, weight_scale,
                              use_offline_input_scales)
def dequant(weight, weight_scales):
    """Dequantize ``weight`` to ``bfloat16`` via ``OptimusFp8.dequantize``."""
    dequantize_op = torch.ops.OptimusFp8.dequantize
    return dequantize_op(weight, weight_scales, torch.bfloat16)
def experts_dequant(weights, weight_scales):
    """Dequantize a stack of expert weights, one leading index at a time.

    Returns:
        A ``bfloat16`` tensor with the same shape and device as ``weights``.
    """
    num_experts = weights.shape[0]
    dequantized = torch.empty(*weights.shape,
                              device=weights.device,
                              dtype=torch.bfloat16)
    for expert in range(num_experts):
        dequantized[expert] = dequant(weights[expert], weight_scales[expert])
    return dequantized
def experts_quantize(weight, bits):
    """Quantize a stack of expert weights, one leading index at a time.

    Returns:
        ``(qweight_experts, scales)``: a ``float8_e4m3fn`` tensor shaped
        like ``weight`` and a ``float32`` vector with one entry per expert.

    Raises:
        ValueError: If ``bits`` is not 8.
    """
    if bits != 8:
        raise ValueError(f"Unsupported bit width: {bits}")

    num_experts = weight.shape[0]
    qweight_experts = torch.empty(*weight.shape,
                                  dtype=torch.float8_e4m3fn,
                                  device=weight.device)
    scales = torch.empty(num_experts,
                         dtype=torch.float32,
                         device=weight.device)
    for idx in range(num_experts):
        qweight_experts[idx], scales[idx] = quantize(weight[idx], bits)
    return qweight_experts, scales
def dynamic_fp8_pertensor_quantize(tensor):
    """Return the dynamic per-tensor FP8 scale for ``tensor``.

    Delegates to ``QuantFp8.get_quant_scale`` on the helper built for the
    tensor's device.
    """
    return QuantFp8(tensor.device).get_quant_scale(tensor)
\ No newline at end of file
......@@ -797,3 +797,10 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> Optional[str]:
# If there were no matches, return the untouched param name
return name
def fp8_input_scales_loader(path: str):
    """Yield ``(name, slice)`` for every entry of the file at ``path``.

    The file is opened with ``safe_open`` and stays open while the
    generator is being consumed; each value comes from ``get_slice``.
    """
    with safe_open(path, framework="pt") as handle:
        yield from ((key, handle.get_slice(key))
                    for key in handle.keys())  # noqa: SIM118
This diff is collapsed.
......@@ -134,6 +134,11 @@ _TEXT_GENERATION_MODELS = {
# [Encoder-decoder]
"BartModel": ("bart", "BartForConditionalGeneration"),
"BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"),
# step model
"Step1ForCausalLM": ("step1", "Step1ForCausalLM"),
"Step2ForCausalLM": ("step1", "Step1ForCausalLM"),
"Step1MoEForCausalLM": ("step1", "Step1ForCausalLM"),
"Step2MiniForCausalLM": ("step2_mini", "Step2MiniForCausalLM"),
}
_EMBEDDING_MODELS = {
......@@ -174,6 +179,19 @@ _EMBEDDING_MODELS = {
# input and output. I am adding it here because it piggy-backs on embedding
# models for the time being.
"PrithviGeoSpatialMAE": ("prithvi_geospatial_mae", "PrithviGeoSpatialMAE"),
# step model
"Step1ForSequenceClassification": ("step1",
"Step1ForSequenceClassification"),
"Step2ForClassification": ("step1", "Step1ForSequenceClassification"),
"Step2ForSequenceClassification": ("step2",
"Step2ForSequenceClassification"),
"Step2MiniForClassification": ("step2_mini",
"Step2MiniForSequenceClassification"),
"MMGPTQwen2RewardModel": ("mm_step1o", "MMGPTStep1oRewardModel"),
# Technically PrithviGeoSpatialMAE is a model that works on images, both in
# input and output. I am adding it here because it piggy-backs on embedding
# models for the time being.
"PrithviGeoSpatialMAE": ("prithvi_geospatial_mae", "PrithviGeoSpatialMAE"),
}
_CROSS_ENCODER_MODELS = {
......@@ -251,6 +269,15 @@ _SPECULATIVE_DECODING_MODELS = {
"Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"),
"MedusaModel": ("medusa", "Medusa"),
"MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
# step model
"MMGPTStep1ForCausalLMV2": ("mm_step1p5c_1u", "MMGPTStep1ForCausalLMV2"),
"MMGPTStep1ForCausalLMV3": ("mm_step1p5c_1u", "MMGPTStep1ForCausalLMV3"),
"MMGPTStep1ForCausalLMV4": ("mm_step1o", "MMGPTStep1oForCausalLM"),
"MMGPTQwen2ForCausalLM": ("mm_step1p5c_1u", "MMGPTStep1ForCausalLMV3"),
"MMGPTQwen2ForCausalLMV2": ("mm_step1o", "MMGPTStep1oForCausalLM"),
"MMGPTStep3vForCausalLM": ("mm_step1o", "MMGPTStep1oForCausalLM"),
"Step1AudioForCausalLM": ("mm_step_audio", "MMGPTStep1fForCausalLM"),
"StepAudioForCausalLMV2": ("mm_step_audio", "MMGPTStep1fForCausalLM"),
}
_TRANSFORMERS_MODELS = {
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -41,6 +41,15 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config,
OvisConfig, RWConfig,
Step3TextConfig, Step3VLConfig,
SkyworkR1VChatConfig, SolarConfig,
MMGPTStep1Config,
MMGPTStep1ConfigV2, MPTConfig,
NemotronConfig, NVLM_D_Config,
RWConfig, SkyworkR1VChatConfig,
SolarConfig, Step1AudioConfig,
Step1Config, Step1oConfig,
Step2Config, Step2MiniConfig,
Step3vConfig,
StepAudioQwen2Config,
Telechat2Config, UltravoxConfig)
# yapf: enable
from vllm.transformers_utils.utils import check_gguf_file
......@@ -75,6 +84,20 @@ _CONFIG_REGISTRY_OVERRIDE_HF: dict[str, type[PretrainedConfig]] = {
"mllama": MllamaConfig
}
# Config classes for the Step model family, keyed by registry name.
# This mapping is unpacked into ``_CONFIG_REGISTRY`` below.
# NOTE(review): the two commented-out entries reference
# MMGPTQwen2Config / MMGPTQwen2ConfigV2, which are not imported in this
# module -- confirm whether they should be registered.
_CUSTOM_CONFIG_STEP: dict[str, type[PretrainedConfig]] = {
    "step1": Step1Config,
    "step2": Step2Config,
    "step2_mini": Step2MiniConfig,
    "mmgpt_step1": MMGPTStep1Config,
    "mmgpt_step1_v2": MMGPTStep1ConfigV2,
    #"mmgpt_qwen2": MMGPTQwen2Config,
    #"mmgpt_qwen2_v2": MMGPTQwen2ConfigV2,
    "step1o": Step1oConfig,
    "step1_audio": Step1AudioConfig,
    "step_audio_qwen2": StepAudioQwen2Config,
    "step3v": Step3vConfig,
}
_CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = {
"chatglm": ChatGLMConfig,
"cohere2": Cohere2Config,
......@@ -100,7 +123,8 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = {
"ultravox": UltravoxConfig,
"step3_vl": Step3VLConfig,
"step3_text": Step3TextConfig,
**_CONFIG_REGISTRY_OVERRIDE_HF
**_CONFIG_REGISTRY_OVERRIDE_HF,
**_CUSTOM_CONFIG_STEP
}
_CONFIG_ATTRS_MAPPING: dict[str, str] = {
......
......@@ -32,6 +32,18 @@ from vllm.transformers_utils.configs.step3_vl import (Step3TextConfig,
Step3VisionEncoderConfig,
Step3VLConfig)
from vllm.transformers_utils.configs.ultravox import UltravoxConfig
from vllm.transformers_utils.configs.mmgpt import (CLIPVisionConfig,
MMGPTQwen2Config,
MMGPTQwen2ConfigV2,
MMGPTStep1Config,
MMGPTStep1ConfigV2,
SamViTConfig, Step1oConfig,
Step3vConfig)
from vllm.transformers_utils.configs.step import (Step1Config, Step2Config,
Step2MiniConfig)
from vllm.transformers_utils.configs.step1f import (Step1AudioConfig,
Step1fAudioEncoderConfig,
StepAudioQwen2Config)
__all__ = [
"ChatGLMConfig",
......@@ -62,4 +74,21 @@ __all__ = [
"Step3VLConfig",
"Step3VisionEncoderConfig",
"Step3TextConfig",
"Step1Config",
"Step2Config",
"Step2MiniConfig",
"CLIPVisionConfig",
"MMGPTBaiChuanConfig",
"MMGPTLlamaConfig",
"MMGPTLlamaConfigV2",
"MMGPTQwen2Config",
"MMGPTQwen2ConfigV2",
"MMGPTStep1Config",
"MMGPTStep1ConfigV2",
"Step3vConfig",
"SamViTConfig",
"Step1oConfig",
"Step1AudioConfig",
"Step1fAudioEncoderConfig",
"StepAudioQwen2Config",
]
This diff is collapsed.
# SPDX-License-Identifier: Apache-2.0
from typing import Any, Dict, List, Optional, Union
from transformers import PretrainedConfig
class StepConfig(PretrainedConfig):
    """Base configuration shared by the Step model configs in this module.

    Every constructor argument is stored on the instance under the same
    name. Notable defaults:

    * ``share_expert_dim`` falls back to
      ``moe_intermediate_size * moe_top_k`` when not given.
    * ``bos_token_id`` falls back to ``1`` and ``eos_token_id`` to
      ``[2, 3]``.
    * ``alibi_slopes``, when given, must contain exactly
      ``num_attention_heads`` entries.

    Raises:
        ValueError: If ``alibi_slopes`` has the wrong length.
    """

    model_type = "step"

    def __init__(
        self,
        hidden_size: int = 5120,
        intermediate_size: int = 13312,
        num_attention_heads: int = 40,
        num_attention_groups: int = 8,
        num_hidden_layers: int = 48,
        max_seq_len: int = 4096,
        vocab_size: int = 65536,
        rms_norm_eps: float = 1e-5,
        # 2 means 50% layers use MoE, interleaved with normal non-MoE layers.
        moe_every_n_layer: int = 2,
        use_moe: bool = False,
        moe_intermediate_size: int = 10240,
        moe_num_experts: int = 16,
        moe_top_k: int = 4,
        max_pos_interp_ratio: float = 1,
        alibi_slopes: Optional[List[float]] = None,
        moe_layer_offset: int = 0,
        moe_dynamic_exp_p: float = 1.0,
        rope_theta: float = 500000,
        rope_scaling: Optional[Dict[str, Any]] = None,
        head_dim: Optional[int] = None,
        max_position_embedding: int = 16384,
        share_expert_dim: Optional[int] = None,
        allgather_dtype: Optional[str] = None,
        share_q_dim: Optional[int] = None,
        norm_expert_weight: bool = True,
        bos_token_id: Optional[Union[List[int], int]] = None,
        eos_token_id: Optional[Union[List[int], int]] = None,
        **kwargs,
    ) -> None:
        # Core transformer dimensions.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads
        self.num_attention_groups = num_attention_groups
        self.num_hidden_layers = num_hidden_layers
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.rms_norm_eps = rms_norm_eps

        # Mixture-of-experts settings.
        self.use_moe = use_moe
        self.moe_intermediate_size = moe_intermediate_size
        self.moe_every_n_layer = moe_every_n_layer
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k

        self.max_pos_interp_ratio = max_pos_interp_ratio
        self.alibi_slopes = alibi_slopes
        self.moe_layer_offset = moe_layer_offset
        self.moe_dynamic_exp_p = moe_dynamic_exp_p

        #for step2 mini
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.head_dim = head_dim
        self.max_position_embedding = max_position_embedding
        self.share_expert_dim = (
            self.moe_intermediate_size * self.moe_top_k
            if share_expert_dim is None else share_expert_dim)
        self.share_q_dim = share_q_dim
        self.norm_expert_weight = norm_expert_weight
        self.allgather_dtype = allgather_dtype

        self._verify_slopes()

        if bos_token_id is None:
            bos_token_id = 1
        if eos_token_id is None:
            eos_token_id = [2, 3]
        super().__init__(bos_token_id=bos_token_id,
                         eos_token_id=eos_token_id,
                         **kwargs)

    def _verify_slopes(self):
        """Check that ``alibi_slopes`` has one entry per attention head."""
        slopes = self.alibi_slopes
        if slopes is not None and len(slopes) != self.num_attention_heads:
            raise ValueError(
                f"Number of alibi_slopes ({len(self.alibi_slopes)}) does not match num_attention_heads ({self.num_attention_heads})"
            )
class Step1Config(StepConfig):
    """``StepConfig`` registered under the ``step1`` model type."""

    model_type = "step1"
class Step2Config(StepConfig):
    """``StepConfig`` registered under the ``step2`` model type.

    Args:
        use_offline_input_scales: Stored on the config as-is.
        **kwargs: Forwarded unchanged to ``StepConfig``.
    """

    model_type = "step2"

    def __init__(self, use_offline_input_scales: bool = True, **kwargs):
        # Set before the base constructor runs, as in the other attributes
        # of StepConfig.
        self.use_offline_input_scales = use_offline_input_scales
        super().__init__(**kwargs)
class Step2MiniConfig(StepConfig):
    """``StepConfig`` registered under the ``step2_mini`` model type."""

    model_type = "step2_mini"
\ No newline at end of file
This diff is collapsed.
......@@ -4,6 +4,8 @@
from typing import Optional
from .tokenizer import AnyTokenizer
# from vllm.transformers_utils.tokenizers.sentencepiece_tokenizer import (
# SentencePieceTokenizer)
def _replace_none_with_empty(tokens: list[Optional[str]]):
......@@ -171,6 +173,13 @@ def detokenize_incrementally(
# The prefix text is necessary only to defeat cleanup algorithms in
# the decode which decide to add a space or not depending on the
# surrounding ids.
# FIXME(ys): for step1 sentencepiece tokenizer, we need to handle the special tokens in convert_tokens_to_string
# if isinstance(tokenizer, SentencePieceTokenizer):
# prefix_text = tokenizer.convert_tokens_to_string(
# output_tokens[prefix_offset:read_offset], skip_special_tokens=skip_special_tokens)
# new_text = tokenizer.convert_tokens_to_string(
# output_tokens[prefix_offset:], skip_special_tokens=skip_special_tokens)
if tokenizer.is_fast or not tokenizer.get_added_vocab():
prefix_text = tokenizer.convert_tokens_to_string(
output_tokens[prefix_offset:read_offset])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment