Unverified commit 43ecd0a9, authored by Isotr0py, committed by GitHub
Browse files

[Chore] Clean up deepseek v2/v3 config copy (#28055)


Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
parent 07d61451
...@@ -292,6 +292,7 @@ class DeepseekDecoderLayer(nn.Module): ...@@ -292,6 +292,7 @@ class DeepseekDecoderLayer(nn.Module):
rope_theta = getattr(config, "rope_theta", 10000) rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None) rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
moe_layer_freq = getattr(config, "moe_layer_freq", 1)
self.self_attn = DeepseekAttention( self.self_attn = DeepseekAttention(
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
...@@ -306,7 +307,7 @@ class DeepseekDecoderLayer(nn.Module): ...@@ -306,7 +307,7 @@ class DeepseekDecoderLayer(nn.Module):
if ( if (
config.n_routed_experts is not None config.n_routed_experts is not None
and layer_idx >= config.first_k_dense_replace and layer_idx >= config.first_k_dense_replace
and layer_idx % config.moe_layer_freq == 0 and layer_idx % moe_layer_freq == 0
): ):
self.mlp = DeepseekMoE( self.mlp = DeepseekMoE(
config=config, quant_config=quant_config, prefix=f"{prefix}.mlp" config=config, quant_config=quant_config, prefix=f"{prefix}.mlp"
......
...@@ -994,6 +994,7 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -994,6 +994,7 @@ class DeepseekV2DecoderLayer(nn.Module):
rope_theta = getattr(config, "rope_theta", 10000) rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None) rope_scaling = getattr(config, "rope_scaling", None)
max_position_embeddings = getattr(config, "max_position_embeddings", 8192) max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
moe_layer_freq = getattr(config, "moe_layer_freq", 1)
# DecoderLayers are created with `make_layers` which passes the prefix # DecoderLayers are created with `make_layers` which passes the prefix
# with the layer's index. # with the layer's index.
layer_idx = int(prefix.split(sep=".")[-1]) layer_idx = int(prefix.split(sep=".")[-1])
...@@ -1024,7 +1025,7 @@ class DeepseekV2DecoderLayer(nn.Module): ...@@ -1024,7 +1025,7 @@ class DeepseekV2DecoderLayer(nn.Module):
if ( if (
config.n_routed_experts is not None config.n_routed_experts is not None
and layer_idx >= config.first_k_dense_replace and layer_idx >= config.first_k_dense_replace
and layer_idx % config.moe_layer_freq == 0 and layer_idx % moe_layer_freq == 0
): ):
self.mlp = DeepseekV2MoE( self.mlp = DeepseekV2MoE(
config=config, config=config,
......
...@@ -50,7 +50,7 @@ from typing import Annotated, Any, Literal ...@@ -50,7 +50,7 @@ from typing import Annotated, Any, Literal
import torch import torch
from torch import nn from torch import nn
from transformers import BatchFeature from transformers import BatchFeature, DeepseekV2Config
from transformers.activations import GELUActivation from transformers.activations import GELUActivation
from vllm.config import VllmConfig from vllm.config import VllmConfig
...@@ -91,7 +91,6 @@ from vllm.multimodal.processing import ( ...@@ -91,7 +91,6 @@ from vllm.multimodal.processing import (
from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
from vllm.utils.tensor_schema import TensorSchema, TensorShape from vllm.utils.tensor_schema import TensorSchema, TensorShape
from .utils import PPMissingLayer, is_pp_missing_parameter, maybe_prefix from .utils import PPMissingLayer, is_pp_missing_parameter, maybe_prefix
......
...@@ -24,7 +24,7 @@ from huggingface_hub.utils import ( ...@@ -24,7 +24,7 @@ from huggingface_hub.utils import (
RepositoryNotFoundError, RepositoryNotFoundError,
RevisionNotFoundError, RevisionNotFoundError,
) )
from transformers import GenerationConfig, PretrainedConfig from transformers import DeepseekV3Config, GenerationConfig, PretrainedConfig
from transformers.models.auto.image_processing_auto import get_image_processor_config from transformers.models.auto.image_processing_auto import get_image_processor_config
from transformers.models.auto.modeling_auto import ( from transformers.models.auto.modeling_auto import (
MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
...@@ -68,16 +68,18 @@ def _get_hf_token() -> str | None: ...@@ -68,16 +68,18 @@ def _get_hf_token() -> str | None:
class LazyConfigDict(dict): class LazyConfigDict(dict):
def __getitem__(self, key): def __getitem__(self, key):
if isinstance(value := super().__getitem__(key), type):
return value
import vllm.transformers_utils.configs as configs import vllm.transformers_utils.configs as configs
return getattr(configs, super().__getitem__(key)) return getattr(configs, value)
_CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict( _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
chatglm="ChatGLMConfig", chatglm="ChatGLMConfig",
deepseek_vl_v2="DeepseekVLV2Config", deepseek_vl_v2="DeepseekVLV2Config",
deepseek_v3="DeepseekV3Config", deepseek_v32=DeepseekV3Config,
deepseek_v32="DeepseekV3Config",
flex_olmo="FlexOlmoConfig", flex_olmo="FlexOlmoConfig",
kimi_linear="KimiLinearConfig", kimi_linear="KimiLinearConfig",
kimi_vl="KimiVLConfig", kimi_vl="KimiVLConfig",
......
...@@ -8,7 +8,6 @@ Model configs may be defined in this directory for the following reasons: ...@@ -8,7 +8,6 @@ Model configs may be defined in this directory for the following reasons:
""" """
from vllm.transformers_utils.configs.chatglm import ChatGLMConfig from vllm.transformers_utils.configs.chatglm import ChatGLMConfig
from vllm.transformers_utils.configs.deepseek_v3 import DeepseekV3Config
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekVLV2Config
from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig from vllm.transformers_utils.configs.dotsocr import DotsOCRConfig
from vllm.transformers_utils.configs.eagle import EAGLEConfig from vllm.transformers_utils.configs.eagle import EAGLEConfig
...@@ -43,7 +42,6 @@ from vllm.transformers_utils.configs.ultravox import UltravoxConfig ...@@ -43,7 +42,6 @@ from vllm.transformers_utils.configs.ultravox import UltravoxConfig
__all__ = [ __all__ = [
"ChatGLMConfig", "ChatGLMConfig",
"DeepseekVLV2Config", "DeepseekVLV2Config",
"DeepseekV3Config",
"DotsOCRConfig", "DotsOCRConfig",
"EAGLEConfig", "EAGLEConfig",
"FlexOlmoConfig", "FlexOlmoConfig",
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class DeepseekV3Config(PretrainedConfig):
    """Hyperparameter container for the ``deepseek_v3`` model type.

    Every constructor argument is stored on the instance under the same
    name. The special-token ids, ``tie_word_embeddings`` and any extra
    ``**kwargs`` are handed to ``PretrainedConfig.__init__`` after the
    model-specific attributes have been set.
    """

    model_type = "deepseek_v3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=129280,
        hidden_size=7168,
        intermediate_size=18432,
        moe_intermediate_size=2048,
        num_hidden_layers=61,
        num_nextn_predict_layers=1,
        num_attention_heads=128,
        num_key_value_heads=128,
        n_shared_experts=1,
        n_routed_experts=256,
        ep_size=1,
        routed_scaling_factor=2.5,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method="noaux_tc",
        n_group=8,
        topk_group=4,
        num_experts_per_tok=8,
        moe_layer_freq=1,
        first_k_dense_replace=3,
        norm_topk_prob=True,
        scoring_func="sigmoid",
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        # Model dimensions.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.num_attention_heads = num_attention_heads

        # Expert counts and routing scale.
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor

        # LoRA ranks and per-head dimensions.
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim

        # Expert selection settings.
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func

        # Backward compatibility: an omitted KV head count falls back to the
        # number of attention heads.
        self.num_key_value_heads = (
            num_attention_heads
            if num_key_value_heads is None
            else num_key_value_heads
        )

        # Remaining scalar settings.
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        # The base class receives the token ids, the embedding-tying flag and
        # whatever keyword arguments were not consumed above.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268 # adapted from https://github.com/deepseek-ai/DeepSeek-VL2/blob/faf18023f24b962b32d9f0a2d89e402a8d383a78/deepseek_vl2/models/modeling_deepseek_vl_v2.py#L115-L268
from transformers.configuration_utils import PretrainedConfig from transformers import DeepseekV2Config, PretrainedConfig
class VisionEncoderConfig(PretrainedConfig): class VisionEncoderConfig(PretrainedConfig):
...@@ -87,106 +87,6 @@ class MlpProjectorConfig(PretrainedConfig): ...@@ -87,106 +87,6 @@ class MlpProjectorConfig(PretrainedConfig):
super().__init__(**kwargs) super().__init__(**kwargs)
class DeepseekV2Config(PretrainedConfig):
    """Hyperparameter container for the ``deepseek_v2`` model type.

    Every constructor argument is stored on the instance under the same
    name (``rms_norm_eps`` is coerced to ``float`` first). The special-token
    ids, ``tie_word_embeddings`` and any extra ``**kwargs`` are handed to
    ``PretrainedConfig.__init__`` after the model-specific attributes have
    been set.
    """

    model_type = "deepseek_v2"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=102400,
        hidden_size=4096,
        intermediate_size=11008,
        moe_intermediate_size=1407,
        num_hidden_layers=30,
        num_attention_heads=32,
        num_key_value_heads=32,
        n_shared_experts=None,
        n_routed_experts=None,
        ep_size=1,
        routed_scaling_factor=1.0,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        # NOTE(review): the spelling "gready" is reproduced exactly from the
        # original default; confirm against consumers before changing it.
        topk_method="gready",
        n_group=None,
        topk_group=None,
        num_experts_per_tok=None,
        moe_layer_freq=1,
        first_k_dense_replace=0,
        norm_topk_prob=False,
        scoring_func="softmax",
        aux_loss_alpha=0.001,
        seq_aux=True,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=100000,
        eos_token_id=100001,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        use_mla=True,
        **kwargs,
    ):
        # Model dimensions.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Expert counts and routing scale.
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor

        # LoRA ranks and per-head dimensions.
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim

        # Expert selection settings and auxiliary-loss options.
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux

        # Backward compatibility: an omitted KV head count falls back to the
        # number of attention heads.
        self.num_key_value_heads = (
            num_attention_heads
            if num_key_value_heads is None
            else num_key_value_heads
        )

        # Remaining scalar settings; the epsilon is always stored as a float.
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = float(rms_norm_eps)
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.use_mla = use_mla

        # The base class receives the token ids, the embedding-tying flag and
        # whatever keyword arguments were not consumed above.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
class DeepseekVLV2Config(PretrainedConfig): class DeepseekVLV2Config(PretrainedConfig):
model_type = "deepseek_vl_v2" model_type = "deepseek_vl_v2"
vision_config: VisionEncoderConfig vision_config: VisionEncoderConfig
......
...@@ -3,9 +3,7 @@ ...@@ -3,9 +3,7 @@
import os import os
from transformers import AutoConfig, PretrainedConfig from transformers import AutoConfig, DeepseekV2Config, PretrainedConfig
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
class EAGLEConfig(PretrainedConfig): class EAGLEConfig(PretrainedConfig):
...@@ -20,12 +18,6 @@ class EAGLEConfig(PretrainedConfig): ...@@ -20,12 +18,6 @@ class EAGLEConfig(PretrainedConfig):
): ):
model_config: PretrainedConfig | DeepseekV2Config | None model_config: PretrainedConfig | DeepseekV2Config | None
if isinstance(model, dict): if isinstance(model, dict):
archs = model.get("architectures", [])
target_archs = ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]
if any(target_arch in archs for target_arch in target_archs):
# AutoConfig does not support DeepSeek MoE models yet
model_config = DeepseekV2Config(**model)
else:
model_config = AutoConfig.for_model(**model) model_config = AutoConfig.for_model(**model)
else: else:
model_config = model model_config = model
......
...@@ -2,9 +2,9 @@ ...@@ -2,9 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py # Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
from transformers import DeepseekV2Config
from transformers.configuration_utils import PretrainedConfig from transformers.configuration_utils import PretrainedConfig
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
from vllm.transformers_utils.configs.moonvit import MoonViTConfig from vllm.transformers_utils.configs.moonvit import MoonViTConfig
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment