Unverified Commit 04d0123f authored by ZXN, committed by GitHub

[Fix]: support deepseek-vl2-tiny model (#5552)


Co-authored-by: bppps <zouyu.zzx@alibaba-inc.com>
parent feda9b11
@@ -182,7 +182,7 @@ class DeepseekVLV2Processor(ProcessorMixin):
         tokenized_str, images, seq_mask, spatial_crop = self.tokenize_with_images(
             messages,
             pil_images[image_index : image_index + image_token_cnt],
-            bos=False,
+            bos=True,
             eos=True,
             cropping=len(pil_images) <= 2,
             max_req_input_len=max_req_input_len,
...
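Note on the `bos` flip above: the processor now prepends the BOS token when tokenizing image-bearing prompts. A minimal sketch of the convention assumed here (the helper and token ids are hypothetical, not part of this diff):

```python
def apply_special_tokens(
    ids: list[int], bos_id: int, eos_id: int, bos: bool = True, eos: bool = True
) -> list[int]:
    """Hypothetical illustration of the bos/eos flags: wrap an already
    tokenized prompt with the BOS and/or EOS token ids."""
    if bos:
        ids = [bos_id] + ids
    if eos:
        ids = ids + [eos_id]
    return ids

assert apply_special_tokens([5, 6], bos_id=0, eos_id=1) == [0, 5, 6, 1]
```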
@@ -162,7 +162,9 @@ class ModelConfig:
             self.attention_arch = AttentionArch.MLA
             self.kv_lora_rank = self.hf_config.kv_lora_rank
             self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
-        elif "DeepseekVL2ForCausalLM" in self.hf_config.architectures:
+        elif "DeepseekVL2ForCausalLM" in self.hf_config.architectures and getattr(
+            self.hf_text_config, "use_mla", True
+        ):
             self.head_dim = 256
             self.attention_arch = AttentionArch.MLA
             self.kv_lora_rank = self.hf_text_config.kv_lora_rank
...
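The `getattr` guard matters for backward compatibility: text configs that predate the `use_mla` field keep the MLA path, while deepseek-vl2-tiny, whose config sets `use_mla=False`, falls through to the non-MLA attention setup. A minimal sketch with stand-in config objects:

```python
from types import SimpleNamespace

# Stand-in configs (illustrative only): the tiny variant disables MLA,
# while a config without the field keeps the previous default behavior.
tiny_text_cfg = SimpleNamespace(use_mla=False)
legacy_text_cfg = SimpleNamespace()  # no use_mla attribute

assert getattr(tiny_text_cfg, "use_mla", True) is False   # skips the MLA branch
assert getattr(legacy_text_cfg, "use_mla", True) is True  # unchanged behavior
```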
@@ -463,6 +463,30 @@ def generate_embedding_convs(
     return convs


+# Models in which the system automatically adds modality tokens at the start of the
+# prompt when media inputs exceed its modality tokens (e.g. 3 images but 2 <image> tokens)
+_MODELS_REQUIRING_MODALITY_SUPPLEMENT = {"deepseek-vl2"}
+
+
+# adapted from https://github.com/vllm-project/vllm/blob/5124f5bf51b83e6f344c1bc6652e8c4d81313b34/vllm/entrypoints/chat_utils.py#L856
+def _get_full_multimodal_text_prompt(
+    modality_token: str, modality_count: int, text_prompt: str
+) -> str:
+    """Combine multimodal prompts for a multimodal language model."""
+
+    # Any existing placeholders in the text prompt are left as-is.
+    left: int = modality_count - text_prompt.count(modality_token)
+    if left < 0:
+        raise ValueError(
+            f"Found more '{modality_token}' placeholders in input prompt than "
+            "actual multimodal data items."
+        )
+
+    # NOTE: For now we always add missing modality tokens at the front of
+    # the prompt. This may change to be customizable in the future.
+    return "\n".join([modality_token] * left + [text_prompt])
+
+
 def generate_chat_conv(
     request: ChatCompletionRequest, template_name: str
 ) -> Conversation:
@@ -520,6 +544,12 @@ def generate_chat_conv(
                     if conv.name != "qwen2-vl"
                     else conv.image_token
                 )
+                add_token_as_needed: bool = (
+                    conv.name in _MODELS_REQUIRING_MODALITY_SUPPLEMENT
+                )
+                if add_token_as_needed:
+                    image_token = ""
+
                 audio_token = conv.audio_token
                 for content in message.content:
                     if content.type == "text":
@@ -533,7 +563,10 @@ def generate_chat_conv(
                     elif content.type == "audio_url":
                         real_content += audio_token
                         conv.append_audio(content.audio_url.url)
-
+                if add_token_as_needed:
+                    real_content = _get_full_multimodal_text_prompt(
+                        conv.image_token, num_image_url, real_content
+                    )
                 conv.append_message(conv.roles[0], real_content)
         elif msg_role == "assistant":
             parsed_content = ""
...
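Concretely, the new helper prepends any missing placeholders, one per line, while leaving existing ones untouched. A runnable usage sketch (the function body mirrors the one added above, restated so the example runs standalone):

```python
def _get_full_multimodal_text_prompt(
    modality_token: str, modality_count: int, text_prompt: str
) -> str:
    # Same logic as the helper in this diff, with a shortened error message.
    left = modality_count - text_prompt.count(modality_token)
    if left < 0:
        raise ValueError("more placeholders than multimodal data items")
    return "\n".join([modality_token] * left + [text_prompt])

# Three images but only one <image> placeholder: two tokens are prepended.
prompt = _get_full_multimodal_text_prompt("<image>", 3, "Compare these: <image>")
assert prompt == "<image>\n<image>\nCompare these: <image>"
```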
@@ -382,8 +382,14 @@ class DeepseekModel(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
-        hidden_states = self.embed_tokens(input_ids)
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
         residual = None
         for i in range(len(self.layers)):
             layer = self.layers[i]
@@ -416,14 +422,18 @@ class DeepseekForCausalLM(nn.Module):
         )
         self.logits_processor = LogitsProcessor(config)

+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.model.embed_tokens
+
     @torch.no_grad()
     def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, forward_batch)
+        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
         return self.logits_processor(
             input_ids, hidden_states, self.lm_head, forward_batch
         )
...
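The `input_embeds` plumbing lets a multimodal wrapper embed the text tokens itself via `get_input_embeddings()`, splice vision features into the image-placeholder positions, and pass the merged tensor down, so `DeepseekModel` skips its own `embed_tokens` lookup. A sketch of that merge step with a hypothetical helper (not part of this diff):

```python
import torch

def merge_multimodal_embeds(
    input_ids: torch.Tensor,     # (seq_len,) token ids
    text_embeds: torch.Tensor,   # (seq_len, hidden), from get_input_embeddings()
    image_embeds: torch.Tensor,  # (num_image_tokens, hidden) projected vision features
    image_token_id: int,
) -> torch.Tensor:
    # Overwrite the placeholder positions with the image features; the result
    # is what would be passed to forward(..., input_embeds=merged).
    merged = text_embeds.clone()
    merged[input_ids == image_token_id] = image_embeds.to(merged.dtype)
    return merged
```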
@@ -18,6 +18,7 @@ from sglang.srt.managers.mm_utils import (
 from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.deepseek import DeepseekForCausalLM
 from sglang.srt.models.deepseek_v2 import DeepseekV2ForCausalLM

@@ -189,7 +190,11 @@ class DeepseekVL2ForCausalLM(nn.Module):
         # ----------- language model ------------
         language_config = config.language_config
-        self.language_model = DeepseekV2ForCausalLM(language_config)
+        if language_config.use_mla:
+            self.language_model = DeepseekV2ForCausalLM(language_config)
+        else:
+            # deepseek-vl2-tiny does not use MLA
+            self.language_model = DeepseekForCausalLM(language_config)

     def _init_vision_module(
         self, vision_config, quant_config: Optional[QuantizationConfig]
...
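To see which branch a given checkpoint takes, the flag can be read straight from its `config.json` (the key location is inferred from this diff, not verified here; the sketch assumes `huggingface_hub` is installed):

```python
import json
from huggingface_hub import hf_hub_download

# Illustrative check: deepseek-vl2-tiny is expected to report use_mla == False,
# which routes construction to DeepseekForCausalLM above.
path = hf_hub_download("deepseek-ai/deepseek-vl2-tiny", "config.json")
with open(path) as f:
    cfg = json.load(f)
print(cfg.get("language_config", {}).get("use_mla"))
```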
@@ -654,6 +654,30 @@ class TestDeepseekVL2Server(TestOpenAIVisionServer):
         pass


+class TestDeepseekVL2TinyServer(TestOpenAIVisionServer):
+    @classmethod
+    def setUpClass(cls):
+        cls.model = "deepseek-ai/deepseek-vl2-tiny"
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=[
+                "--trust-remote-code",
+                "--chat-template",
+                "deepseek-vl2",
+                "--context-length",
+                "4096",
+            ],
+        )
+        cls.base_url += "/v1"
+
+    def test_video_chat_completion(self):
+        pass
+
+
 class TestJanusProServer(TestOpenAIVisionServer):
     @classmethod
     def setUpClass(cls):
...
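For reference, a hedged sketch of exercising the launched test server with an OpenAI-compatible vision request; the base URL below is a placeholder for whatever `DEFAULT_URL_FOR_TEST` resolves to, and the image URL is illustrative:

```python
import openai

client = openai.OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="sk-123456")
response = client.chat.completions.create(
    model="deepseek-ai/deepseek-vl2-tiny",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ],
)
print(response.choices[0].message.content)
```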