Unverified Commit b1308b84 authored by courage17340's avatar courage17340 Committed by GitHub
Browse files

[Model][VLM] Add Kimi-VL model support (#16387)


Signed-off-by: default avatarcourage17340 <courage17340@163.com>
parent 7b5ecf79
......@@ -886,6 +886,13 @@ See [this page](#generative-models) for more information on how to use generativ
*
* ✅︎
* ✅︎
- * `KimiVLForConditionalGeneration`
* Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking
* T + I<sup>+</sup>
* `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking`
*
*
* ✅︎
- * `Llama4ForConditionalGeneration`
* Llama 4
* T + I<sup>+</sup>
......
......@@ -364,6 +364,29 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
)
# Kimi-VL
def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
prompts = [
"<|im_user|>user<|im_middle|><|media_start|>image<|media_content|>"
f"<|media_pad|><|media_end|>{question}<|im_end|>"
"<|im_assistant|>assistant<|im_middle|>" for question in questions
]
engine_args = EngineArgs(
model="moonshotai/Kimi-VL-A3B-Instruct",
max_model_len=4096,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
trust_remote_code=True,
)
return ModelRequestData(
engine_args=engine_args,
prompts=prompts,
)
# LLaVA-1.5
def run_llava(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
......@@ -966,6 +989,7 @@ model_example_map = {
"h2ovl_chat": run_h2ovl,
"idefics3": run_idefics3,
"internvl_chat": run_internvl,
"kimi_vl": run_kimi_vl,
"llava": run_llava,
"llava-next": run_llava_next,
"llava-next-video": run_llava_next_video,
......
......@@ -326,6 +326,45 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "moonshotai/Kimi-VL-A3B-Instruct"
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_num_seqs=4,
tensor_parallel_size=1,
limit_mm_per_prompt={"image": len(image_urls)},
trust_remote_code=True,
)
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [{
"role":
"user",
"content": [
*placeholders,
{
"type": "text",
"text": question
},
],
}]
processor = AutoProcessor.from_pretrained(model_name,
trust_remote_code=True)
prompt = processor.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image_data=[fetch_image(url) for url in image_urls],
)
def load_mistral3(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
......@@ -640,6 +679,7 @@ model_example_map = {
"h2ovl_chat": load_h2ovl,
"idefics3": load_idefics3,
"internvl_chat": load_internvl,
"kimi_vl": load_kimi_vl,
"llama4": load_llama4,
"mistral3": load_mistral3,
"mllama": load_mllama,
......
......@@ -10,6 +10,7 @@ pytest-timeout
# testing utils
awscli
backoff # required for phi4mm test
blobfile # required for kimi-vl test
einops # required for MPT, qwen-vl and Mamba
httpx
librosa # required for audio tests
......
......@@ -39,6 +39,8 @@ bitsandbytes==0.45.3
# via -r requirements/test.in
black==24.10.0
# via datamodel-code-generator
blobfile==3.0.0
# via -r requirements/test.in
boto3==1.35.57
# via tensorizer
botocore==1.35.57
......@@ -127,6 +129,7 @@ fastsafetensors==0.1.10
# via -r requirements/test.in
filelock==3.16.1
# via
# blobfile
# datasets
# huggingface-hub
# ray
......@@ -227,7 +230,9 @@ llvmlite==0.44.0
lm-eval==0.4.8
# via -r requirements/test.in
lxml==5.3.0
# via sacrebleu
# via
# blobfile
# sacrebleu
markdown-it-py==3.0.0
# via rich
markupsafe==3.0.2
......@@ -426,6 +431,8 @@ pybind11==2.13.6
# via lm-eval
pycparser==2.22
# via cffi
pycryptodomex==3.22.0
# via blobfile
pydantic==2.9.2
# via
# datamodel-code-generator
......@@ -689,6 +696,7 @@ tzdata==2024.2
# via pandas
urllib3==2.2.3
# via
# blobfile
# botocore
# requests
# responses
......
......@@ -318,6 +318,18 @@ VLM_TEST_SETTINGS = {
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
),
"kimi_vl": VLMTestInfo(
models=["moonshotai/Kimi-VL-A3B-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501
img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>", # noqa: E501
max_model_len=8192,
max_num_seqs=2,
dtype="bfloat16",
tensor_parallel_size=1,
vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output,
marks=[large_gpu_mark(min_gb=48)],
),
"llama4": VLMTestInfo(
models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
......
......@@ -68,6 +68,17 @@ def qwen2_vllm_to_hf_output(
return output_ids, hf_output_str, out_logprobs
def kimiv_vl_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
"""Sanitize vllm output [kimi_vl models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
hf_output_str = output_str + "<|im_end|>[EOS]"
return output_ids, hf_output_str, out_logprobs
def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
config = AutoConfig.from_pretrained(model)
......
......@@ -258,6 +258,7 @@ def _test_processing_correctness_mistral(
"OpenGVLab/InternVL2-1B",
"HuggingFaceM4/Idefics3-8B-Llama3",
"HuggingFaceTB/SmolVLM2-2.2B-Instruct",
"moonshotai/Kimi-VL-A3B-Instruct",
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
"llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-v1.6-mistral-7b-hf",
......
......@@ -302,6 +302,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code=True),
"Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501
{"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501
"KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501
trust_remote_code=True),
"Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
min_transformers_version="4.51"),
"LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
......
......@@ -512,6 +512,8 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
return "<|fim_prefix|><|img|><|fim_suffix|>"
if model_type == "gemma3":
return "<start_of_image>"
if model_type == "kimi_vl":
return "<|media_start|>image<|media_content|><|media_pad|><|media_end|>" # noqa: E501
raise TypeError(f"Unknown {modality} model type: {model_type}")
elif modality == "audio":
......
This diff is collapsed.
This diff is collapsed.
......@@ -177,6 +177,7 @@ _MULTIMODAL_MODELS = {
"InternVLChatModel": ("internvl", "InternVLChatModel"),
"Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"),
"SmolVLMForConditionalGeneration": ("smolvlm","SmolVLMForConditionalGeneration"), # noqa: E501
"KimiVLForConditionalGeneration": ("kimi_vl", "KimiVLForConditionalGeneration"), # noqa: E501
"LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
"LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501
"LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501
......
......@@ -33,12 +33,13 @@ from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config,
EAGLEConfig, ExaoneConfig,
H2OVLChatConfig,
InternVLChatConfig, JAISConfig,
MedusaConfig, MllamaConfig,
MLPSpeculatorConfig, MPTConfig,
NemotronConfig, NVLM_D_Config,
Olmo2Config, RWConfig,
SkyworkR1VChatConfig, SolarConfig,
Telechat2Config, UltravoxConfig)
KimiVLConfig, MedusaConfig,
MllamaConfig, MLPSpeculatorConfig,
MPTConfig, NemotronConfig,
NVLM_D_Config, Olmo2Config,
RWConfig, SkyworkR1VChatConfig,
SolarConfig, Telechat2Config,
UltravoxConfig)
# yapf: enable
from vllm.transformers_utils.utils import check_gguf_file
from vllm.utils import resolve_obj_by_qualname
......@@ -62,6 +63,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
"cohere2": Cohere2Config,
"dbrx": DbrxConfig,
"deepseek_vl_v2": DeepseekVLV2Config,
"kimi_vl": KimiVLConfig,
"mpt": MPTConfig,
"RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct)
"RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct)
......
......@@ -13,9 +13,11 @@ from vllm.transformers_utils.configs.falcon import RWConfig
from vllm.transformers_utils.configs.h2ovl import H2OVLChatConfig
from vllm.transformers_utils.configs.internvl import InternVLChatConfig
from vllm.transformers_utils.configs.jais import JAISConfig
from vllm.transformers_utils.configs.kimi_vl import KimiVLConfig
from vllm.transformers_utils.configs.medusa import MedusaConfig
from vllm.transformers_utils.configs.mllama import MllamaConfig
from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig
from vllm.transformers_utils.configs.moonvit import MoonViTConfig
from vllm.transformers_utils.configs.mpt import MPTConfig
from vllm.transformers_utils.configs.nemotron import NemotronConfig
from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config
......@@ -40,6 +42,8 @@ __all__ = [
"ExaoneConfig",
"MllamaConfig",
"MLPSpeculatorConfig",
"MoonViTConfig",
"KimiVLConfig",
"NemotronConfig",
"NVLM_D_Config",
"Olmo2Config",
......
# SPDX-License-Identifier: Apache-2.0
# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
from typing import Optional, Union
from transformers.configuration_utils import PretrainedConfig
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
from vllm.transformers_utils.configs.moonvit import MoonViTConfig
class KimiVLConfig(PretrainedConfig):
model_type = "kimi_vl"
def __init__(self,
vision_config: Optional[Union[dict, MoonViTConfig]] = None,
text_config: Optional[Union[dict, DeepseekV2Config]] = None,
ignore_index: int = -100,
media_placeholder_token_id: int = 163605,
pad_token_id: int = 0,
**kwargs):
if vision_config is None:
vision_config = MoonViTConfig()
elif isinstance(vision_config, dict):
vision_config = MoonViTConfig(**vision_config)
self.vision_config = vision_config
if text_config is None:
text_config = DeepseekV2Config()
elif isinstance(text_config, dict):
text_config = DeepseekV2Config(**text_config)
self.text_config = text_config
self.ignore_index = ignore_index
self.media_placeholder_token_id = media_placeholder_token_id
super().__init__(pad_token_id=pad_token_id, **kwargs)
# SPDX-License-Identifier: Apache-2.0
# Adapted from https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/blob/main/configuration_kimi_vl.py
from transformers.configuration_utils import PretrainedConfig
class MoonViTConfig(PretrainedConfig):
model_type = "moonvit"
def __init__(
self,
patch_size: int = 14,
init_pos_emb_height: int = 64,
init_pos_emb_width: int = 64,
num_attention_heads: int = 16,
num_hidden_layers: int = 27,
hidden_size: int = 1152,
intermediate_size: int = 4304,
merge_kernel_size: tuple[int, int] = (2, 2),
**kwargs,
):
super().__init__(**kwargs)
self.patch_size = patch_size
# Positional embedding config
self.init_pos_emb_height = init_pos_emb_height
self.init_pos_emb_width = init_pos_emb_width
# Transformer config
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
# Patch merger config
self.merge_kernel_size = merge_kernel_size
......@@ -997,13 +997,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# Return empty ModelRunnerOutput if there's no work to do.
return EMPTY_MODEL_RUNNER_OUTPUT
if self.is_multimodal_model:
# Run the multimodal encoder if any.
self._execute_mm_encoder(scheduler_output)
mm_embeds = self._gather_mm_embeddings(scheduler_output)
else:
mm_embeds = []
# Prepare the decoder inputs.
attn_metadata, logits_indices, spec_decode_metadata = (
self._prepare_inputs(scheduler_output))
......@@ -1019,6 +1012,15 @@ class GPUModelRunner(LoRAModelRunnerMixin):
num_input_tokens = num_scheduled_tokens
attn_metadata.num_input_tokens = num_input_tokens
# _prepare_inputs may reorder the batch, so we must gather multi
# modal outputs after that to ensure the correct order
if self.is_multimodal_model:
# Run the multimodal encoder if any.
self._execute_mm_encoder(scheduler_output)
mm_embeds = self._gather_mm_embeddings(scheduler_output)
else:
mm_embeds = []
if self.is_multimodal_model:
# NOTE(woosuk): To unify token ids and soft tokens (vision
# embeddings), we always use embeddings (rather than token ids)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment