Unverified commit aadb6565, authored by Cyrus Leung and committed by GitHub
Browse files

[Misc] Clean up Kimi-VL (#16833)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent 87e067de
...@@ -376,9 +376,9 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -376,9 +376,9 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs( engine_args = EngineArgs(
model="moonshotai/Kimi-VL-A3B-Instruct", model="moonshotai/Kimi-VL-A3B-Instruct",
max_model_len=4096,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={"image": 1},
) )
return ModelRequestData( return ModelRequestData(
......
...@@ -331,11 +331,10 @@ def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -331,11 +331,10 @@ def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
max_num_seqs=4, max_num_seqs=4,
tensor_parallel_size=1,
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
trust_remote_code=True,
) )
placeholders = [{"type": "image", "image": url} for url in image_urls] placeholders = [{"type": "image", "image": url} for url in image_urls]
......
...@@ -56,7 +56,6 @@ from transformers.activations import GELUActivation ...@@ -56,7 +56,6 @@ from transformers.activations import GELUActivation
from vllm.config import VllmConfig from vllm.config import VllmConfig
from vllm.distributed import (get_tensor_model_parallel_rank, from vllm.distributed import (get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size) get_tensor_model_parallel_world_size)
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.fused_moe import FusedMoE
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
...@@ -70,22 +69,20 @@ from vllm.model_executor.models.moonvit import MoonVitPretrainedModel ...@@ -70,22 +69,20 @@ from vllm.model_executor.models.moonvit import MoonVitPretrainedModel
from vllm.model_executor.models.utils import merge_multimodal_embeddings from vllm.model_executor.models.utils import merge_multimodal_embeddings
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
NestedTensors) MultiModalKwargs, NestedTensors)
from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
MultiModalDataItems) MultiModalDataItems)
from vllm.multimodal.processing import (BaseMultiModalProcessor, from vllm.multimodal.processing import (BaseMultiModalProcessor,
BaseProcessingInfo, PromptReplacement, BaseProcessingInfo, PromptReplacement,
PromptUpdate) PromptUpdate)
from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig from vllm.transformers_utils.configs import KimiVLConfig, MoonViTConfig
from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config from vllm.transformers_utils.configs.deepseek_vl2 import DeepseekV2Config
from .utils import is_pp_missing_parameter, maybe_prefix from .utils import is_pp_missing_parameter, maybe_prefix
logger = init_logger(__name__)
# For dummy input only # For dummy input only
@dataclass @dataclass
...@@ -143,6 +140,9 @@ class KimiVLProcessingInfo(BaseProcessingInfo): ...@@ -143,6 +140,9 @@ class KimiVLProcessingInfo(BaseProcessingInfo):
def get_hf_config(self): def get_hf_config(self):
return self.ctx.get_hf_config(KimiVLConfig) return self.ctx.get_hf_config(KimiVLConfig)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
return {"image": None}
def get_num_image_tokens( def get_num_image_tokens(
self, self,
*, *,
...@@ -180,23 +180,6 @@ class KimiVLProcessingInfo(BaseProcessingInfo): ...@@ -180,23 +180,6 @@ class KimiVLProcessingInfo(BaseProcessingInfo):
token_width = (width + pad_width) // (kernel_size[1] * patch_size) token_width = (width + pad_width) // (kernel_size[1] * patch_size)
return int(token_height * token_width) return int(token_height * token_width)
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
# None means unlimited
return {"image": None}
def get_mm_max_tokens_per_item(
self,
seq_len: int,
mm_counts: Mapping[str, int],
) -> Mapping[str, int]:
return {
"image":
self.get_num_image_tokens(
image_width=MaxImageTokenMeta.width,
image_height=MaxImageTokenMeta.height,
),
}
@property @property
def image_token_id(self) -> int: def image_token_id(self) -> int:
return self.get_hf_config().media_placeholder_token_id return self.get_hf_config().media_placeholder_token_id
...@@ -204,34 +187,28 @@ class KimiVLProcessingInfo(BaseProcessingInfo): ...@@ -204,34 +187,28 @@ class KimiVLProcessingInfo(BaseProcessingInfo):
class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]): class KimiVLDummyInputsBuilder(BaseDummyInputsBuilder[KimiVLProcessingInfo]):
def __init__(self, info: KimiVLProcessingInfo) -> None: def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
super().__init__(info) num_images = mm_counts.get("image", 0)
processor = self.info.get_hf_processor()
image_token = processor.image_token
self.image_token_id = self.info.image_token_id return image_token * num_images
self.image_token = self.info.get_tokenizer().decode(
self.image_token_id)
def get_dummy_processor_inputs( def get_dummy_mm_data(
self, self,
seq_len: int, seq_len: int,
mm_counts: Mapping[str, int], mm_counts: Mapping[str, int],
) -> ProcessorInputs: ) -> MultiModalDataDict:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
width = MaxImageTokenMeta.width return {
height = MaxImageTokenMeta.height
mm_data = {
"image": "image":
self._get_dummy_images(width=width, self._get_dummy_images(width=MaxImageTokenMeta.width,
height=height, height=MaxImageTokenMeta.height,
num_images=num_images) num_images=num_images)
} }
return ProcessorInputs(
prompt_text=self.image_token * num_images,
mm_data=mm_data,
)
class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]): class KimiVLMultiModalProcessor(BaseMultiModalProcessor[KimiVLProcessingInfo]):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment