Unverified commit 03dbf1aa authored by tc-mb, committed by GitHub

[model] support MiniCPM-V 4.0 (#8747)


Signed-off-by: tc-mb <caitianchi@modelbest.cn>
Co-authored-by: Xinyuan Tong <115166877+JustinTong0323@users.noreply.github.com>
parent 11dcabc5
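
Once a server is launched with the newly supported checkpoint (openbmb/MiniCPM-V-4, as used in the tests below), it can be queried through the OpenAI-compatible endpoint. A minimal sketch using the OpenAI Python client; the port and image URL are placeholders, while the API key and "/v1" suffix mirror the test setup further down:

from openai import OpenAI

# Assumes a locally launched SGLang server; 30000 is a placeholder port.
client = OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="sk-123456")

response = client.chat.completions.create(
    model="openbmb/MiniCPM-V-4",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image in one sentence."},
                # Placeholder image URL for illustration only.
                {"type": "image_url", "image_url": {"url": "https://example.com/sample.png"}},
            ],
        }
    ],
    temperature=0,
)
print(response.choices[0].message.content)
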
@@ -54,6 +54,7 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
from sglang.srt.model_loader.utils import set_default_torch_dtype
from sglang.srt.model_loader.weight_utils import default_weight_loader
from sglang.srt.models.idefics2 import Idefics2VisionTransformer
+from sglang.srt.models.llama import LlamaConfig, LlamaForCausalLM
from sglang.srt.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
from sglang.srt.utils import add_prefix, flatten_nested_list
@@ -581,7 +582,7 @@ class MiniCPMBaseModel(nn.Module):
    def init_llm(
        self,
-        config: Qwen2Config,
+        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> nn.Module:
@@ -774,7 +775,168 @@ class MiniCPMV2_6(MiniCPMBaseModel):
        return pattern.pad_input_tokens(input_ids, image_inputs)

-_SUPPORT_VERSION = {(2, 6): MiniCPMV2_6}


class MiniCPMV4_0(MiniCPMBaseModel):
    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    }

    # LoRA specific attributes
    supported_lora_modules = [
        # vision encoder
        "fc1",
        "fc2",
        "out_proj",
        # language model
        "qkv_proj",  # same name with vision encoder
        "o_proj",
        "gate_up_proj",
        "down_proj",
        # resampler
        "kv_proj",
    ]

    # BitandBytes specific attributes
    bitsandbytes_stacked_params_mapping = {
        # shard_name, weight_name, index
        "q_proj": ("qkv_proj", 0),
        "k_proj": ("qkv_proj", 1),
        "v_proj": ("qkv_proj", 2),
        "gate_proj": ("gate_up_proj", 0),
        "up_proj": ("gate_up_proj", 1),
    }

    embedding_modules = {}
    embedding_padding_modules = []

    def __init__(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ):
        super().__init__(config=config, quant_config=quant_config, prefix=prefix)
        assert self.version == (4, 0)

    def init_llm(
        self,
        config: LlamaConfig,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> nn.Module:
        return LlamaForCausalLM(config=config, quant_config=quant_config, prefix=prefix)

    def init_vision_module(
        self,
        config: PretrainedConfig,
        quant_config: Optional[QuantizationConfig],
        prefix: str = "",
    ) -> nn.Module:
        model = Idefics2VisionTransformer(
            config=config.vision_config, quant_config=quant_config, prefix=prefix
        )
        if self.config.drop_vision_last_layer:
            model.encoder.layers = model.encoder.layers[:-1]

        setattr(model, "embed_dim", model.embeddings.embed_dim)
        setattr(model, "patch_size", model.embeddings.patch_size)
        return model

    def init_resampler(
        self,
        embed_dim: int,
        vision_dim: int,
        quant_config: Optional[QuantizationConfig] = None,
        prefix: str = "",
    ) -> nn.Module:
        with set_default_torch_dtype(torch.float16):
            # The resampler in 2.6 remains consistent with the one in 2.5.
            resampler = Resampler2_5(
                num_queries=self.config.query_num,
                embed_dim=embed_dim,
                num_heads=embed_dim // 128,
                kv_dim=vision_dim,
                quant_config=quant_config,
                prefix=prefix,
            )

        return resampler.to(device="cuda", dtype=torch.get_default_dtype())

    def get_vision_embedding(
        self,
        pixel_values: List[torch.Tensor],
        patch_attn_mask: Optional[torch.Tensor] = None,
        tgt_sizes: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        vision_embedding = self.vpm(
            pixel_values,
            patch_attention_mask=patch_attn_mask,
            tgt_sizes=tgt_sizes,
        )
        return vision_embedding

    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
        # list of tensors
        pixel_values = flatten_nested_list([item.feature for item in items])
        tgt_sizes = torch.stack(
            flatten_nested_list([item.tgt_size for item in items]), dim=0
        )
        assert len(pixel_values) == tgt_sizes.shape[0]

        device = self.vpm.embeddings.position_embedding.weight.device
        dtype = self.vpm.embeddings.position_embedding.weight.dtype
        all_pixel_values_lst = [
            i.flatten(end_dim=1).permute(1, 0) for i in pixel_values
        ]

        max_patches = (tgt_sizes[:, 0] * tgt_sizes[:, 1]).max().item()
        assert isinstance(max_patches, int)

        all_pixel_values = torch.nn.utils.rnn.pad_sequence(
            all_pixel_values_lst, batch_first=True, padding_value=0.0
        )
        B, L, _ = all_pixel_values.shape
        all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
        patch_attn_mask = torch.zeros(
            (B, 1, max_patches), dtype=torch.bool, device=device
        )

        tgt_sizes_tensor = tgt_sizes.clone().to(device=patch_attn_mask.device)
        mask_shapes = tgt_sizes_tensor[:, 0] * tgt_sizes_tensor[:, 1]
        patch_attn_mask[:, 0, :] = torch.arange(
            patch_attn_mask.size(2), device=patch_attn_mask.device
        ).unsqueeze(0) < mask_shapes.unsqueeze(1)

        vision_embedding = self.vpm(
            all_pixel_values.type(dtype),
            patch_attention_mask=patch_attn_mask,
            tgt_sizes=tgt_sizes,
        )
        return self.resampler(vision_embedding, tgt_sizes)

    def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
        # Get all special token IDs
        im_start_id: int = image_inputs.im_start_id
        im_end_id: int = image_inputs.im_end_id
        slice_start_id: int = image_inputs.slice_start_id
        slice_end_id: int = image_inputs.slice_end_id

        media_token_pairs = [(im_start_id, im_end_id), (slice_start_id, slice_end_id)]
        pattern = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)

        return pattern.pad_input_tokens(input_ids, image_inputs)


_SUPPORT_VERSION = {
    (2, 6): MiniCPMV2_6,
    (4, 0): MiniCPMV4_0,
}


class MiniCPMV:

@@ -809,7 +971,7 @@ class MiniCPMV:
        # Dispatch class based on version
        instance_class = _SUPPORT_VERSION.get(version)
        if instance_class is None:
-            raise ValueError("Currently, MiniCPMV only supports versions 2.6")
+            raise ValueError("Currently, MiniCPMV only supports versions 2.6 and 4.0")
        try:
            minicpmv = instance_class(
......
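For readers following get_image_feature above: the images in a batch have different numbers of vision patches, so the flattened patch sequences are padded to a common length and a boolean mask marks the real patches. A minimal, self-contained sketch of that padding and mask construction with toy shapes; tensor names and sizes here are illustrative, not the model's:

import torch

# Toy batch: two images whose patch grids differ, given as (rows, cols) of patches.
tgt_sizes = torch.tensor([[4, 5], [3, 3]])
seq_lens = tgt_sizes[:, 0] * tgt_sizes[:, 1]   # 20 and 9 patches per image
max_patches = int(seq_lens.max())              # 20

# Per-image patch features of unequal length, padded to a common length.
pixel_seqs = [torch.randn(n, 3 * 14 * 14) for n in seq_lens.tolist()]
padded = torch.nn.utils.rnn.pad_sequence(pixel_seqs, batch_first=True, padding_value=0.0)

# Boolean mask: True for real patches, False for padding, built as in the model code.
patch_attn_mask = torch.arange(max_patches).unsqueeze(0) < seq_lens.unsqueeze(1)

print(padded.shape)           # torch.Size([2, 20, 588])
print(patch_attn_mask.shape)  # torch.Size([2, 20])
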
@@ -39,7 +39,7 @@ class TestCompressedTensorsLlama3FP8(CustomTestCase):
        )
        metrics = run_eval(args)
        print(f"{metrics=}")
-        self.assertGreater(metrics["accuracy"], 0.45)
+        self.assertGreaterEqual(metrics["accuracy"], 0.45)


if __name__ == "__main__":
......
@@ -165,6 +165,27 @@ class TestMinicpmvServer(ImageOpenAITestMixin):
        cls.base_url += "/v1"

class TestMinicpmv4Server(ImageOpenAITestMixin):
    @classmethod
    def setUpClass(cls):
        cls.model = "openbmb/MiniCPM-V-4"
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--trust-remote-code",
                "--mem-fraction-static",
                "0.35",
                "--cuda-graph-max-bs",
                "4",
            ],
        )
        cls.base_url += "/v1"

class TestInternVL2_5Server(ImageOpenAITestMixin):
    @classmethod
    def setUpClass(cls):

@@ -184,7 +205,7 @@ class TestInternVL2_5Server(ImageOpenAITestMixin):
        cls.base_url += "/v1"


-class TestMinicpmoServer(ImageOpenAITestMixin, AudioOpenAITestMixin):
+class TestMinicpmo2_6Server(ImageOpenAITestMixin, AudioOpenAITestMixin):
    @classmethod
    def setUpClass(cls):
        cls.model = "openbmb/MiniCPM-o-2_6"
......
@@ -161,7 +161,7 @@ class VisionLLMLogitsBase(unittest.IsolatedAsyncioTestCase):
        return self.model_runner.model


-class TestMiniCPMVLogits(VisionLLMLogitsBase):
+class TestMiniCPMV2_6Logits(VisionLLMLogitsBase):
    @classmethod
    def setUpClass(cls):
        super().setUpClass()

@@ -265,3 +265,60 @@ class TestMiniCPMVLogits(VisionLLMLogitsBase):
            )

        self.compare_outputs(sglang_output, hf_output)

class TestMiniCPMV4Logits(VisionLLMLogitsBase):
    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.model_path = "openbmb/MiniCPM-V-4"
        cls.tokenizer = AutoTokenizer.from_pretrained(
            cls.model_path, trust_remote_code=True
        )
        cls.processor = AutoProcessor.from_pretrained(
            cls.model_path, trust_remote_code=True
        )
        cls.chat_template = "minicpmv"

        cls.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        cls.hf_model = (
            AutoModel.from_pretrained(
                cls.model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
            )
            .eval()
            .to(cls.device)
        )
        init_embedding_cache()

    async def test_vlm_embedding_output(self):
        """
        Compares the embedding output of vlm
        """
        inputs = self.get_processor_output()

        with torch.no_grad():
            # hf
            model_inputs = {
                "input_ids": inputs.input_ids,
                "image_bound": inputs.image_bound,
                "pixel_values": inputs.pixel_values,
                "tgt_sizes": inputs.tgt_sizes,
            }
            hf_output = self.hf_model.get_input_embeddings()(inputs.input_ids)

            # sglang
            model = self.get_model()
            sglang_output = self.vlm_func(
                model,
                input_ids=inputs.input_ids.to(self.device),
                pixel_values=inputs.pixel_values,
                image_bound=inputs.image_bound.to(self.device),
                tgt_sizes=inputs.tgt_sizes.to(self.device),
                input_embedding=model.get_input_embeddings(),
                multimodal_model=model,
                placeholder_tokens={
                    Modality.IMAGE: self.processor.tokenizer.unk_token_id,
                },
            )

        self.compare_outputs(sglang_output, hf_output)
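
The embedding test above ultimately checks that SGLang and the HuggingFace reference map identical token IDs to numerically close vectors. A stripped-down sketch of that comparison with toy embedding tables; the helper name and tolerance are illustrative, not the test suite's own utilities:

import torch

def compare_embeddings(hf_embed: torch.nn.Embedding, sgl_embed: torch.nn.Embedding,
                       input_ids: torch.Tensor, atol: float = 1e-2) -> bool:
    # Run both embedding tables on the same token IDs and compare elementwise.
    with torch.no_grad():
        hf_out = hf_embed(input_ids)
        sgl_out = sgl_embed(input_ids)
    return torch.allclose(hf_out.float(), sgl_out.float(), atol=atol)

# Toy usage: identical tables trivially compare equal.
table = torch.nn.Embedding(10, 4)
ids = torch.tensor([[1, 2, 3]])
assert compare_embeddings(table, table, ids)
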