Unverified Commit d249a9e9 authored by artem-spector's avatar artem-spector Committed by GitHub
Browse files

Add Granite 4.1 Vision as built-in multimodal model (#40282)


Signed-off-by: Artem Spector <artems@il.ibm.com>
Signed-off-by: artemspector <artems@il.ibm.com>
Co-authored-by: artemspector <artems@il.ibm.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent d2e2e856
......@@ -560,6 +560,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `Glm4vForConditionalGeneration` | GLM-4.1V-Thinking | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.1V-9B-Thinking`, etc. | ✅︎ | ✅︎ |
| `Glm4vMoeForConditionalGeneration` | GLM-4.5V | T + I<sup>E+</sup> + V<sup>E+</sup> | `zai-org/GLM-4.5V`, etc. | ✅︎ | ✅︎ |
| `GlmOcrForConditionalGeneration` | GLM-OCR | T + I<sup>E+</sup> | `zai-org/GLM-OCR`, etc. | ✅︎ | ✅︎ |
| `Granite4VisionForConditionalGeneration` | Granite 4 Vision | T + I<sup>E+</sup> | `ibm-granite/granite-vision-4.1-4b`, etc. | ✅︎ | ✅︎ |
| `GraniteSpeechForConditionalGeneration` | Granite Speech | T + A | `ibm-granite/granite-speech-3.3-8b` | ✅︎ | ✅︎ |
| `HCXVisionForCausalLM` | HyperCLOVAX-SEED-Vision-Instruct-3B | T + I<sup>+</sup> + V<sup>+</sup> | `naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B` | | |
| `HCXVisionV2ForCausalLM` | HyperCLOVAX-SEED-Think-32B | T + I<sup>+</sup> + V<sup>+</sup> | `naver-hyperclovax/HyperCLOVAX-SEED-Think-32B` | | |
......
......@@ -310,6 +310,38 @@ def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_granite4_vision(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for the Granite 4 Vision example.

    Args:
        question: Text question placed after the images in the user turn.
        image_urls: URLs of the images; one image entry is added per URL and
            the per-prompt image limit is set to the number of URLs.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the prompt
        rendered by the model's chat template, and the fetched images.
    """
    model_name = "ibm-granite/granite-vision-4.1-4b"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # Single user turn: every image first, then the text question.
    user_content = [{"type": "image", "image": url} for url in image_urls]
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    # The prompt is rendered as text (not token ids) with the generation
    # prompt appended.
    chat_processor = AutoProcessor.from_pretrained(model_name)
    rendered_prompt = chat_processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=rendered_prompt,
        image_data=images,
    )
def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "h2oai/h2ovl-mississippi-800m"
......@@ -1487,6 +1519,7 @@ model_example_map = {
"deepseek_ocr": load_deepseek_ocr,
"exaone4_5": load_exaone4_5,
"gemma3": load_gemma3,
"granite4_vision": load_granite4_vision,
"h2ovl_chat": load_h2ovl,
"hunyuan_vl": load_hunyuan_vl,
"hyperclovax_seed_vision": load_hyperclovax_seed_vision,
......
......@@ -86,6 +86,29 @@ COMMON_BROADCAST_SETTINGS = {
# which cases would be selected and deselected by pytest. In general,
# this is a good idea for checking your command first, since tests are slow.
def _granite4_vision_vllm_to_hf_output(vllm_output, model):
"""Post-processor for granite4_vision vLLM output.
Self-contained to avoid calling AutoConfig/AutoTokenizer without
trust_remote_code (needed while the model is not in upstream HF).
"""
output_ids, output_str, out_logprobs = vllm_output
mm_token_id = 100352
hf_output_ids = [
token_id
for idx, token_id in enumerate(output_ids)
if token_id != mm_token_id or idx == 0 or output_ids[idx - 1] != mm_token_id
]
hf_output_str = (
output_str[1:] if output_str and output_str[0] == " " else output_str
)
eos_token_id = 100257
if hf_output_ids and hf_output_ids[-1] == eos_token_id:
hf_output_str = hf_output_str + "<|end_of_text|>"
return hf_output_ids, hf_output_str, out_logprobs
VLM_TEST_SETTINGS = {
#### Core tests to always run in the CI
"llava": VLMTestInfo(
......@@ -492,6 +515,20 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=32)],
),
"granite4_vision": VLMTestInfo(
models=["ibm-granite/granite-vision-4.1-4b"],
test_type=(VLMTestType.IMAGE),
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}\n<|assistant|>\n",
max_model_len=8192,
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=_granite4_vision_vllm_to_hf_output,
image_size_factors=[(1.0,)],
vllm_runner_kwargs={
"enable_lora": True,
"max_lora_rank": 256,
"default_mm_loras": {"image": "ibm-granite/granite-vision-4.1-4b"},
},
),
"h2ovl": VLMTestInfo(
models=[
"h2oai/h2ovl-mississippi-800m",
......
......@@ -899,6 +899,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"zai-org/GLM-ASR-Nano-2512",
min_transformers_version="5.0.0",
),
"Granite4VisionForConditionalGeneration": _HfExamplesInfo(
"ibm-granite/granite-vision-4.1-4b",
is_available_online=False,
),
"GraniteVision": _HfExamplesInfo("ibm-granite/granite-vision-3.3-2b"),
"GraniteSpeechForConditionalGeneration": _HfExamplesInfo(
"ibm-granite/granite-speech-3.3-2b",
......
This diff is collapsed.
......@@ -407,6 +407,10 @@ _MULTIMODAL_MODELS = {
"granite_speech",
"GraniteSpeechForConditionalGeneration",
),
"Granite4VisionForConditionalGeneration": (
"granite4_vision",
"Granite4VisionForConditionalGeneration",
),
"H2OVLChatModel": ("h2ovl", "H2OVLChatModel"),
"HunYuanVLForConditionalGeneration": (
"hunyuan_vision",
......
......@@ -92,6 +92,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
flex_olmo="FlexOlmoConfig",
fireredlid="FireRedLIDConfig",
funaudiochat="FunAudioChatConfig",
granite4_vision="Granite4VisionConfig",
hunyuan_vl="HunYuanVLConfig",
isaac="IsaacConfig",
kimi_k2="DeepseekV3Config", # Kimi K2 uses same architecture as DeepSeek V3
......
......@@ -32,6 +32,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
"FlexOlmoConfig": "vllm.transformers_utils.configs.flex_olmo",
"FunAudioChatConfig": "vllm.transformers_utils.configs.funaudiochat",
"FunAudioChatAudioEncoderConfig": "vllm.transformers_utils.configs.funaudiochat",
"Granite4VisionConfig": "vllm.transformers_utils.configs.granite4_vision",
"HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl",
"HunYuanVLTextConfig": "vllm.transformers_utils.configs.hunyuan_vl",
"HunYuanVLVisionConfig": "vllm.transformers_utils.configs.hunyuan_vl",
......@@ -92,6 +93,7 @@ __all__ = [
"FireRedLIDConfig",
"FunAudioChatConfig",
"FunAudioChatAudioEncoderConfig",
"Granite4VisionConfig",
"HunYuanVLConfig",
"HunYuanVLTextConfig",
"HunYuanVLVisionConfig",
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
import transformers
def _build_sub_config(
    sub_config: dict[str, Any] | transformers.PretrainedConfig | None,
    default_model_type: str,
) -> transformers.PretrainedConfig:
    """Return a config object for a nested (vision or text) configuration.

    Args:
        sub_config: Keyword arguments for the nested config, an already
            constructed config object, or ``None`` for an empty dict.
        default_model_type: Model type used when ``sub_config`` carries no
            ``"model_type"`` key.

    Returns:
        ``sub_config`` itself when it is already a ``PretrainedConfig``;
        otherwise an instance of the class registered in
        ``transformers.CONFIG_MAPPING`` for the model type, or a plain
        ``PretrainedConfig`` when the model type is not registered.
    """
    # Already-built configs are passed through instead of failing on `.get`.
    if isinstance(sub_config, transformers.PretrainedConfig):
        return sub_config

    params = {} if sub_config is None else sub_config
    model_type = params.get("model_type", default_model_type)
    if model_type in transformers.CONFIG_MAPPING:
        return transformers.CONFIG_MAPPING[model_type](**params)
    return transformers.PretrainedConfig(**params)


class Granite4VisionConfig(transformers.PretrainedConfig):
    """Configuration for Granite 4 Vision model.

    This config is needed because the granite4_vision model type is not yet
    in the transformers version pinned by vLLM. Once transformers adds native
    support, this file can be removed and the _CONFIG_REGISTRY entry dropped.

    Several parameters are accepted under two names. The ``checkerboard_*``
    and ``vision_layer_to_llm_layer`` names are aliases used by the base
    model config on the Hub:

    * ``deepstack_layer_map`` falls back to ``vision_layer_to_llm_layer``
      when it is empty or ``None``.
    * ``use_checkerboard_sampling``, ``checkerboard_stride`` and
      ``checkerboard_vision_layer`` override ``use_spatial_sampling``,
      ``spatial_stride`` and ``spatial_vision_layer`` whenever they are not
      ``None``.
    * ``spatial_target_layers`` falls back to ``checkerboard_llm_layers`` and
      then to ``[0, 10, 20, 30]``.

    ``vision_config`` and ``text_config`` may be dicts, config objects, or
    ``None``; their default model types are ``"siglip_vision_model"`` and
    ``"granite"`` respectively.
    """

    model_type = "granite4_vision"
    is_composition = False

    def __init__(
        self,
        vision_config: dict[str, Any] | transformers.PretrainedConfig | None = None,
        text_config: dict[str, Any] | transformers.PretrainedConfig | None = None,
        image_token_index: int = 100352,
        image_seq_length: int = 576,
        image_grid_pinpoints: list[list[int]] | None = None,
        vision_feature_select_strategy: str = "full",
        vision_feature_layer: int | list[int] = -2,
        projector_hidden_act: str = "gelu",
        projector_dropout: float = 0.1,
        downsample_rate: str | None = None,
        use_image_newline_parameter: bool = True,
        deepstack_layer_map: list[list[int]] | None = None,
        use_spatial_sampling: bool = False,
        spatial_stride: int = 2,
        spatial_vision_layer: int = -1,
        spatial_target_layers: list[int] | None = None,
        # Hub aliases — base model config uses different field names
        vision_layer_to_llm_layer: list[list[int]] | None = None,
        use_checkerboard_sampling: bool | None = None,
        checkerboard_stride: int | None = None,
        checkerboard_vision_layer: int | None = None,
        checkerboard_llm_layers: list[int] | None = None,
        **kwargs: Any,
    ):
        self.image_token_index = image_token_index
        self.image_seq_length = image_seq_length
        self.image_grid_pinpoints = image_grid_pinpoints or []
        self.vision_feature_select_strategy = vision_feature_select_strategy
        self.vision_feature_layer = vision_feature_layer
        self.projector_hidden_act = projector_hidden_act
        self.projector_dropout = projector_dropout
        self.downsample_rate = downsample_rate
        self.use_image_newline_parameter = use_image_newline_parameter

        # Canonical name wins; the Hub alias is only a fallback.
        self.deepstack_layer_map = deepstack_layer_map or vision_layer_to_llm_layer

        # For the spatial-sampling settings, an explicitly provided
        # (non-None) checkerboard alias takes precedence over the canonical
        # argument.
        self.use_spatial_sampling = (
            use_spatial_sampling
            if use_checkerboard_sampling is None
            else use_checkerboard_sampling
        )
        self.spatial_stride = (
            spatial_stride if checkerboard_stride is None else checkerboard_stride
        )
        self.spatial_vision_layer = (
            spatial_vision_layer
            if checkerboard_vision_layer is None
            else checkerboard_vision_layer
        )
        self.spatial_target_layers = (
            spatial_target_layers or checkerboard_llm_layers or [0, 10, 20, 30]
        )

        # Nested configs are resolved through the transformers registry.
        self.vision_config = _build_sub_config(vision_config, "siglip_vision_model")
        self.text_config = _build_sub_config(text_config, "granite")

        super().__init__(**kwargs)
......@@ -19,6 +19,7 @@ __all__ = [
"FireRedLIDProcessor",
"FunASRProcessor",
"GLM4VProcessor",
"Granite4VisionProcessor",
"H2OVLProcessor",
"HunYuanVLProcessor",
"HunYuanVLImageProcessor",
......@@ -48,6 +49,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
"FireRedLIDProcessor": "vllm.transformers_utils.processors.fireredlid",
"FunASRProcessor": "vllm.transformers_utils.processors.funasr",
"GLM4VProcessor": "vllm.transformers_utils.processors.glm4v",
"Granite4VisionProcessor": "vllm.transformers_utils.processors.granite4_vision",
"H2OVLProcessor": "vllm.transformers_utils.processors.h2ovl",
"HunYuanVLProcessor": "vllm.transformers_utils.processors.hunyuan_vl",
"HunYuanVLImageProcessor": "vllm.transformers_utils.processors.hunyuan_vl_image",
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from fractions import Fraction
from transformers import LlavaNextProcessor
from transformers.image_processing_utils import select_best_resolution
class Granite4VisionProcessor(LlavaNextProcessor):
    """Processor for Granite 4 Vision.

    Subclass of ``LlavaNextProcessor`` whose image-feature count takes the
    Window Q-Former downsampling into account.

    It lives in vLLM because the granite4_vision processor type is not yet
    available in the transformers version pinned by vLLM.
    """

    model_type = "granite4_vision"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        patch_size=None,
        vision_feature_select_strategy=None,
        chat_template=None,
        image_token="<image>",
        num_additional_image_tokens=0,
        downsample_rate=None,
        **kwargs,
    ):
        # Only the named arguments are forwarded to the parent processor;
        # extra keyword arguments are accepted but not passed on.
        super().__init__(
            image_processor=image_processor,
            tokenizer=tokenizer,
            patch_size=patch_size,
            vision_feature_select_strategy=vision_feature_select_strategy,
            chat_template=chat_template,
            image_token=image_token,
            num_additional_image_tokens=num_additional_image_tokens,
        )
        # Anything `Fraction(...)` accepts, or None to disable downsampling.
        self.downsample_rate = downsample_rate

    def _get_number_of_features(
        self,
        orig_height: int,
        orig_width: int,
        height: int,
        width: int,
    ) -> int:
        """Return the number of image features for one image.

        The per-side patch counts are scaled by ``downsample_rate`` (when
        set, truncating to int) before the unpadded, newline and base
        feature counts are computed and summed.
        """
        best_height, best_width = select_best_resolution(
            [orig_height, orig_width],
            self.image_processor.image_grid_pinpoints,
        )
        rows_scale = best_height // height
        cols_scale = best_width // width

        patch_rows = height // self.patch_size
        patch_cols = width // self.patch_size

        rate = self.downsample_rate
        if rate is not None:
            # Exact rational arithmetic, then truncate toward zero.
            ratio = Fraction(rate)
            patch_rows = int(patch_rows * ratio)
            patch_cols = int(patch_cols * ratio)

        unpadded_count, newline_count = self._get_unpadded_features(
            orig_height,
            orig_width,
            patch_rows,
            patch_cols,
            rows_scale,
            cols_scale,
        )
        base_count = patch_rows * patch_cols + self.num_additional_image_tokens
        return unpadded_count + newline_count + base_count
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment