Feature/isaac 0.1 (#28367)

Signed-off-by: oscardev256 <42308241+oscardev256@users.noreply.github.com> Signed-off-by: Oscar Gonzalez <ogonzal6@alumni.jh.edu> Signed-off-by: Yang <lymailforjob@gmail.com> Co-authored-by: Yang <lymailforjob@gmail.com>

Feature/isaac 0.1 (#28367)
Signed-off-by: oscardev256 <42308241+oscardev256@users.noreply.github.com> Signed-off-by: Oscar Gonzalez <ogonzal6@alumni.jh.edu> Signed-off-by: Yang <lymailforjob@gmail.com> Co-authored-by: Yang <lymailforjob@gmail.com>
b7165d53 · oscardev256 · GitHub · 81786c87 · b7165d53 · b7165d53
Unverified Commit b7165d53 authored Dec 25, 2025 by oscardev256 Committed by GitHub Dec 25, 2025
11 changed files
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -688,6 +688,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
 | `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ |
 | `HunYuanVLForConditionalGeneration` | HunyuanOCR | T + I<sup>E+</sup> | `tencent/HunyuanOCR`, etc. | ✅︎ | ✅︎ |
 | `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
+| `IsaacForConditionalGeneration` | Isaac | T + I<sup>+</sup> | `PerceptronAI/Isaac-0.1` | ✅︎ | ✅︎ |
 | `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ |
 | `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ |
 | `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ |

--- a/requirements/test.in
+++ b/requirements/test.in
@@ -56,3 +56,5 @@ pydantic>=2.12 # 2.11 leads to error on python 3.13
 decord==0.6.0
 terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
 gpt-oss >= 0.0.7; python_version > '3.11'
+
+perceptron # required for isaac test
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -135,6 +135,7 @@ cloudpickle==3.1.1
    # via mlflow-skinny
 colorama==0.4.6
    # via
+    #   perceptron
    #   sacrebleu
    #   schemathesis
    #   tqdm-multiprocess
@@ -302,6 +303,8 @@ h11==0.14.0
    # via
    #   httpcore
    #   uvicorn
+h2==4.3.0
+    # via httpx
 h5py==3.13.0
    # via terratorch
 harfile==0.3.0
@@ -310,6 +313,8 @@ hf-xet==1.1.7
    # via huggingface-hub
 hiredis==3.0.0
    # via tensorizer
+hpack==4.1.0
+    # via h2
 html2text==2025.4.15
    # via gpt-oss
 httpcore==1.0.6
@@ -317,6 +322,7 @@ httpcore==1.0.6
 httpx==0.27.2
    # via
    #   -r requirements/test.in
+    #   perceptron
    #   schemathesis
 huggingface-hub==0.34.3
    # via
@@ -338,6 +344,8 @@ hydra-core==1.3.2
    # via
    #   lightly
    #   lightning
+hyperframe==6.1.0
+    # via h2
 hypothesis==6.131.0
    # via
    #   hypothesis-graphql
@@ -549,6 +557,7 @@ numpy==1.26.4
    #   pandas
    #   patsy
    #   peft
+    #   perceptron
    #   pycocotools
    #   pyogrio
    #   rasterio
@@ -702,6 +711,8 @@ peft==0.16.0
    # via
    #   -r requirements/test.in
    #   lm-eval
+perceptron==0.1.4
+    # via -r requirements/test.in
 pillow==10.4.0
    # via
    #   genai-perf
@@ -709,6 +720,7 @@ pillow==10.4.0
    #   lightly-utils
    #   matplotlib
    #   mistral-common
+    #   perceptron
    #   scikit-image
    #   segmentation-models-pytorch
    #   sentence-transformers
@@ -952,6 +964,7 @@ rich==13.9.4
    #   genai-perf
    #   lightning
    #   mteb
+    #   perceptron
    #   typer
 rioxarray==0.19.0
    # via terratorch
@@ -1024,7 +1037,9 @@ shapely==2.1.1
    #   geopandas
    #   torchgeo
 shellingham==1.5.4
-    # via typer
+    # via
+    #   perceptron
+    #   typer
 six==1.16.0
    # via
    #   junit-xml
@@ -1218,7 +1233,9 @@ typepy==1.3.2
    #   pytablewriter
    #   tabledata
 typer==0.15.2
-    # via fastsafetensors
+    # via
+    #   fastsafetensors
+    #   perceptron
 types-python-dateutil==2.9.0.20241206
    # via arrow
 typeshed-client==2.8.2

--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -529,6 +529,31 @@ VLM_TEST_SETTINGS = {
        use_tokenizer_eos=True,
        auto_cls=AutoModelForImageTextToText,
    ),
+    "isaac": VLMTestInfo(
+        models=["PerceptronAI/Isaac-0.1"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: (
+            f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n"
+        ),
+        img_idx_to_prompt=lambda idx: "<image>",
+        single_image_prompts=IMAGE_ASSETS.prompts(
+            {
+                "stop_sign": "<vlm_image>Please describe the image shortly.",
+                "cherry_blossom": "<vlm_image>Please infer the season with reason.",
+            }
+        ),
+        multi_image_prompt=(
+            "Picture 1: <vlm_image>\n"
+            "Picture 2: <vlm_image>\n"
+            "Describe these two images with one paragraph respectively."
+        ),
+        enforce_eager=False,
+        max_model_len=4096,
+        max_num_seqs=2,
+        hf_model_kwargs={"device_map": "auto"},
+        patch_hf_runner=model_utils.isaac_patch_hf_runner,
+        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+    ),
    "kimi_vl": VLMTestInfo(
        models=["moonshotai/Kimi-VL-A3B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),

--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -522,6 +522,183 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    return hf_model


+def isaac_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patch the HF runner in place so it can drive the Isaac model.
+
+    Three attributes of ``hf_model`` are monkey-patched:
+    1) ``hf_model.processor`` is wrapped so that every entry of the
+       processor output is moved to the model device.
+    2) ``hf_model.model.generate`` is wrapped so that ``pad_token_id`` and
+       ``eos_token_id`` are always set to the tokenizer's EOS token id.
+    3) ``hf_model.model.model.forward`` is replaced so that it returns
+       hidden_states, for compatibility with hidden_states_to_seq_logprobs().
+
+    Args:
+        hf_model: Runner whose processor and model are patched.
+
+    Returns:
+        The same ``hf_model`` object, after patching.
+    """
+
+    # Imported lazily: `perceptron` is only needed when this patch is applied.
+    from perceptron.tensorstream import TextType
+    from perceptron.tensorstream.ops import compute_mrope_pos_tensor, modality_mask
+    from transformers.modeling_outputs import BaseModelOutputWithPast
+
+    def compute_position_ids_input_ids(input_ids: torch.Tensor) -> torch.Tensor:
+        """
+        Create 3D positional indices for token input.
+
+        Takes ``input_ids`` of shape (batch_size, seq_length) and returns a
+        (batch_size, seq_length, 3) tensor on the same device in which all
+        three components of position ``i`` are equal to ``i``.
+        """
+        batch_size, seq_length = input_ids.shape
+        position_ids = torch.arange(seq_length, device=input_ids.device)
+        position_ids = position_ids.view(1, -1).expand(batch_size, -1)
+        position_ids = position_ids.unsqueeze(2).expand(-1, -1, 3)  # Add 3D for MRoPE
+        return position_ids
+
+    # Device of the model's first parameter; processor outputs are moved here.
+    model_device = next(hf_model.model.parameters()).device
+
+    # ----------------------------
+    # 1) Patch processor: move BatchFeature input_ids and TensorStream to model device
+    # ----------------------------
+    original_processor = hf_model.processor
+
+    def patched_processor(*args, **kwargs):
+        result = original_processor(*args, **kwargs)
+        # Every value in result.data is moved; each one must expose `.to()`.
+        for k, v in result.data.items():
+            result[k] = v.to(model_device)
+        return result
+
+    hf_model.processor = patched_processor
+
+    # A separate tokenizer is loaded only to look up the EOS token id below.
+    tokenizer = AutoTokenizer.from_pretrained(
+        hf_model.model_name, trust_remote_code=True
+    )
+
+    original_generate = hf_model.model.generate
+
+    def patched_generate(*args, **kwargs):
+        # Overrides any pad/eos token ids supplied by the caller.
+        kwargs["pad_token_id"] = tokenizer.eos_token_id
+        kwargs["eos_token_id"] = tokenizer.eos_token_id
+        return original_generate(*args, **kwargs)
+
+    hf_model.model.generate = patched_generate
+
+    # ----------------------------
+    # 2) Patch IsaacModel.forward: add hidden_states to the output
+    # ----------------------------
+    isaac_model = hf_model.model.model
+
+    def patched_forward(
+        self,
+        input_ids=None,
+        tensor_stream=None,
+        attention_mask=None,
+        position_ids=None,
+        modality_tensor=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        use_cache=None,
+        output_hidden_states=None,
+        return_dict=None,
+        cache_position=None,
+        **kwargs,
+    ):
+        """
+        Forward pass with MRoPE position embeddings.
+        Computes position embeddings once and passes them through all layers.
+
+        Exactly one input source is embedded: ``tensor_stream``, else
+        ``input_ids``, else a caller-provided ``inputs_embeds``. Combining
+        ``inputs_embeds`` with either of the other two raises ``ValueError``,
+        as does providing none of them.
+
+        Returns a ``BaseModelOutputWithPast`` holding the normalized final
+        hidden state, the ``past_key_values`` object that was passed in, and,
+        when ``output_hidden_states`` is truthy, a tuple of hidden states
+        (otherwise ``None``).
+        """
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        # NOTE(review): return_dict is resolved here but never read again in
+        # this function; a BaseModelOutputWithPast is always returned.
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+
+        # Get inputs
+        if tensor_stream is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both tensor_stream and inputs_embeds")
+        elif tensor_stream is not None:
+            # Embed TensorStream directly
+            inputs_embeds = self.embed_stream(tensor_stream)
+            # Create modality tensor if not provided
+            if modality_tensor is None:
+                modality_tensor = modality_mask(tensor_stream)
+        elif input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time"
+            )
+        elif input_ids is not None:
+            inputs_embeds = self.embed_tokens(input_ids)
+            # Create text modality tensor if not provided
+            if modality_tensor is None:
+                batch_size, seq_length = input_ids.shape
+                modality_tensor = torch.full(
+                    (batch_size, seq_length),
+                    TextType.text.value,
+                    device=input_ids.device,
+                    dtype=torch.long,
+                )
+        elif inputs_embeds is None:
+            raise ValueError(
+                "You have to specify either tensor_stream, input_ids or inputs_embeds"
+            )
+
+        # Create default position_ids if not provided
+        # NOTE(review): the non-tensor_stream fallback reads input_ids.shape,
+        # so a call with only inputs_embeds (no input_ids and no position_ids)
+        # would fail here; in that case modality_tensor is also left as
+        # whatever the caller passed. Confirm callers never take that path.
+        if position_ids is None:
+            if tensor_stream is not None:
+                position_ids = compute_mrope_pos_tensor(tensor_stream)  # (B,L,3)
+            else:
+                position_ids = compute_position_ids_input_ids(input_ids)
+
+        # Compute the MRoPE cos/sin tables once and cast them to the embedding
+        # dtype; the same pair is handed to every decoder layer below.
+        cos, sin = self.rotary_emb(position_ids, modality_tensor)
+        cos = cos.to(inputs_embeds.dtype)
+        sin = sin.to(inputs_embeds.dtype)
+
+        # Prepare attention mask
+        if attention_mask is not None:
+            attention_mask = self._update_causal_mask(
+                attention_mask, inputs_embeds, cache_position, past_key_values, False
+            )
+
+        # Initialize and collect hidden states
+        hidden_states = inputs_embeds
+        hidden_states_list: list[torch.Tensor] = []
+
+        # First collected entry is the input embeddings.
+        if output_hidden_states:
+            hidden_states_list.append(hidden_states)
+
+        for decoder_layer in self.layers:
+            # The cache object is forwarded under the keyword `past_key_value`.
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=(cos, sin),
+                **kwargs,
+            )
+
+            # Layers may return either a tuple or the hidden states directly.
+            hidden_states = (
+                layer_outputs[0] if isinstance(layer_outputs, tuple) else layer_outputs
+            )
+
+            if output_hidden_states:
+                hidden_states_list.append(hidden_states)
+
+        # Final layer norm
+        hidden_states = self.norm(hidden_states)
+
+        # The normalized output is appended as an extra, last entry.
+        if output_hidden_states:
+            hidden_states_list.append(hidden_states)
+
+        # Convert to tuple or None
+        all_hidden_states = tuple(hidden_states_list) if output_hidden_states else None
+
+        # Include hidden_states for compatibility with hidden_states_to_seq_logprobs()
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+        )
+
+    # Bind as a method so that `self` inside patched_forward is isaac_model.
+    isaac_model.forward = types.MethodType(patched_forward, isaac_model)
+
+    return hf_model
+
+
 def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
    """Patches and returns an instance of the HfRunner to use for SkyworkR1V."""


--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -662,6 +662,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
        "HuggingFaceM4/Idefics3-8B-Llama3",
        extras={"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"},
    ),
+    "IsaacForConditionalGeneration": _HfExamplesInfo(
+        "PerceptronAI/Isaac-0.1",
+        trust_remote_code=True,
+    ),
    "InternS1ForConditionalGeneration": _HfExamplesInfo(
        "internlm/Intern-S1", trust_remote_code=True
    ),

--- a/vllm/model_executor/models/isaac.py
+++ b/vllm/model_executor/models/isaac.py
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -333,6 +333,7 @@ _MULTIMODAL_MODELS = {
        "idefics3",
        "Idefics3ForConditionalGeneration",
    ),
+    "IsaacForConditionalGeneration": ("isaac", "IsaacForConditionalGeneration"),
    "SmolVLMForConditionalGeneration": ("smolvlm", "SmolVLMForConditionalGeneration"),  # noqa: E501
    "KeyeForConditionalGeneration": ("keye", "KeyeForConditionalGeneration"),
    "KeyeVL1_5ForConditionalGeneration": (

--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -81,6 +81,7 @@ _CONFIG_REGISTRY: dict[str, type[PretrainedConfig]] = LazyConfigDict(
    deepseek_v32="DeepseekV3Config",
    flex_olmo="FlexOlmoConfig",
    hunyuan_vl="HunYuanVLConfig",
+    isaac="IsaacConfig",
    kimi_linear="KimiLinearConfig",
    kimi_vl="KimiVLConfig",
    RefinedWeb="RWConfig",  # For tiiuae/falcon-40b(-instruct)

--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -25,6 +25,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
    "HunYuanVLConfig": "vllm.transformers_utils.configs.hunyuan_vl",
    "HunYuanVLTextConfig": "vllm.transformers_utils.configs.hunyuan_vl",
    "HunYuanVLVisionConfig": "vllm.transformers_utils.configs.hunyuan_vl",
+    "IsaacConfig": "vllm.transformers_utils.configs.isaac",
    # RWConfig is for the original tiiuae/falcon-40b(-instruct) and
    # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
    # `FalconConfig` class from the official HuggingFace transformers library.
@@ -41,6 +42,7 @@ _CLASS_TO_MODULE: dict[str, str] = {
    "NemotronHConfig": "vllm.transformers_utils.configs.nemotron_h",
    "Olmo3Config": "vllm.transformers_utils.configs.olmo3",
    "OvisConfig": "vllm.transformers_utils.configs.ovis",
+    "PixelShuffleSiglip2VisionConfig": "vllm.transformers_utils.configs.isaac",
    "RadioConfig": "vllm.transformers_utils.configs.radio",
    "SpeculatorsConfig": "vllm.transformers_utils.configs.speculators.base",
    "UltravoxConfig": "vllm.transformers_utils.configs.ultravox",
@@ -65,6 +67,7 @@ __all__ = [
    "HunYuanVLConfig",
    "HunYuanVLTextConfig",
    "HunYuanVLVisionConfig",
+    "IsaacConfig",
    "RWConfig",
    "JAISConfig",
    "Lfm2MoeConfig",
@@ -78,6 +81,7 @@ __all__ = [
    "NemotronHConfig",
    "Olmo3Config",
    "OvisConfig",
+    "PixelShuffleSiglip2VisionConfig",
    "RadioConfig",
    "SpeculatorsConfig",
    "UltravoxConfig",

--- a/vllm/transformers_utils/configs/isaac.py
+++ b/vllm/transformers_utils/configs/isaac.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
+
+from transformers import Qwen3Config
+from transformers.models.siglip2.configuration_siglip2 import Siglip2VisionConfig
+
+
+class PixelShuffleSiglip2VisionConfig(Siglip2VisionConfig):
+    """Vision configuration for Isaac with Pixel Shuffle support.
+
+    Extends Siglip2VisionConfig with additional fields for pixel shuffle.
+    """
+
+    model_type = "pixel_shuffle_siglip2"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        pixel_shuffle_scale_factor: int = 1,
+        num_patches: int = 256,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # Add our custom fields
+        self.pixel_shuffle_scale_factor = pixel_shuffle_scale_factor
+        self.num_patches = num_patches
+
+
+class IsaacConfig(Qwen3Config):
+    """Configuration class for Isaac multimodal model."""
+
+    model_type = "isaac"
+    sub_configs = {"vision_config": PixelShuffleSiglip2VisionConfig}
+
+    def __init__(
+        self,
+        vision_config=None,
+        vision_patch_size: int = 16,
+        vision_max_num_patches: int = 256,
+        vision_min_num_patches: int | None = None,
+        pixel_shuffle_scale: int = 1,
+        max_sequence_length: int = 16384,
+        vision_token: str = "<image>",
+        vision_attn_implementation: str | None = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # EventStreamProcessor parameters (for backward compatibility)
+        self.video_patch_size = vision_patch_size
+        self.vision_max_num_patches = vision_max_num_patches
+        self.vision_min_num_patches = vision_min_num_patches
+        self.pixel_shuffle_scale = pixel_shuffle_scale
+
+        # Processing parameters
+        self.max_sequence_length = max_sequence_length
+        self.vision_token = vision_token
+
+        # Handle vision config - PixelShuffleSiglip2VisionConfig instance
+        if isinstance(vision_config, dict):
+            self.vision_config = PixelShuffleSiglip2VisionConfig(**vision_config)
+        elif vision_config is None:
+            self.vision_config = PixelShuffleSiglip2VisionConfig()
+        else:
+            self.vision_config = vision_config
+
+        # Ensure compatibility with pretrained checkpoints
+        self.vision_config.pixel_shuffle_scale_factor = getattr(
+            self.vision_config,
+            "pixel_shuffle_scale_factor",
+            pixel_shuffle_scale,
+        )
+        self.vision_config.num_patches = getattr(
+            self.vision_config,
+            "num_patches",
+            vision_max_num_patches,
+        )
+        self.vision_attn_implementation = vision_attn_implementation
+
+
+# Names exported by this module.
+__all__ = [
+    "IsaacConfig",
+    "PixelShuffleSiglip2VisionConfig",
+]