Unverified Commit a3acfa10 authored by zxy's avatar zxy Committed by GitHub
Browse files

[Models] Intern-S1-Pro (#33636)


Signed-off-by: zxy <zhou0493@e.ntu.edu.sg>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
parent be8168ff
......@@ -689,6 +689,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3`, etc. | ✅︎ | |
| `IsaacForConditionalGeneration` | Isaac | T + I<sup>+</sup> | `PerceptronAI/Isaac-0.1` | ✅︎ | ✅︎ |
| `InternS1ForConditionalGeneration` | Intern-S1 | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1`, `internlm/Intern-S1-mini`, etc. | ✅︎ | ✅︎ |
| `InternS1ProForConditionalGeneration` | Intern-S1-Pro | T + I<sup>E+</sup> + V<sup>E+</sup> | `internlm/Intern-S1-Pro`, etc. | ✅︎ | ✅︎ |
| `InternVLChatModel` | InternVL 3.5, InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3_5-14B`, `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ |
| `InternVLForConditionalGeneration` | InternVL 3.0 (HF format) | T + I<sup>E+</sup> + V<sup>E+</sup> | `OpenGVLab/InternVL3-1B-hf`, etc. | ✅︎ | ✅︎ |
| `KananaVForConditionalGeneration` | Kanana-V | T + I<sup>+</sup> | `kakaocorp/kanana-1.5-v-3b-instruct`, etc. | | ✅︎ |
......
......@@ -842,6 +842,40 @@ def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
)
# Intern-S1-Pro
def run_interns1_pro(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and chat prompts for Intern-S1-Pro.

    Args:
        questions: User questions; one single-turn chat prompt is rendered
            for each of them.
        modality: Either ``"image"`` or ``"video"``. It selects the vision
            placeholder inserted ahead of the question and is also the key
            used for ``limit_mm_per_prompt``.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and the prompts
        rendered through the model's chat template.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    placeholders = {
        "image": "<|vision_start|><|image_pad|><|vision_end|>",
        "video": "<|vision_start|><|video_pad|><|vision_end|>",
    }
    # Validate up front: without this, an unsupported modality only fails
    # later with an UnboundLocalError on `placeholder`, after the engine
    # arguments and the tokenizer have already been created.
    try:
        placeholder = placeholders[modality]
    except KeyError:
        raise ValueError(
            f"Unsupported modality {modality!r} for Intern-S1-Pro; "
            f"expected one of {sorted(placeholders)}"
        ) from None

    model_name = "internlm/Intern-S1-Pro"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # One conversation per question, each consisting of a single user turn.
    messages = [
        [{"role": "user", "content": f"{placeholder}\n{question}"}]
        for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# InternVL
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "OpenGVLab/InternVL3-2B"
......@@ -2130,6 +2164,7 @@ model_example_map = {
"hyperclovax_seed_vision": run_hyperclovax_seed_vision,
"idefics3": run_idefics3,
"interns1": run_interns1,
"interns1_pro": run_interns1_pro,
"internvl_chat": run_internvl,
"kanana_v": run_kanana_v,
"keye_vl": run_keye_vl,
......
......@@ -755,6 +755,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"InternS1ForConditionalGeneration": _HfExamplesInfo(
"internlm/Intern-S1", trust_remote_code=True
),
"InternS1ProForConditionalGeneration": _HfExamplesInfo(
"internlm/Intern-S1-Pro",
trust_remote_code=True,
min_transformers_version="5.0.0",
is_available_online=False,
),
"InternVLChatModel": _HfExamplesInfo(
"OpenGVLab/InternVL2-1B",
extras={
......
......@@ -11,6 +11,7 @@ from .deepseek_scaling_rope import DeepseekScalingRotaryEmbedding
from .dual_chunk_rope import DualChunkRotaryEmbedding
from .dynamic_ntk_alpha_rope import DynamicNTKAlphaRotaryEmbedding
from .dynamic_ntk_scaling_rope import DynamicNTKScalingRotaryEmbedding
from .fope import FourierRotaryEmbedding
from .linear_scaling_rope import LinearScalingRotaryEmbedding
from .llama3_rope import Llama3RotaryEmbedding
from .llama4_vision_rope import Llama4VisionRotaryEmbedding
......@@ -102,6 +103,28 @@ def get_rope(
mrope_section=rope_parameters["mrope_section"],
mrope_interleaved=rope_parameters.get("mrope_interleaved", False),
)
elif "use_fope" in rope_parameters and rope_parameters["use_fope"]:
extra_kwargs = {
k: v
for k, v in rope_parameters.items()
if k
in (
"num_key_value_heads",
"num_inv_freq",
"fope_sep_head",
"fope_init_factor",
)
}
extra_kwargs["init_cache"] = False
rotary_emb = FourierRotaryEmbedding(
head_size,
rotary_dim,
max_position,
base,
is_neox_style,
dtype,
**extra_kwargs,
)
else:
rotary_emb = RotaryEmbedding(
head_size,
......
......@@ -25,6 +25,7 @@ class RotaryEmbeddingBase(CustomOp):
base: float,
is_neox_style: bool,
dtype: torch.dtype,
init_cache: bool = True,
) -> None:
super().__init__()
self.head_size = head_size
......@@ -46,6 +47,7 @@ class RotaryEmbeddingBase(CustomOp):
if not hasattr(self, "use_flashinfer"):
self.use_flashinfer = False
if init_cache:
cache = self._compute_cos_sin_cache()
if not self.use_flashinfer:
cache = cache.to(dtype)
......@@ -108,9 +110,16 @@ class RotaryEmbedding(RotaryEmbeddingBase):
base: float,
is_neox_style: bool,
dtype: torch.dtype,
init_cache: bool = True,
) -> None:
super().__init__(
head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
head_size=head_size,
rotary_dim=rotary_dim,
max_position_embeddings=max_position_embeddings,
base=base,
is_neox_style=is_neox_style,
dtype=dtype,
init_cache=init_cache,
)
@staticmethod
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
import torch.nn.functional as F
from torch import nn
from vllm.distributed import (
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
)
from .base import RotaryEmbedding
from .common import rotate_neox
class FourierRotaryEmbedding(RotaryEmbedding):
    """Rotary embedding whose cos/sin tables are mixed by FoPE coefficients.

    The plain ``cos(t * inv_freq)`` / ``sin(t * inv_freq)`` tables are
    multiplied by the ``cos_coef`` / ``sin_coef`` parameters, which are
    filled in by ``weight_loader``. The resulting table is stored in the
    ``cos_sin_cache`` buffer and rebuilt on the first ``forward_native``
    call, once the coefficients have been loaded.
    """

    def __init__(
        self,
        head_size: int,
        rotary_dim: int,
        max_position_embeddings: int,
        base: float,
        is_neox_style: bool,
        dtype: torch.dtype,
        init_cache: bool,
        # extra parameters for FoPE
        num_key_value_heads: int,
        num_inv_freq: int | None,
        fope_sep_head: bool,
        fope_init_factor: float,
    ):
        """Set up the inverse frequencies, coefficient parameters and cache.

        Args:
            head_size: Size of one attention head.
            rotary_dim: Rotary dimension used to derive the inverse
                frequencies.
            max_position_embeddings: Number of positions held in the cache.
            base: Base of the inverse-frequency schedule.
            is_neox_style: Forwarded to the base class.
            dtype: dtype the cos/sin cache is stored in.
            init_cache: Accepted for signature compatibility and ignored;
                this class always builds its own cache (see below).
            num_key_value_heads: Leading dimension of the coefficient
                parameters.
            num_inv_freq: If not ``None``, only the first ``num_inv_freq``
                inverse frequencies are kept.
            fope_sep_head: Whether each head gets its own cos/sin table.
            fope_init_factor: Stored on the instance; not otherwise used in
                this class.
        """
        # fope related parameters
        self.num_key_value_heads = num_key_value_heads
        self.num_inv_freq = num_inv_freq
        self.fope_sep_head = fope_sep_head
        self.fope_init_factor = fope_init_factor

        # The base class must never build the cache for us: its cache path
        # calls self._compute_cos_sin_cache(), which in this class reads
        # inv_freq / sin_coef / cos_coef. Those are only created below, after
        # super().__init__() has returned, so forwarding init_cache=True
        # would fail on a missing attribute.
        super().__init__(
            head_size=head_size,
            rotary_dim=rotary_dim,
            max_position_embeddings=max_position_embeddings,
            base=base,
            is_neox_style=is_neox_style,
            dtype=dtype,
            init_cache=False,
        )

        # setup buffers and parameters
        self.inv_freq: torch.Tensor
        self.register_buffer(
            "inv_freq", self._compute_inv_freq(self.base), persistent=False
        )
        self.input_dim = self.inv_freq.shape[-1]
        self.output_dim = self.inv_freq.shape[-1]
        self.cos_coef = nn.Parameter(
            torch.empty(num_key_value_heads, self.input_dim, self.output_dim),
            requires_grad=False,
        )
        self.sin_coef = nn.Parameter(
            torch.empty(num_key_value_heads, self.input_dim, self.output_dim),
            requires_grad=False,
        )
        self.sin_coef.weight_loader = self.weight_loader
        self.cos_coef.weight_loader = self.weight_loader

        # The coefficients are still uninitialised (torch.empty) here, so
        # this cache only reserves a buffer of the right shape and dtype.
        self.cos_sin_cache: torch.Tensor
        cache = self._compute_cos_sin_cache().to(dtype)
        self.register_buffer("cos_sin_cache", cache, persistent=False)
        # update cache in the first forward, where sin/cos_coef weights are ready
        self.update_cache = True

    def _compute_inv_freq(self, base: float) -> torch.Tensor:
        """Compute the inverse frequencies that FoPE keeps.

        Starts from ``1 / base ** (2i / rotary_dim)``. With ``num_inv_freq``
        set, only the first ``num_inv_freq`` entries are kept; otherwise only
        the entries greater than ``2 * pi / max_position_embeddings``.
        """
        inv_freq = 1.0 / (
            base
            ** (
                torch.arange(0, self.rotary_dim, 2, dtype=torch.float)
                / self.rotary_dim
            )
        )

        inv_freq_idx_selected = torch.ones_like(inv_freq, dtype=torch.bool)
        if self.num_inv_freq is not None:
            inv_freq_idx_selected[self.num_inv_freq :] = False
        else:
            inv_freq_idx_selected = inv_freq > (
                2.0 * torch.pi / self.max_position_embeddings
            )

        inv_freq = inv_freq[inv_freq_idx_selected]
        return inv_freq

    def _compute_cos_sin_cache(self) -> torch.Tensor:
        """Compute the cos/sin cache from the current coefficient values.

        Returns:
            ``cos`` and ``sin`` concatenated along the last dimension. With
            ``fope_sep_head`` the result carries an extra per-head dimension
            between the position and feature dimensions.
        """
        device = self.inv_freq.device
        t = torch.arange(
            self.max_position_embeddings, dtype=torch.float, device=device
        )

        # freqs[j, i] = t[j] * inv_freq[i]
        freqs = torch.einsum("j,i -> ji", t, self.inv_freq)

        if self.fope_sep_head:
            # Broadcast the same table to every head, then mix it with that
            # head's coefficient matrix.
            pos_cos = freqs.cos().unsqueeze(0).expand(self.num_key_value_heads, -1, -1)
            pos_sin = freqs.sin().unsqueeze(0).expand(self.num_key_value_heads, -1, -1)
            sin = torch.einsum("htD, hDd -> thd", pos_sin, self.sin_coef.float())
            cos = torch.einsum("htD, hDd -> thd", pos_cos, self.cos_coef.float())
        else:
            pos_cos = freqs.cos()
            pos_sin = freqs.sin()
            # NOTE(review): sin_coef / cos_coef are allocated as 3-D tensors
            # in __init__, while these einsum specs name only two axes for
            # them - confirm this branch is valid for the configs in use.
            sin = torch.einsum("tD, Dd -> td", pos_sin, self.sin_coef.float())
            cos = torch.einsum("tD, Dd -> td", pos_cos, self.cos_coef.float())

        # Pad the last dimension up to head_size // 2; the padding value is 1
        # for both tables.
        sin = F.pad(
            input=sin,
            pad=(0, self.head_size // 2 - sin.size(-1)),
            mode="constant",
            value=1,
        )
        cos = F.pad(
            input=cos,
            pad=(0, self.head_size // 2 - cos.size(-1)),
            mode="constant",
            value=1,
        )

        # Duplicate each half so the tables cover the full head.
        sin = torch.cat((sin, sin), dim=-1)
        cos = torch.cat((cos, cos), dim=-1)

        # cache: cos followed by sin along the last dimension; with
        # fope_sep_head it is (max_position_embeddings, num_kv_heads, ...)
        cache = torch.cat((cos, sin), dim=-1)
        return cache

    def forward_native(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor | None = None,
        offsets: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Apply the FoPE rotary embedding to ``query`` and ``key``.

        Args:
            positions: Position indices; flattened before the cache lookup.
            query: Query tensor whose last dimension is split into heads of
                ``head_size``.
            key: Key tensor, split into heads the same way. Required.
            offsets: Not used by this implementation.

        Returns:
            The rotated ``(query, key)`` in the unflattened
            ``(seq_len, heads, head_size)`` layout.
        """
        # update cos/sin cache in the first forward
        if self.update_cache:
            cache = self._compute_cos_sin_cache().to(self.dtype)
            self.cos_sin_cache.copy_(cache)
            self.update_cache = False

        positions = positions.flatten()
        cos_sin = self.cos_sin_cache.index_select(0, positions)
        cos, sin = cos_sin.chunk(2, dim=-1)

        # apply rotary embedding
        # query: (seq_len, num_heads, head_size)
        # key: (seq_len, num_kv_heads, head_size)
        query = query.unflatten(-1, (-1, self.head_size))
        assert key is not None, "Key tensor is required for FoPE."
        key = key.unflatten(-1, (-1, self.head_size))
        assert query.dim() == key.dim() == 3, (
            "Expected query key (seq_len, heads, head_dim)"
        )
        assert cos.dim() <= 3 and sin.dim() <= 3

        need_reshape = False
        if cos.dim() == 3:
            # for fope
            # The per-head tables are folded into the leading dimension, and
            # query/key are regrouped to match: each row of cos/sin is then
            # shared by the heads that fall into the same group.
            need_reshape = True
            query_shape = query.shape
            key_shape = key.shape
            cos = cos.flatten(0, 1)
            sin = sin.flatten(0, 1)
            seq_len = cos.size(0)
            query = query.view(seq_len, -1, query.size(-1))
            key = key.view(seq_len, -1, key.size(-1))

        # native implementation of apply rope for neox style
        cos = cos.unsqueeze(1)
        sin = sin.unsqueeze(1)
        query = (query * cos) + (rotate_neox(query) * sin)
        key = (key * cos) + (rotate_neox(key) * sin)

        if need_reshape:
            # Restore the (seq_len, heads, head_size) layout saved above.
            query = query.view(query_shape)
            key = key.view(key_shape)
        return query, key

    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
        """Copy this rank's slice of a FoPE coefficient tensor into ``param``.

        ``loaded_weight`` is split along dim 0 into one chunk per
        tensor-parallel rank. When dim 0 is smaller than the world size, it
        is split into one chunk per entry instead, and every group of
        ``world_size // dim0`` consecutive ranks loads the same chunk.
        """
        world_size = get_tensor_model_parallel_world_size()
        rank = get_tensor_model_parallel_rank()
        num_key_value_heads = loaded_weight.size(0)

        if num_key_value_heads < world_size:
            n_replicate = world_size // num_key_value_heads
            world_size = num_key_value_heads
            rank = rank // n_replicate

        loaded_weight = loaded_weight.chunk(world_size, dim=0)[rank]
        param.data.copy_(loaded_weight)
This diff is collapsed.
......@@ -428,7 +428,13 @@ class Qwen3MoeDecoderLayer(nn.Module):
@support_torch_compile
class Qwen3MoeModel(nn.Module):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
def __init__(
self,
*,
vllm_config: VllmConfig,
prefix: str = "",
decoder_layer_type: type[torch.nn.Module] = Qwen3MoeDecoderLayer,
):
super().__init__()
config = vllm_config.model_config.hf_text_config
......@@ -449,7 +455,7 @@ class Qwen3MoeModel(nn.Module):
)
self.start_layer, self.end_layer, self.layers = make_layers(
config.num_hidden_layers,
lambda prefix: Qwen3MoeDecoderLayer(vllm_config=vllm_config, prefix=prefix),
lambda prefix: decoder_layer_type(vllm_config=vllm_config, prefix=prefix),
prefix=f"{prefix}.layers",
)
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
......
......@@ -325,7 +325,11 @@ class Qwen3_VisionTransformer(nn.Module):
self.spatial_merge_size = vision_config.spatial_merge_size
self.spatial_merge_unit = self.spatial_merge_size**2
self.temporal_patch_size = vision_config.temporal_patch_size
self.deepstack_visual_indexes = vision_config.deepstack_visual_indexes
self.deepstack_visual_indexes = (
vision_config.deepstack_visual_indexes
if hasattr(vision_config, "deepstack_visual_indexes")
else []
)
self.num_grid_per_side = int(self.num_position_embeddings**0.5)
# NOTE: This is used for creating empty tensor for all_gather for
......
......@@ -48,6 +48,7 @@ from vllm.sequence import IntermediateTensors
from .interfaces import MixtureOfExperts
from .qwen3_moe import (
Qwen3MoeDecoderLayer,
Qwen3MoeForCausalLM,
Qwen3MoeModel,
Qwen3MoeSparseMoeBlock,
......@@ -82,8 +83,18 @@ class Qwen3VLMoeProcessingInfo(Qwen3VLProcessingInfo):
}
)
class Qwen3MoeLLMModel(Qwen3MoeModel):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__(vllm_config=vllm_config, prefix=prefix)
def __init__(
self,
*,
vllm_config: VllmConfig,
prefix: str = "",
decoder_layer_type: type[torch.nn.Module] = Qwen3MoeDecoderLayer,
):
super().__init__(
vllm_config=vllm_config,
prefix=prefix,
decoder_layer_type=decoder_layer_type,
)
if not get_pp_group().is_first_rank:
assert self.start_layer >= len(
vllm_config.model_config.hf_config.vision_config.deepstack_visual_indexes
......
......@@ -357,6 +357,10 @@ _MULTIMODAL_MODELS = {
"interns1",
"InternS1ForConditionalGeneration",
),
"InternS1ProForConditionalGeneration": (
"interns1_pro",
"InternS1ProForConditionalGeneration",
),
"Idefics3ForConditionalGeneration": (
"idefics3",
"Idefics3ForConditionalGeneration",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment