Commit 4eabe123 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori

parents 45840cd2 58738772
...@@ -162,20 +162,9 @@ class LlamaAttention(nn.Module): ...@@ -162,20 +162,9 @@ class LlamaAttention(nn.Module):
prefix=f"{prefix}.o_proj", prefix=f"{prefix}.o_proj",
) )
is_neox_style = True self._init_rotary_emb(config,
is_gguf = quant_config and quant_config.get_name() == "gguf" rope_scaling=rope_scaling,
if is_gguf and config.model_type == "llama": quant_config=quant_config)
is_neox_style = False
self.rotary_emb = get_rope(
self.head_dim,
rotary_dim=self.head_dim,
max_position=max_position_embeddings,
base=rope_theta,
rope_scaling=rope_scaling,
is_neox_style=is_neox_style,
partial_rotary_factor=self.partial_rotary_factor,
)
if hasattr(config, "interleaved_sliding_window"): if hasattr(config, "interleaved_sliding_window"):
interleaved_sliding_window = config.interleaved_sliding_window interleaved_sliding_window = config.interleaved_sliding_window
...@@ -214,6 +203,24 @@ class LlamaAttention(nn.Module): ...@@ -214,6 +203,24 @@ class LlamaAttention(nn.Module):
output, _ = self.o_proj(attn_output) output, _ = self.o_proj(attn_output)
return output return output
def _init_rotary_emb(self, config: LlamaConfig,
                     rope_scaling: Optional[dict[str, Any]],
                     quant_config: Optional[QuantizationConfig]) -> None:
    """Create the rotary embedding and store it on ``self.rotary_emb``.

    The embedding is built with ``get_rope`` from attributes already set on
    the instance (``head_dim``, ``max_position_embeddings``, ``rope_theta``,
    ``partial_rotary_factor``).

    Args:
        config: Model config; only ``model_type`` is read here, and only
            when the quantization config reports the name "gguf".
        rope_scaling: Passed through unchanged to ``get_rope``.
        quant_config: Optional quantization config; its ``get_name()`` is
            compared against "gguf".
    """
    # NeoX-style rotation is the default; it is switched off only for the
    # combination "gguf" quantization + model_type "llama".
    gguf_llama = (quant_config and quant_config.get_name() == "gguf"
                  and config.model_type == "llama")
    is_neox_style = not gguf_llama

    self.rotary_emb = get_rope(
        self.head_dim,
        rotary_dim=self.head_dim,
        max_position=self.max_position_embeddings,
        base=self.rope_theta,
        rope_scaling=rope_scaling,
        is_neox_style=is_neox_style,
        partial_rotary_factor=self.partial_rotary_factor,
    )
class LlamaDecoderLayer(nn.Module): class LlamaDecoderLayer(nn.Module):
......
...@@ -130,13 +130,15 @@ class LlamaModel(nn.Module): ...@@ -130,13 +130,15 @@ class LlamaModel(nn.Module):
class EagleLlamaForCausalLM(LlamaForCausalLM): class EagleLlamaForCausalLM(LlamaForCausalLM):
def __init__(self, *, vllm_config: VllmConfig, start_layer_id: int = 0): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
nn.Module.__init__(self) nn.Module.__init__(self)
self.config = vllm_config. \ self.config = vllm_config. \
speculative_config.draft_model_config.hf_config speculative_config.draft_model_config.hf_config
target_layer_num = vllm_config.model_config.get_num_layers(
vllm_config.parallel_config)
self.model = LlamaModel(vllm_config=vllm_config, self.model = LlamaModel(vllm_config=vllm_config,
prefix="model", prefix="model",
start_layer_id=start_layer_id) start_layer_id=target_layer_num)
logit_scale = getattr(self.config, "logit_scale", 1.0) logit_scale = getattr(self.config, "logit_scale", 1.0)
self.logits_processor = LogitsProcessor(self.config.vocab_size, self.logits_processor = LogitsProcessor(self.config.vocab_size,
......
...@@ -175,13 +175,15 @@ class LlamaModel(nn.Module): ...@@ -175,13 +175,15 @@ class LlamaModel(nn.Module):
class Eagle3LlamaForCausalLM(LlamaForCausalLM): class Eagle3LlamaForCausalLM(LlamaForCausalLM):
def __init__(self, *, vllm_config: VllmConfig, start_layer_id: int = 0): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
nn.Module.__init__(self) nn.Module.__init__(self)
self.config = vllm_config. \ self.config = vllm_config. \
speculative_config.draft_model_config.hf_config speculative_config.draft_model_config.hf_config
target_layer_num = vllm_config.model_config.get_num_layers(
vllm_config.parallel_config)
self.model = LlamaModel(vllm_config=vllm_config, self.model = LlamaModel(vllm_config=vllm_config,
start_layer_id=start_layer_id, prefix="model",
prefix="model") start_layer_id=target_layer_num)
logit_scale = getattr(self.config, "logit_scale", 1.0) logit_scale = getattr(self.config, "logit_scale", 1.0)
self.lm_head = ParallelLMHead( self.lm_head = ParallelLMHead(
...@@ -193,8 +195,7 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM): ...@@ -193,8 +195,7 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
self.logits_processor = LogitsProcessor(self.config.draft_vocab_size, self.logits_processor = LogitsProcessor(self.config.draft_vocab_size,
scale=logit_scale) scale=logit_scale)
self.draft_id_to_target_id = nn.Parameter( self.draft_id_to_target_id = nn.Parameter(
torch.zeros((self.config.draft_vocab_size), torch.zeros(self.config.draft_vocab_size, dtype=torch.long),
dtype=torch.long).type(torch.LongTensor),
requires_grad=False, requires_grad=False,
) )
...@@ -213,6 +214,9 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM): ...@@ -213,6 +214,9 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
) -> Optional[torch.Tensor]: ) -> Optional[torch.Tensor]:
logits = self.logits_processor(self.lm_head, hidden_states, logits = self.logits_processor(self.lm_head, hidden_states,
sampling_metadata) sampling_metadata)
if self.draft_id_to_target_id is None:
return logits
base = torch.arange(self.config.draft_vocab_size, device=logits.device) base = torch.arange(self.config.draft_vocab_size, device=logits.device)
targets = base + self.draft_id_to_target_id targets = base + self.draft_id_to_target_id
logits_new = logits.new_full(( logits_new = logits.new_full((
...@@ -245,4 +249,9 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM): ...@@ -245,4 +249,9 @@ class Eagle3LlamaForCausalLM(LlamaForCausalLM):
name = "model." + name name = "model." + name
model_weights[name] = loaded_weight model_weights[name] = loaded_weight
return loader.load_weights(model_weights.items()) loaded_weights = loader.load_weights(model_weights.items())
if 'd2t' not in loaded_weights:
self.draft_id_to_target_id = None
return loaded_weights
...@@ -721,9 +721,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): ...@@ -721,9 +721,8 @@ class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP):
batch. batch.
pixel_values: The pixels in each input image. pixel_values: The pixels in each input image.
:::{seealso} Info:
{class}`LlavaImageInputs` [LlavaImageInputs][]
:::
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
......
...@@ -135,11 +135,13 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo): ...@@ -135,11 +135,13 @@ class LlavaNextProcessingInfo(BaseLlavaProcessingInfo):
current_aspect_ratio = current_width / current_height current_aspect_ratio = current_width / current_height
if aspect_ratio > current_aspect_ratio: if aspect_ratio > current_aspect_ratio:
new_height = (original_height * current_width) // original_width new_height = int(
round(original_height * (current_width / original_width), 7))
padding = (current_height - new_height) // 2 padding = (current_height - new_height) // 2
current_height = current_height - (2 * padding) current_height = current_height - (2 * padding)
else: else:
new_width = (original_width * current_height) // original_height new_width = int(
round(original_width * (current_height / original_height), 7))
padding = (current_width - new_width) // 2 padding = (current_width - new_width) // 2
current_width = current_width - (2 * padding) current_width = current_width - (2 * padding)
...@@ -538,7 +540,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -538,7 +540,7 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
Unlike in LLaVA-1.5, the number of image tokens inputted to the language Unlike in LLaVA-1.5, the number of image tokens inputted to the language
model depends on the original size of the input image. Including the model depends on the original size of the input image. Including the
original image token in the input, the required number of image tokens original image token in the input, the required number of image tokens
is given by {func}`get_llava_next_image_feature_size`. is given by [get_llava_next_image_feature_size][].
This way, the `positions` and `attn_metadata` are consistent This way, the `positions` and `attn_metadata` are consistent
with the `input_ids`. with the `input_ids`.
...@@ -549,9 +551,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, ...@@ -549,9 +551,8 @@ class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal,
pixel_values: The pixels in each grid patch for each input image. pixel_values: The pixels in each grid patch for each input image.
image_sizes: The original `(height, width)` for each input image. image_sizes: The original `(height, width)` for each input image.
:::{seealso} Info:
{class}`LlavaNextImageInputs` [LlavaNextImageInputs][]
:::
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
......
...@@ -116,11 +116,13 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo): ...@@ -116,11 +116,13 @@ class LlavaOnevisionProcessingInfo(LlavaNextProcessingInfo):
current_aspect_ratio = current_width / current_height current_aspect_ratio = current_width / current_height
if aspect_ratio > current_aspect_ratio: if aspect_ratio > current_aspect_ratio:
new_height = (original_height * current_width) // original_width new_height = int(
round(original_height * (current_width / original_width), 7))
padding = (current_height - new_height) // 2 padding = (current_height - new_height) // 2
current_height = current_height - (2 * padding) current_height = current_height - (2 * padding)
else: else:
new_width = (original_width * current_height) // original_height new_width = int(
round(original_width * (current_height / original_height), 7))
padding = (current_width - new_width) // 2 padding = (current_width - new_width) // 2
current_width = current_width - (2 * padding) current_width = current_width - (2 * padding)
......
...@@ -51,10 +51,7 @@ class Medusa(nn.Module): ...@@ -51,10 +51,7 @@ class Medusa(nn.Module):
needs to have truncated_vocab_size (=k) as an attribute.""" needs to have truncated_vocab_size (=k) as an attribute."""
def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
if hasattr(vllm_config, 'draft_model_config'): config = vllm_config.speculative_config.draft_model_config.hf_config
config = vllm_config.draft_model_config.hf_config
else:
config = vllm_config.model_config.hf_config
super().__init__() super().__init__()
self.config = config self.config = config
self.blocks = nn.ModuleList([ self.blocks = nn.ModuleList([
......
...@@ -250,7 +250,7 @@ class MiMoMTP(nn.Module): ...@@ -250,7 +250,7 @@ class MiMoMTP(nn.Module):
return loaded_params return loaded_params
def map_model_name_to_mtp_param_name(self, name: str) -> str: def map_model_name_to_mtp_param_name(self, name: str) -> str:
import re import regex as re
name_without_prefix = [ name_without_prefix = [
"token_layernorm", "hidden_layernorm", "input_proj", "token_layernorm", "hidden_layernorm", "input_proj",
"final_layernorm" "final_layernorm"
......
...@@ -242,9 +242,6 @@ class MiniCPMAttention(nn.Module): ...@@ -242,9 +242,6 @@ class MiniCPMAttention(nn.Module):
base=rope_theta, base=rope_theta,
rope_scaling=rope_scaling, rope_scaling=rope_scaling,
) )
# set rope as fp32 instead of bf16
self.rotary_emb.cos_sin_cache = self.rotary_emb._compute_cos_sin_cache(
)
self.attn = Attention(self.num_heads, self.attn = Attention(self.num_heads,
self.head_dim, self.head_dim,
self.scaling, self.scaling,
......
...@@ -2,10 +2,10 @@ ...@@ -2,10 +2,10 @@
"""Inference-only MiniMaxText01 model.""" """Inference-only MiniMaxText01 model."""
import copy import copy
import math import math
import re
from collections.abc import Iterable from collections.abc import Iterable
from typing import Optional, Union from typing import Optional, Union
import regex as re
import torch import torch
import torch.distributed import torch.distributed
import torch.nn.functional as F import torch.nn.functional as F
...@@ -604,8 +604,9 @@ class MiniMaxText01DecoderLayer(nn.Module): ...@@ -604,8 +604,9 @@ class MiniMaxText01DecoderLayer(nn.Module):
rope_theta = getattr(config, "rope_theta", 10000) rope_theta = getattr(config, "rope_theta", 10000)
head_dim = getattr(config, "head_dim", head_dim = getattr(config, "head_dim", None)
config.hidden_size // config.num_attention_heads) if head_dim is None:
head_dim = config.hidden_size // config.num_attention_heads
if hasattr(config, "max_model_len") and isinstance( if hasattr(config, "max_model_len") and isinstance(
config.max_model_len, int): config.max_model_len, int):
max_position_embeddings = min(config.max_position_embeddings, max_position_embeddings = min(config.max_position_embeddings,
...@@ -861,8 +862,9 @@ class MiniMaxText01Model(nn.Module): ...@@ -861,8 +862,9 @@ class MiniMaxText01Model(nn.Module):
cache_shape=self.cache_shape) cache_shape=self.cache_shape)
rope_theta = getattr(config, "rope_theta", 10000) rope_theta = getattr(config, "rope_theta", 10000)
head_dim = getattr(config, "head_dim", head_dim = getattr(config, "head_dim", None)
config.hidden_size // config.num_attention_heads) if head_dim is None:
head_dim = config.hidden_size // config.num_attention_heads
if hasattr(config, "max_model_len") and isinstance( if hasattr(config, "max_model_len") and isinstance(
config.max_model_len, int): config.max_model_len, int):
max_position_embeddings = min(config.max_position_embeddings, max_position_embeddings = min(config.max_position_embeddings,
......
...@@ -559,9 +559,8 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA, ...@@ -559,9 +559,8 @@ class Mistral3ForConditionalGeneration(nn.Module, SupportsLoRA,
batch. batch.
pixel_values: The pixels in each input image. pixel_values: The pixels in each input image.
:::{seealso} Info:
{class}`Mistral3ImagePixelInputs` [Mistral3ImagePixelInputs][]
:::
""" """
if intermediate_tensors is not None: if intermediate_tensors is not None:
inputs_embeds = None inputs_embeds = None
......
...@@ -138,8 +138,9 @@ class MixtralAttention(nn.Module): ...@@ -138,8 +138,9 @@ class MixtralAttention(nn.Module):
assert tp_size % self.total_num_kv_heads == 0 assert tp_size % self.total_num_kv_heads == 0
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
# MixtralConfig has an optional head_dim argument # MixtralConfig has an optional head_dim argument
self.head_dim = getattr(config, "head_dim", self.head_dim = getattr(config, "head_dim", None)
self.hidden_size // self.total_num_heads) if self.head_dim is None:
self.head_dim = self.hidden_size // self.total_num_heads
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
...@@ -482,5 +483,5 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -482,5 +483,5 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def load_weights(self, weights: Iterable[tuple[str, def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]: torch.Tensor]]) -> set[str]:
loader = AutoWeightsLoader(self, skip_prefixes=["rotary_emb.inv_freq"]) loader = AutoWeightsLoader(self)
return loader.load_weights(weights) return loader.load_weights(weights)
...@@ -193,8 +193,9 @@ class MixtralAttention(nn.Module): ...@@ -193,8 +193,9 @@ class MixtralAttention(nn.Module):
assert tp_size % self.total_num_kv_heads == 0 assert tp_size % self.total_num_kv_heads == 0
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
# MixtralConfig has an optional head_dim argument # MixtralConfig has an optional head_dim argument
self.head_dim = getattr(config, "head_dim", self.head_dim = getattr(config, "head_dim", None)
self.hidden_size // self.total_num_heads) if self.head_dim is None:
self.head_dim = self.hidden_size // self.total_num_heads
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
...@@ -447,8 +448,5 @@ class MixtralForCausalLM(nn.Module, SupportsPP): ...@@ -447,8 +448,5 @@ class MixtralForCausalLM(nn.Module, SupportsPP):
def load_weights(self, weights: Iterable[tuple[str, def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]: torch.Tensor]]) -> set[str]:
loader = AutoWeightsLoader( loader = AutoWeightsLoader(self)
self,
skip_prefixes=(["rotary_emb.inv_freq"]),
)
return loader.load_weights(weights) return loader.load_weights(weights)
...@@ -965,7 +965,7 @@ def select_tiling( ...@@ -965,7 +965,7 @@ def select_tiling(
class MolmoProcessorWrapper: class MolmoProcessorWrapper:
""" """
Wraps {class}`MolmoProcessor` so that it can be called directly. Wraps `MolmoProcessor` so that it can be called directly.
The original definition can be found here: The original definition can be found here:
https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/preprocessing_molmo.py https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/preprocessing_molmo.py
......
...@@ -158,8 +158,9 @@ class NemotronAttention(nn.Module): ...@@ -158,8 +158,9 @@ class NemotronAttention(nn.Module):
assert tp_size % self.total_num_kv_heads == 0 assert tp_size % self.total_num_kv_heads == 0
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
# MistralConfig has an optional head_dim introduced by Mistral-Nemo # MistralConfig has an optional head_dim introduced by Mistral-Nemo
self.head_dim = getattr(config, "head_dim", self.head_dim = getattr(config, "head_dim", None)
self.hidden_size // self.total_num_heads) if self.head_dim is None:
self.head_dim = self.hidden_size // self.total_num_heads
self.q_size = self.num_heads * self.head_dim self.q_size = self.num_heads * self.head_dim
self.kv_size = self.num_kv_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim
self.scaling = self.head_dim**-0.5 self.scaling = self.head_dim**-0.5
...@@ -502,14 +503,5 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP): ...@@ -502,14 +503,5 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
def load_weights(self, weights: Iterable[tuple[str, def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]: torch.Tensor]]) -> set[str]:
loader = AutoWeightsLoader( loader = AutoWeightsLoader(self)
self,
skip_prefixes=([
"rotary_emb.inv_freq",
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
"rotary_emb.cos_cached",
"rotary_emb.sin_cached"
]),
)
return loader.load_weights(weights) return loader.load_weights(weights)
...@@ -23,18 +23,20 @@ ...@@ -23,18 +23,20 @@
# limitations under the License. # limitations under the License.
"""Inference-only deci model compatible with HuggingFace weights.""" """Inference-only deci model compatible with HuggingFace weights."""
from collections.abc import Iterable from collections.abc import Iterable
from typing import Optional, Union from typing import Any, Optional, Union
import torch import torch
from torch import nn from torch import nn
from transformers import LlamaConfig from transformers import LlamaConfig
from vllm.attention import AttentionType
from vllm.compilation.decorators import support_torch_compile from vllm.compilation.decorators import support_torch_compile
from vllm.config import CacheConfig, VllmConfig from vllm.config import CacheConfig, VllmConfig
from vllm.distributed import get_pp_group from vllm.distributed import get_pp_group
from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.model_loader.weight_utils import (
...@@ -62,6 +64,48 @@ def _find_multiple(n: int, k: int) -> int: ...@@ -62,6 +64,48 @@ def _find_multiple(n: int, k: int) -> int:
return n + k - (n % k) return n + k - (n % k)
class DeciLMAttention(LlamaAttention):
    """``LlamaAttention`` whose rotary layout is chosen from
    ``config.position_embedding_type`` instead of the quantization config."""

    def __init__(
        self,
        config: LlamaConfig,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
        rope_theta: float = 10000,
        rope_scaling: Optional[dict[str, Any]] = None,
        max_position_embeddings: int = 8192,
        quant_config: Optional[QuantizationConfig] = None,
        bias: bool = False,
        bias_o_proj: bool = False,
        cache_config: Optional[CacheConfig] = None,
        prefix: str = "",
        attn_type: str = AttentionType.DECODER,
    ) -> None:
        """Forward every argument, positionally and in order, to the parent
        constructor."""
        super().__init__(
            config,
            hidden_size,
            num_heads,
            num_kv_heads,
            rope_theta,
            rope_scaling,
            max_position_embeddings,
            quant_config,
            bias,
            bias_o_proj,
            cache_config,
            prefix,
            attn_type,
        )

    def _init_rotary_emb(self, config, rope_scaling: Optional[dict[str, Any]],
                         quant_config: Optional[QuantizationConfig]) -> None:
        """Create the rotary embedding and store it on ``self.rotary_emb``.

        Args:
            config: Model config; ``position_embedding_type`` is read when
                the attribute exists.
            rope_scaling: Passed through unchanged to ``get_rope``.
            quant_config: Not read in this override.
        """
        # Enables YARN for Mistral and LLaMA4 derivatives: those position
        # embedding types turn NeoX-style rotation off.
        non_neox_types = ["mistral_yarn", "rope_llama4"]
        if hasattr(config, "position_embedding_type"):
            use_neox = config.position_embedding_type not in non_neox_types
        else:
            use_neox = True

        self.rotary_emb = get_rope(
            self.head_dim,
            rotary_dim=self.head_dim,
            max_position=self.max_position_embeddings,
            base=self.rope_theta,
            rope_scaling=rope_scaling,
            is_neox_style=use_neox,
            partial_rotary_factor=self.partial_rotary_factor,
        )
class DeciLMDecoderLayer(nn.Module): class DeciLMDecoderLayer(nn.Module):
def __init__( def __init__(
...@@ -98,7 +142,7 @@ class DeciLMDecoderLayer(nn.Module): ...@@ -98,7 +142,7 @@ class DeciLMDecoderLayer(nn.Module):
if not self._is_no_op_attention: if not self._is_no_op_attention:
num_kv_heads = (config.num_attention_heads // num_kv_heads = (config.num_attention_heads //
block_config.attention.n_heads_in_group) block_config.attention.n_heads_in_group)
self.self_attn = LlamaAttention( self.self_attn = DeciLMAttention(
config=config, config=config,
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
num_heads=config.num_attention_heads, num_heads=config.num_attention_heads,
......
...@@ -22,9 +22,10 @@ from vllm.multimodal.processing import (PromptReplacement, PromptUpdate, ...@@ -22,9 +22,10 @@ from vllm.multimodal.processing import (PromptReplacement, PromptUpdate,
PromptUpdateDetails) PromptUpdateDetails)
from .intern_vit import InternVisionModel from .intern_vit import InternVisionModel
from .internvl import (BaseInternVLProcessingInfo, BaseInternVLProcessor, from .internvl import (BaseInternVLDummyInputsBuilder,
InternVLChatModel, InternVLDummyInputsBuilder, BaseInternVLMultiModalProcessor,
InternVLMultiModalProcessor) BaseInternVLProcessingInfo, BaseInternVLProcessor,
InternVLChatModel)
IMG_PAD = "<|vision_pad|>" IMG_PAD = "<|vision_pad|>"
...@@ -84,7 +85,8 @@ class NVLMProcessingInfo(BaseInternVLProcessingInfo): ...@@ -84,7 +85,8 @@ class NVLMProcessingInfo(BaseInternVLProcessingInfo):
) )
class NVLMDummyInputsBuilder(InternVLDummyInputsBuilder[NVLMProcessingInfo]): class NVLMDummyInputsBuilder(BaseInternVLDummyInputsBuilder[NVLMProcessingInfo]
):
def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
num_images = mm_counts.get("image", 0) num_images = mm_counts.get("image", 0)
...@@ -110,7 +112,8 @@ class NVLMDummyInputsBuilder(InternVLDummyInputsBuilder[NVLMProcessingInfo]): ...@@ -110,7 +112,8 @@ class NVLMDummyInputsBuilder(InternVLDummyInputsBuilder[NVLMProcessingInfo]):
} }
class NVLMMultiModalProcessor(InternVLMultiModalProcessor[NVLMProcessingInfo]): class NVLMMultiModalProcessor(
BaseInternVLMultiModalProcessor[NVLMProcessingInfo]):
def _get_prompt_updates( def _get_prompt_updates(
self, self,
......
...@@ -382,19 +382,7 @@ class OlmoForCausalLM(nn.Module, SupportsPP): ...@@ -382,19 +382,7 @@ class OlmoForCausalLM(nn.Module, SupportsPP):
torch.Tensor]]) -> set[str]: torch.Tensor]]) -> set[str]:
loader = AutoWeightsLoader( loader = AutoWeightsLoader(
self, self,
skip_prefixes=([ skip_prefixes=(["lm_head.weight"]
"rotary_emb.inv_freq", if self.config.tie_word_embeddings else None),
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
"rotary_emb.cos_cached",
"rotary_emb.sin_cached",
"lm_head.weight"
] if self.config.tie_word_embeddings else [
"rotary_emb.inv_freq",
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
"rotary_emb.cos_cached",
"rotary_emb.sin_cached"
]),
) )
return loader.load_weights(weights) return loader.load_weights(weights)
...@@ -314,7 +314,8 @@ class Olmo2Model(nn.Module): ...@@ -314,7 +314,8 @@ class Olmo2Model(nn.Module):
hidden_states = self.norm(hidden_states) hidden_states = self.norm(hidden_states)
return hidden_states return hidden_states
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]:
stacked_params_mapping = [ stacked_params_mapping = [
# (param_name, shard_name, shard_id) # (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"), ("qkv_proj", "q_proj", "q"),
...@@ -325,6 +326,7 @@ class Olmo2Model(nn.Module): ...@@ -325,6 +326,7 @@ class Olmo2Model(nn.Module):
] ]
params_dict = dict(self.named_parameters(remove_duplicate=False)) params_dict = dict(self.named_parameters(remove_duplicate=False))
loaded_params: set[str] = set()
for name, loaded_weight in weights: for name, loaded_weight in weights:
if is_pp_missing_parameter(name, self): if is_pp_missing_parameter(name, self):
continue continue
...@@ -347,6 +349,8 @@ class Olmo2Model(nn.Module): ...@@ -347,6 +349,8 @@ class Olmo2Model(nn.Module):
weight_loader = getattr(param, "weight_loader", weight_loader = getattr(param, "weight_loader",
default_weight_loader) default_weight_loader)
weight_loader(param, loaded_weight) weight_loader(param, loaded_weight)
loaded_params.add(name)
return loaded_params
class Olmo2ForCausalLM(nn.Module, SupportsPP): class Olmo2ForCausalLM(nn.Module, SupportsPP):
...@@ -403,19 +407,7 @@ class Olmo2ForCausalLM(nn.Module, SupportsPP): ...@@ -403,19 +407,7 @@ class Olmo2ForCausalLM(nn.Module, SupportsPP):
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
loader = AutoWeightsLoader( loader = AutoWeightsLoader(
self, self,
skip_prefixes=([ skip_prefixes=(["lm_head.weight"]
"rotary_emb.inv_freq", if self.config.tie_word_embeddings else None),
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
"rotary_emb.cos_cached",
"rotary_emb.sin_cached",
"lm_head.weight"
] if self.config.tie_word_embeddings else [
"rotary_emb.inv_freq",
# Models trained using ColossalAI may include these tensors in
# the checkpoint. Skip them.
"rotary_emb.cos_cached",
"rotary_emb.sin_cached"
]),
) )
return loader.load_weights(weights) return loader.load_weights(weights)
...@@ -442,8 +442,5 @@ class OlmoeForCausalLM(nn.Module, SupportsPP): ...@@ -442,8 +442,5 @@ class OlmoeForCausalLM(nn.Module, SupportsPP):
def load_weights(self, weights: Iterable[tuple[str, def load_weights(self, weights: Iterable[tuple[str,
torch.Tensor]]) -> set[str]: torch.Tensor]]) -> set[str]:
loader = AutoWeightsLoader( loader = AutoWeightsLoader(self)
self,
skip_prefixes=["rotary_emb.inv_freq"],
)
return loader.load_weights(weights) return loader.load_weights(weights)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment