Unverified Commit 111d8690 authored by Jakub Zakrzewski's avatar Jakub Zakrzewski Committed by GitHub
Browse files

[Model] Add nvidia/llama-nemotron-embed-vl-1b-v2 multimodal embedding model (#35297)


Signed-off-by: default avatarJakub Zakrzewski <jzakrzewski@nvidia.com>
parent 7fea7250
......@@ -498,6 +498,67 @@ curl -s http://localhost:8000/pooling -H "Content-Type: application/json" -d '{
- Multi-vector retrieval: [examples/pooling/token_embed/colqwen3_token_embed_online.py](../../examples/pooling/token_embed/colqwen3_token_embed_online.py)
- Reranking (text + multi-modal): [examples/pooling/score/colqwen3_rerank_online.py](../../examples/pooling/score/colqwen3_rerank_online.py)
### Llama Nemotron Multimodal Embedding Models
Llama Nemotron VL Embedding models combine the bidirectional Llama embedding backbone
(from `nvidia/llama-nemotron-embed-1b-v2`) with SigLIP as the vision encoder to produce
single-vector embeddings from text and/or images.
| Architecture | Backbone | Example HF Models |
|---|---|---|
| `LlamaNemotronVLModel` | Bidirectional Llama + SigLIP | `nvidia/llama-nemotron-embed-vl-1b-v2` |
Start the server:
```shell
vllm serve nvidia/llama-nemotron-embed-vl-1b-v2 \
--trust-remote-code \
--chat-template examples/pooling/embed/template/nemotron_embed_vl.jinja
```
!!! note
    The chat template bundled with this model's tokenizer is not suitable for
    the embeddings API. Use the provided override template above when serving
    with the `messages`-based (chat-style) embeddings endpoint.

    The override template uses the message `role` to automatically prepend the
    appropriate prefix: set `role` to `"query"` for queries (prepends `query: `)
    or `"document"` for passages (prepends `passage: `). Any other role omits
    the prefix.
Embed text queries:
```shell
curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json" -d '{
"model": "nvidia/llama-nemotron-embed-vl-1b-v2",
"messages": [
{
"role": "query",
"content": [
{"type": "text", "text": "What is machine learning?"}
]
}
]
}'
```
Embed images via the chat-style `messages` field:
```shell
curl -s http://localhost:8000/v1/embeddings -H "Content-Type: application/json" -d '{
"model": "nvidia/llama-nemotron-embed-vl-1b-v2",
"messages": [
{
"role": "document",
"content": [
{"type": "image_url", "image_url": {"url": "data:image/png;base64,<BASE64>"}},
{"type": "text", "text": "Describe the image."}
]
}
]
}'
```
### BAAI/bge-m3
The `BAAI/bge-m3` model comes with extra weights for sparse and colbert embeddings but unfortunately in its `config.json`
......
......@@ -821,6 +821,7 @@ The following table lists those that are tested in vLLM.
|--------------|--------|--------|-------------------|----------------------|---------------------------|
| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | |
| `ColModernVBertForRetrieval` | ColModernVBERT | T / I | `ModernVBERT/colmodernvbert-merged` | | |
| `LlamaNemotronVLModel` | Llama Nemotron Embedding + SigLIP | T + I | `nvidia/llama-nemotron-embed-vl-1b-v2` | | |
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ |
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ |
| `Qwen3VLForConditionalGeneration`<sup>C</sup> | Qwen3-VL | T + I + V | `Qwen/Qwen3-VL-Embedding-2B`, etc. | ✅︎ | ✅︎ |
......
{#- Embedding chat template: renders a single message as
    bos_token + role prefix + image placeholders + text parts. -#}
{#- Reject multi-message conversations: one request embeds one message. -#}
{%- if messages | length > 1 -%}
{{ raise_exception('Embedding models should only embed one message at a time') }}
{%- endif -%}
{#- A namespace is used so assignments made inside the loops below stay visible afterwards. -#}
{% set vars = namespace(prefix='', images=[], texts=[]) %}
{%- for message in messages -%}
{#- The role selects the prefix: 'query' or 'document'; any other role keeps the empty prefix. -#}
{%- if message['role'] == 'query' -%}
{%- set vars.prefix = 'query: ' %}
{%- elif message['role'] == 'document' -%}
{%- set vars.prefix = 'passage: ' %}
{%- endif -%}
{#- Collect text parts verbatim and one '<image> ' placeholder per image part;
    content parts of any other type are ignored. -#}
{%- for content in message['content'] -%}
{%- if content['type'] == 'text' -%}
{%- set vars.texts = vars.texts + [content['text']] %}
{%- elif content['type'] == 'image' -%}
{%- set vars.images = vars.images + ['<image> '] %}
{%- endif -%}
{%- endfor -%}
{%- endfor -%}
{#- All image placeholders are emitted before all text parts, regardless of their
    order in the message; parts are concatenated with no separator. -#}
{{- bos_token }}{{ vars.prefix }}{{ (vars.images + vars.texts) | join('') }}
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for LlamaNemotronVL embedding model (nvidia/llama-nemotron-embed-vl-1b-v2).
This model uses SigLIP vision encoder with bidirectional LLaMA for embeddings.
"""
import pytest
import torch
from transformers import AutoModel
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ...utils import check_embeddings_close
# Prompt prefixes. _run_test uses them to decide whether an input is sent to
# the HF model's encode_queries or encode_documents method.
QUERY_PREFIX = "query: "
PASSAGE_PREFIX = "passage: "
# Text-only prompts; every entry carries the query prefix.
HF_TEXT_PROMPTS = [
    # T -> X (text embedding queries)
    f"{QUERY_PREFIX}The label of the object is stop sign",
    f"{QUERY_PREFIX}cherry blossom",
]
# Image prompts keyed by image asset name; each is the passage prefix followed
# by a single <image> placeholder.
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
    {
        # I -> X (image embedding as passage/document)
        "stop_sign": f"{PASSAGE_PREFIX}<image>",
        "cherry_blossom": f"{PASSAGE_PREFIX}<image>",
    }
)
# Model identifiers the tests below are parametrized over.
MODELS = ["nvidia/llama-nemotron-embed-vl-1b-v2"]
def _run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    input_texts: list[str],
    input_images: PromptImageInput,
    model: str,
    *,
    dtype: str,
) -> None:
    """Compare vLLM embeddings against the HF reference for the same inputs.

    Every entry of ``input_texts`` must start with ``QUERY_PREFIX`` or
    ``PASSAGE_PREFIX``. The prefix is stripped before the text is handed to
    the HF model's ``encode_queries`` / ``encode_documents``; vLLM receives
    the prompts unchanged. ``input_images`` is iterated in lockstep with
    ``input_texts``; a ``None`` entry means a text-only input.

    NOTE: Run vLLM first to avoid CUDA initialization issues with
    multiprocessing.

    Raises:
        ValueError: if a text starts with neither known prefix.
    """

    def _hf_embed(encoder, prompt, picture):
        # Route one (prompt, picture) pair to the matching HF encode method.
        if prompt.startswith(QUERY_PREFIX):
            return encoder.encode_queries([prompt[len(QUERY_PREFIX) :]])
        if not prompt.startswith(PASSAGE_PREFIX):
            raise ValueError(
                f"Text must start with '{QUERY_PREFIX}' or '{PASSAGE_PREFIX}'"
            )
        passage = prompt[len(PASSAGE_PREFIX) :]
        if picture is None:
            # Text-only document
            return encoder.encode_documents(texts=[passage])
        # Image document: the image is passed alongside the remaining text
        return encoder.encode_documents(images=[picture], texts=[passage])

    # vLLM side
    with vllm_runner(
        model,
        runner="pooling",
        dtype=dtype,
        max_model_len=2048,
        enforce_eager=True,
        trust_remote_code=True,
    ) as vllm_model:
        vllm_outputs = vllm_model.embed(input_texts, images=input_images)

    # HF side, one input at a time through the model's own encode_* API
    hf_outputs = []
    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        for prompt, picture in zip(input_texts, input_images):
            with torch.inference_mode():
                embedding = _hf_embed(hf_model.model, prompt, picture)
                hf_outputs.append(embedding[0].tolist())

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_models_text(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Check text-only embeddings: every prompt is paired with no image."""
    input_texts = list(HF_TEXT_PROMPTS)
    # One ``None`` per prompt marks the input as text-only.
    input_images = [None] * len(input_texts)

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,  # type: ignore
        model,
        dtype=dtype,
    )
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_models_image(
    hf_runner,
    vllm_runner,
    image_assets,
    model: str,
    dtype: str,
) -> None:
    """Check image embeddings: each image prompt is paired with its asset's PIL image."""
    input_texts = []
    input_images = []
    # Prompts and assets are consumed in lockstep.
    for prompt, asset in zip(HF_IMAGE_PROMPTS, image_assets):
        input_texts.append(prompt)
        input_images.append(asset.pil_image)

    _run_test(
        hf_runner,
        vllm_runner,
        input_texts,
        input_images,
        model,
        dtype=dtype,
    )
......@@ -598,6 +598,9 @@ _EMBEDDING_EXAMPLE_MODELS = {
"ColModernVBertForRetrieval": _HfExamplesInfo(
"ModernVBERT/colmodernvbert-merged",
),
"LlamaNemotronVLModel": _HfExamplesInfo(
"nvidia/llama-nemotron-embed-vl-1b-v2", trust_remote_code=True
),
"LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"),
"Phi3VForCausalLM": _HfExamplesInfo(
"TIGER-Lab/VLM2Vec-Full", trust_remote_code=True
......
......@@ -112,6 +112,42 @@ class LlamaBidirectionalConfig(VerifyAndUpdateConfig):
model_config.pooler_config.seq_pooling_type = pooling_type
class LlamaNemotronVLConfig(VerifyAndUpdateConfig):
    """Config handler for LlamaNemotronVL embedding models.

    Marks the HF config (and its ``llm_config``, when present) as non-causal,
    copies the vision patch size to the top-level config, and selects the
    sequence pooling type from the config's ``pooling`` setting.
    """

    @staticmethod
    def verify_and_update_model_config(model_config: "ModelConfig") -> None:
        """Update ``model_config`` in place for the embedding model.

        Raises:
            ValueError: if the resolved ``pooling`` value is not one of
                ``"avg"``, ``"cls"`` or ``"last"``.
        """
        from vllm.config.pooler import SequencePoolingType

        hf_config = model_config.hf_config
        has_llm_config = hasattr(hf_config, "llm_config")

        # Bidirectional attention on the top-level and language model configs
        hf_config.is_causal = False
        if has_llm_config:
            hf_config.llm_config.is_causal = False

        # Mirror the vision encoder's patch size on the top-level config
        if hasattr(hf_config, "vision_config"):
            hf_config.patch_size = hf_config.vision_config.patch_size

        # The top-level ``pooling`` wins; otherwise fall back to llm_config,
        # where a missing attribute defaults to "avg".
        pooling = getattr(hf_config, "pooling", None)
        if pooling is None and has_llm_config:
            pooling = getattr(hf_config.llm_config, "pooling", "avg")

        supported_pooling: dict[str, SequencePoolingType] = {
            "avg": "MEAN",
            "cls": "CLS",
            "last": "LAST",
        }
        pooling_type = supported_pooling.get(pooling)
        if pooling_type is None:
            raise ValueError(f"pool_type {pooling!r} not supported")

        model_config.pooler_config.seq_pooling_type = pooling_type
class NomicBertModelConfig(VerifyAndUpdateConfig):
@staticmethod
def verify_and_update_model_config(model_config: "ModelConfig") -> None:
......@@ -619,6 +655,7 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
"Gemma3TextModel": Gemma3TextModelConfig,
"LlamaBidirectionalForSequenceClassification": LlamaBidirectionalConfig,
"LlamaBidirectionalModel": LlamaBidirectionalConfig,
"LlamaNemotronVLModel": LlamaNemotronVLConfig,
"NomicBertModel": NomicBertModelConfig,
"Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig,
"Qwen2ForRewardModel": Qwen2ForRewardModelConfig,
......
......@@ -18,6 +18,7 @@ from transformers import AutoModel, PretrainedConfig
from transformers.image_processing_utils_fast import BaseImageProcessorFast
from vllm.config import VllmConfig
from vllm.model_executor.layers.pooler import DispatchPooler
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.quantization.awq import AWQConfig
from vllm.model_executor.models.internvl import (
......@@ -30,12 +31,14 @@ from vllm.model_executor.models.internvl import (
InternVLProcessor,
)
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.models.siglip import SiglipVisionModel
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.processing import PromptUpdateDetails
from vllm.sequence import IntermediateTensors
from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.processor import cached_image_processor_from_config
from vllm.transformers_utils.repo_utils import get_hf_file_to_dict
from .interfaces import (
MultiModalEmbeddings,
......@@ -43,11 +46,13 @@ from .interfaces import (
SupportsMultiModal,
SupportsPP,
)
from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<image>"
from .interfaces_base import VllmModelForPooling
from .utils import (
AutoWeightsLoader,
WeightsMapper,
init_vllm_registered_model,
maybe_prefix,
)
def build_transform(input_size: int):
......@@ -183,10 +188,12 @@ def image_to_pixel_values_nemotron_vl(
min_num: int,
max_num: int,
use_thumbnail: bool,
transform: T.Compose | None = None,
) -> torch.Tensor:
target_ratios = get_nemotron_vl_target_ratios(min_num, max_num)
transform = build_transform(input_size=input_size)
if transform is None:
transform = build_transform(input_size=input_size)
images = dynamic_preprocess_nemotron_vl(
image,
......@@ -200,11 +207,15 @@ def image_to_pixel_values_nemotron_vl(
class NemotronVLProcessor(InternVLProcessor):
IMG_START = "<img>"
IMG_END = "</img>"
IMG_CONTEXT = "<image>"
def __init__(
self,
config: PretrainedConfig,
tokenizer: TokenizerLike,
image_processor: BaseImageProcessorFast,
image_processor: BaseImageProcessorFast | None = None,
*,
min_dynamic_patch: int | None = None,
max_dynamic_patch: int | None = None,
......@@ -236,11 +247,18 @@ class NemotronVLProcessor(InternVLProcessor):
self.min_dynamic_patch = min_dynamic_patch
self.max_dynamic_patch = max_dynamic_patch
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail: bool = self.image_processor.use_thumbnail
if image_processor is not None:
self.use_thumbnail = image_processor.use_thumbnail
else:
self.use_thumbnail = getattr(config, "use_thumbnail", True)
@property
def image_token_id(self) -> int:
return self.tokenizer.get_vocab()[IMG_CONTEXT]
return self.tokenizer.get_vocab()[self.IMG_CONTEXT]
def _get_transform(self) -> T.Compose:
    """Return the image transform built for ``self.image_size``.

    Kept as a separate method so subclasses can supply a different transform.
    """
    return build_transform(input_size=self.image_size)
def get_num_image_tokens(
self,
......@@ -283,10 +301,26 @@ class NemotronVLProcessor(InternVLProcessor):
min_num=min_num,
max_num=max_num,
use_thumbnail=self.use_thumbnail,
transform=self._get_transform(),
)
for image in images
]
def _replace_image_tokens(
    self,
    text: list[str],
    pixel_values_lst: list[torch.Tensor],
) -> list[str]:
    """Expand ``<image>`` placeholders into the full image token sequence.

    For each tensor in ``pixel_values_lst``, the first remaining ``<image>``
    in every prompt is replaced by that image's replacement text. The feature
    size passed to ``get_image_repl`` is the tensor's first dimension times
    ``self.num_image_token``.
    """
    # The replacement text itself may contain "<image>", so it is inserted
    # with a temporary marker; otherwise the next image would overwrite
    # tokens that were just inserted.
    marker = "<NVL_IMG_CONTEXT>"
    prompts = text
    for tiles in pixel_values_lst:
        tile_count = tiles.shape[0]
        repl = self.get_image_repl(tile_count * self.num_image_token, tile_count)
        masked_repl = repl.full.replace("<image>", marker)
        prompts = [prompt.replace("<image>", masked_repl, 1) for prompt in prompts]

    # Swap the temporary marker for the real context token.
    return [prompt.replace(marker, self.IMG_CONTEXT) for prompt in prompts]
def _preprocess_image(
self,
text: list[str],
......@@ -311,15 +345,7 @@ class NemotronVLProcessor(InternVLProcessor):
),
}
for pixel_values in pixel_values_lst:
num_patches = pixel_values.shape[0]
feature_size = num_patches * self.num_image_token
image_repl = self.get_image_repl(feature_size, num_patches)
NVL_IMAGE_CONTEXT = image_repl.full.replace(
"<image>", "<NVL_IMG_CONTEXT>"
)
text = [t.replace("<image>", NVL_IMAGE_CONTEXT, 1) for t in text]
text = [t.replace("<NVL_IMG_CONTEXT>", IMG_CONTEXT) for t in text]
text = self._replace_image_tokens(text, pixel_values_lst)
return text, image_inputs
def get_image_repl(
......@@ -327,10 +353,10 @@ class NemotronVLProcessor(InternVLProcessor):
feature_size: int,
num_patches: int | None,
) -> PromptUpdateDetails[str]:
repl_features = IMG_CONTEXT * feature_size
repl_full = IMG_START + repl_features + IMG_END
repl_features = self.IMG_CONTEXT * feature_size
repl_full = self.IMG_START + repl_features + self.IMG_END
return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
return PromptUpdateDetails.select_text(repl_full, self.IMG_CONTEXT)
class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
......@@ -396,7 +422,7 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
with self._mark_language_model(vllm_config):
self.language_model = init_vllm_registered_model(
vllm_config=vllm_config,
hf_config=config.text_config,
hf_config=config.get_text_config(),
prefix=maybe_prefix(prefix, "language_model"),
)
......@@ -413,7 +439,7 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
# the awq models from OpenGVLab missing `modules_to_not_convert`
# patch the quant_config to add `modules_to_not_convert` back
if isinstance(quant_config, AWQConfig):
text_config = config.text_config
text_config = config.get_text_config()
llm_quant_config = getattr(text_config, "quantization_config", None)
if (not quant_config.modules_to_not_convert) and (
llm_quant_config is not None
......@@ -429,10 +455,17 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
):
return AutoModel.from_config(config.vision_config, trust_remote_code=True)
def _init_mlp1(self, config: PretrainedConfig) -> nn.Module:
vit_hidden_size = config.vit_hidden_size
vision_projection_hidden_size = config.projector_hidden_size
llm_hidden_size = config.text_config.hidden_size
def _init_mlp1(
self,
config: PretrainedConfig,
vit_hidden_size: int | None = None,
vision_projection_hidden_size: int | None = None,
) -> nn.Module:
if vit_hidden_size is None:
vit_hidden_size = config.vit_hidden_size
if vision_projection_hidden_size is None:
vision_projection_hidden_size = config.projector_hidden_size
llm_hidden_size = config.get_text_config().hidden_size
return nn.Sequential(
nn.LayerNorm(
......@@ -465,10 +498,18 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
x = x.permute(0, 2, 1, 3).contiguous()
return x
def _call_vision_model(self, pixel_values: torch.Tensor) -> torch.Tensor:
    """Run the vision model on ``pixel_values`` and return its features as bfloat16.

    Subclasses override this hook when their vision model has a different
    calling convention (e.g., SigLIP vs C-RADIO).
    """
    vision_output = self.vision_model(x=pixel_values)
    return vision_output.features.to(dtype=torch.bfloat16)
def extract_feature(self, pixel_values: torch.Tensor) -> torch.Tensor:
# https://huggingface.co/nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1/blob/main/modeling.py#L177
vit_embeds = self.vision_model(x=pixel_values).features
vit_embeds = vit_embeds.to(dtype=torch.bfloat16)
vit_embeds = self._call_vision_model(pixel_values)
h = w = int(vit_embeds.shape[1] ** 0.5)
vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
......@@ -523,15 +564,16 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
image_embeds = self.extract_feature(image_input["pixel_values_flat"])
num_patches = image_input["num_patches"]
hidden_size = self.config.get_text_config().hidden_size
# Only one image in the current batch
if len(num_patches) == 1:
return (image_embeds.view(-1, self.config.text_config.hidden_size),)
return (image_embeds.view(-1, hidden_size),)
# NOTE: Image embeddings are split into separate tensors for each image
# by the size of each embedding.
feature_size = image_embeds.shape[1]
image_embeds = image_embeds.view(-1, self.config.text_config.hidden_size)
image_embeds = image_embeds.view(-1, hidden_size)
image_feature_sizes = [
num_patches * feature_size for num_patches in num_patches
]
......@@ -643,3 +685,201 @@ class LlamaNemotronVLChatModel(nn.Module, SupportsMultiModal, SupportsPP, Suppor
connector="mlp1",
tower_model="vision_model",
)
# --------------------------------------------------------
# LlamaNemotronVL Embedding Model (nvidia/llama-nemotron-embed-vl-1b-v2)
# Extends LlamaNemotronVLChatModel for embedding/pooling tasks:
# - SigLIP vision encoder (instead of C-RADIO)
# - Bidirectional (non-causal) LLaMA language model
# - Pooler output instead of generative logits
# --------------------------------------------------------
# Per-channel mean and std applied by the SigLIP normalization step
SIGLIP_MEAN = (0.5, 0.5, 0.5)
SIGLIP_STD = (0.5, 0.5, 0.5)


def build_siglip_transform(input_size: int):
    """Build the image transform for the SigLIP vision encoder.

    Runs the base transform from ``build_transform`` first, then normalizes
    with ``SIGLIP_MEAN`` / ``SIGLIP_STD``.
    """
    siglip_normalize = T.Normalize(mean=SIGLIP_MEAN, std=SIGLIP_STD)
    return T.Compose([build_transform(input_size=input_size), siglip_normalize])
class LlamaNemotronVLEmbedProcessor(NemotronVLProcessor):
    """
    Processor for the LlamaNemotronVL embedding model.

    Specializes NemotronVLProcessor for embedding tasks:
    - Applies the SigLIP transform (with normalization) instead of the base one
    - Uses <IMG_CONTEXT> rather than <image> as the image context token
    """

    IMG_CONTEXT = "<IMG_CONTEXT>"

    def __init__(
        self,
        config: PretrainedConfig,
        tokenizer: TokenizerLike,
        processor_config: dict,
        *,
        min_dynamic_patch: int | None = None,
        max_dynamic_patch: int | None = None,
        dynamic_image_size: bool | None = None,
    ) -> None:
        """Resolve tiling options and initialize the base processor.

        An explicitly passed keyword argument is used as-is. A ``None`` value
        is resolved from ``processor_config`` first (keys ``min_input_tiles``,
        ``max_input_tiles``, ``dynamic_image_size``), then from the attribute
        of the same name as the argument on ``config``, then from a literal
        default (1, 1 and True respectively). No image processor is passed to
        the base class.
        """

        def _lookup(key: str, attr: str, fallback):
            # processor_config takes priority over the HF config attribute.
            return processor_config.get(key, getattr(config, attr, fallback))

        if min_dynamic_patch is None:
            min_dynamic_patch = _lookup("min_input_tiles", "min_dynamic_patch", 1)
        if max_dynamic_patch is None:
            max_dynamic_patch = _lookup("max_input_tiles", "max_dynamic_patch", 1)
        if dynamic_image_size is None:
            dynamic_image_size = _lookup(
                "dynamic_image_size", "dynamic_image_size", True
            )

        super().__init__(
            config=config,
            tokenizer=tokenizer,
            image_processor=None,
            min_dynamic_patch=min_dynamic_patch,
            max_dynamic_patch=max_dynamic_patch,
            dynamic_image_size=dynamic_image_size,
        )

    def _get_transform(self) -> T.Compose:
        """Return the SigLIP transform (base transform plus normalization)."""
        return build_siglip_transform(input_size=self.image_size)

    def _replace_image_tokens(
        self,
        text: list[str],
        pixel_values_lst: list[torch.Tensor],
    ) -> list[str]:
        """Expand ``<image>`` placeholders, one per entry of ``pixel_values_lst``.

        The replacement is inserted directly, without a temporary marker:
        IMG_CONTEXT is <IMG_CONTEXT>, not <image>, so the inserted tokens
        cannot be mistaken for a remaining placeholder.
        """
        prompts = text
        for tiles in pixel_values_lst:
            tile_count = tiles.shape[0]
            repl = self.get_image_repl(tile_count * self.num_image_token, tile_count)
            prompts = [prompt.replace("<image>", repl.full, 1) for prompt in prompts]
        return prompts
class LlamaNemotronVLEmbedProcessingInfo(NemotronVLProcessingInfo):
    """Processing info for the LlamaNemotronVL embedding model."""

    def get_hf_processor(self, **kwargs: object) -> LlamaNemotronVLEmbedProcessor:
        """Create the embedding processor, which takes no image_processor.

        The contents of ``processor_config.json`` are loaded for the
        configured model and revision and passed to the processor; an empty
        dict is used when no model is set or the lookup returns a falsy value.
        """
        model_config = self.ctx.model_config

        if model_config.model is None:
            processor_config = {}
        else:
            loaded_config = get_hf_file_to_dict(
                "processor_config.json",
                model_config.model,
                model_config.revision,
            )
            processor_config = loaded_config or {}

        return self.ctx.init_processor(
            LlamaNemotronVLEmbedProcessor,
            config=self.get_hf_config(),
            tokenizer=self.get_tokenizer(),
            processor_config=processor_config,
            **kwargs,
        )
@MULTIMODAL_REGISTRY.register_processor(
    BaseInternVLMultiModalProcessor[LlamaNemotronVLEmbedProcessingInfo],
    info=LlamaNemotronVLEmbedProcessingInfo,
    dummy_inputs=BaseInternVLDummyInputsBuilder[LlamaNemotronVLEmbedProcessingInfo],
)
class LlamaNemotronVLForEmbedding(LlamaNemotronVLChatModel, VllmModelForPooling):
    """
    LlamaNemotronVL model for embeddings.

    Specializes LlamaNemotronVLChatModel for embedding tasks:
    - SigLIP vision encoder instead of C-RADIO
    - Bidirectional LLaMA (via llm_config) instead of causal LLaMA
    - A pooler producing embeddings instead of generative logits
    """

    is_pooling_model = True

    # Prefix rewrites from checkpoint weight names to vLLM module names.
    # Differs from the parent class because of the vision model structure.
    weight_mapper = WeightsMapper(
        orig_to_new_prefix={
            # Language model weights live under a nested ``model`` module
            "language_model.layers.": "language_model.model.layers.",
            "language_model.embed_tokens.": "language_model.model.embed_tokens.",
            "language_model.norm.": "language_model.model.norm.",
            # SiglipVisionModel has a nested ``vision_model`` module
            "vision_model.encoder.": "vision_model.vision_model.encoder.",
            "vision_model.embeddings.": "vision_model.vision_model.embeddings.",
            "vision_model.post_layernorm.": "vision_model.vision_model.post_layernorm.",
        }
    )

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
        """Build the parent model, then set the image token id and the pooler."""
        super().__init__(vllm_config=vllm_config, prefix=prefix)

        model_config = vllm_config.model_config

        # Take img_context_token_id from the HF config when it defines one
        # (the original note here says the parent sets it to None).
        self.img_context_token_id = getattr(
            model_config.hf_config, "img_context_token_id", None
        )

        # Pooler that produces the embedding output
        pooler_config = model_config.pooler_config
        assert pooler_config is not None
        self.pooler = DispatchPooler.for_embedding(pooler_config)

    def _init_vision_model(
        self,
        config: PretrainedConfig,
        quant_config,
        *,
        prefix: str,
    ) -> nn.Module:
        """Build a headless SigLIP vision model instead of C-RADIO."""
        vision_config = config.vision_config
        return SiglipVisionModel(
            vision_config,
            quant_config=quant_config,
            prefix=prefix,
            use_head=False,
        )

    def _init_mlp1(self, config: PretrainedConfig) -> nn.Module:
        """Build the projector with sizes taken from the vision and text configs."""
        vision_hidden_size = config.vision_config.hidden_size
        text_hidden_size = config.get_text_config().hidden_size
        return super()._init_mlp1(
            config,
            vit_hidden_size=vision_hidden_size,
            vision_projection_hidden_size=text_hidden_size,
        )

    def _call_vision_model(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """Call SigLIP positionally and return its output unchanged."""
        return self.vision_model(pixel_values)

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        """Load weights through ``weight_mapper``, the SigLIP-specific name mapping."""
        return AutoWeightsLoader(self).load_weights(weights, mapper=self.weight_mapper)
......@@ -260,6 +260,10 @@ _EMBEDDING_MODELS = {
"OpsColQwen3Model": ("colqwen3", "ColQwen3Model"),
"Qwen3VLNemotronEmbedModel": ("colqwen3", "ColQwen3Model"),
"SiglipModel": ("siglip", "SiglipEmbeddingModel"),
"LlamaNemotronVLModel": (
"nemotron_vl",
"LlamaNemotronVLForEmbedding",
),
# Technically Terratorch models work on images, both in
# input and output. I am adding it here because it piggy-backs on embedding
# models for the time being.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment