# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for ColModernVBERT multimodal late-interaction model.

ColModernVBERT combines SigLIP vision encoder + ModernBERT text encoder
with a pixel shuffle connector and ColBERT-style 128-dim per-token
embeddings for visual document retrieval.
"""

import pytest
import torch

from vllm.entrypoints.pooling.score.utils import compute_maxsim_score

# Model identifier passed to ``vllm_runner`` by every test in this module.
MODEL_NAME = "ModernVBERT/colmodernvbert-merged"
# Expected width of each per-token embedding row.
COLBERT_DIM = 128
# Value passed as ``dtype`` to ``vllm_runner``.
DTYPE = "half"

# FIXME: Update the colmodernvbert code to support the latest HF version,
#  then remove this pinned revision.
REVISION = "4a0a9f3ac7a7992fec410bfa8e3d080ac9a5bcee"


# -----------------------------------------------------------------------
# Text-only tests
# -----------------------------------------------------------------------


def test_colmodernvbert_text_token_embed(vllm_runner):
    """Text query produces per-token embeddings with shape (seq_len, 128).

    Embeds a single text prompt through ``token_embed`` and checks that the
    result is a 2-D tensor whose second dimension equals ``COLBERT_DIM`` and
    which has more than one row.
    """
    with vllm_runner(
        MODEL_NAME,
        # Pinned checkpoint revision; see the note next to REVISION.
        revision=REVISION,
        runner="pooling",
        dtype=DTYPE,
        enforce_eager=True,
    ) as vllm_model:
        outputs = vllm_model.token_embed(["What is machine learning?"])

        # One prompt in, one embedding matrix out.
        assert len(outputs) == 1
        emb = torch.tensor(outputs[0])
        assert emb.dim() == 2
        assert emb.shape[1] == COLBERT_DIM
        # More than one row: the output is not a single vector.
        assert emb.shape[0] > 1


def test_colmodernvbert_text_relevance_ordering(vllm_runner):
    """Relevant documents score higher than irrelevant ones.

    Scores one query against an on-topic and an off-topic document and
    asserts that the on-topic document receives the strictly higher score.
    """
    query = "What is machine learning?"
    documents = [
        "Machine learning is a subset of artificial intelligence.",
        "The weather in Paris is mild in spring.",
    ]

    with vllm_runner(
        MODEL_NAME,
        # Pinned checkpoint revision; see the note next to REVISION.
        revision=REVISION,
        runner="pooling",
        dtype=DTYPE,
        enforce_eager=True,
    ) as vllm_model:
        scores = vllm_model.score(query, documents)

        # One score per document, in the order the documents were given.
        assert len(scores) == 2
        assert scores[0] > scores[1], "ML doc should score higher than weather doc"


def test_colmodernvbert_text_late_interaction(vllm_runner):
    """MaxSim scoring via vLLM matches manual computation.

    Computes a reference score by embedding the query and the document
    separately with ``token_embed`` and passing both to
    ``compute_maxsim_score``, then checks that ``score`` agrees with it to
    within 1% relative tolerance.
    """
    query = "What is the capital of France?"
    doc = "The capital of France is Paris."

    with vllm_runner(
        MODEL_NAME,
        # Pinned checkpoint revision; see the note next to REVISION.
        revision=REVISION,
        runner="pooling",
        dtype=DTYPE,
        enforce_eager=True,
    ) as vllm_model:
        # Reference path: per-token embeddings scored by the helper directly.
        q_out = vllm_model.token_embed([query])
        d_out = vllm_model.token_embed([doc])

        q_emb = torch.tensor(q_out[0])
        d_emb = torch.tensor(d_out[0])
        manual_score = compute_maxsim_score(q_emb, d_emb).item()

        # Path under test: the runner's own scoring entry point.
        vllm_scores = vllm_model.score(query, doc)

        assert len(vllm_scores) == 1
        assert vllm_scores[0] == pytest.approx(manual_score, rel=0.01)


# -----------------------------------------------------------------------
# Image tests
# -----------------------------------------------------------------------


def test_colmodernvbert_image_token_embed(vllm_runner, image_assets):
    """Image input produces per-token embeddings including vision tokens.

    Encodes the first image asset with an empty text prompt using the
    ``token_embed`` pooling task and checks that the result is a 2-D tensor
    of width ``COLBERT_DIM`` with at least 64 rows.
    """
    with vllm_runner(
        MODEL_NAME,
        # Pinned checkpoint revision; see the note next to REVISION.
        revision=REVISION,
        runner="pooling",
        dtype=DTYPE,
        enforce_eager=True,
    ) as vllm_model:
        image = image_assets[0].pil_image
        # Empty text prompt paired with a single image.
        inputs = vllm_model.get_inputs(
            [""],
            images=[image],
        )
        req_outputs = vllm_model.llm.encode(
            inputs,
            pooling_task="token_embed",
        )
        outputs = [req_output.outputs.data for req_output in req_outputs]

        assert len(outputs) == 1
        emb = torch.tensor(outputs[0])
        assert emb.dim() == 2
        assert emb.shape[1] == COLBERT_DIM
        # Should have at least the image tokens (64 after pixel shuffle)
        assert emb.shape[0] >= 64