Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
89358f0d
Unverified
Commit
89358f0d
authored
Feb 20, 2026
by
Andreas Karatzas
Committed by
GitHub
Feb 20, 2026
Browse files
[CI] Fix ColBERT HF comparison tests on AMD CI + refactor (#34567)
Signed-off-by:
Andreas Karatzas
<
akaratza@amd.com
>
parent
a0fe7ea2
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
107 additions
and
149 deletions
+107
-149
tests/models/language/pooling/test_colbert.py
tests/models/language/pooling/test_colbert.py
+107
-149
No files found.
tests/models/language/pooling/test_colbert.py
View file @
89358f0d
...
@@ -20,6 +20,12 @@ COLBERT_MODELS = {
...
@@ -20,6 +20,12 @@ COLBERT_MODELS = {
"colbert_dim"
:
96
,
"colbert_dim"
:
96
,
"max_model_len"
:
512
,
"max_model_len"
:
512
,
"extra_kwargs"
:
{},
"extra_kwargs"
:
{},
"hf_comparison"
:
{
"weights_file"
:
"model.safetensors"
,
"weights_key"
:
"linear.weight"
,
"trust_remote_code"
:
False
,
"model_cls"
:
"BertModel"
,
},
},
},
"modernbert"
:
{
"modernbert"
:
{
"model"
:
"lightonai/GTE-ModernColBERT-v1"
,
"model"
:
"lightonai/GTE-ModernColBERT-v1"
,
...
@@ -30,6 +36,12 @@ COLBERT_MODELS = {
...
@@ -30,6 +36,12 @@ COLBERT_MODELS = {
"architectures"
:
[
"ColBERTModernBertModel"
],
"architectures"
:
[
"ColBERTModernBertModel"
],
},
},
},
},
"hf_comparison"
:
{
"weights_file"
:
"1_Dense/model.safetensors"
,
"weights_key"
:
"linear.weight"
,
"trust_remote_code"
:
False
,
"model_cls"
:
"AutoModel"
,
},
},
},
"jina"
:
{
"jina"
:
{
"model"
:
"jinaai/jina-colbert-v2"
,
"model"
:
"jinaai/jina-colbert-v2"
,
...
@@ -40,9 +52,16 @@ COLBERT_MODELS = {
...
@@ -40,9 +52,16 @@ COLBERT_MODELS = {
"architectures"
:
[
"ColBERTJinaRobertaModel"
],
"architectures"
:
[
"ColBERTJinaRobertaModel"
],
},
},
},
},
"hf_comparison"
:
{
"weights_file"
:
"model.safetensors"
,
"weights_key"
:
"linear.weight"
,
"trust_remote_code"
:
True
,
"model_cls"
:
"AutoModel"
,
},
},
},
}
}
TEXTS_1
=
[
TEXTS_1
=
[
"What is the capital of France?"
,
"What is the capital of France?"
,
"What is the capital of Germany?"
,
"What is the capital of Germany?"
,
...
@@ -56,9 +75,68 @@ TEXTS_2 = [
...
@@ -56,9 +75,68 @@ TEXTS_2 = [
DTYPE
=
"half"
DTYPE
=
"half"
# -----------------------------------------------------------------------
def
_load_hf_model
(
model_name
:
str
,
hf_spec
:
dict
,
device
:
torch
.
device
):
# Fixtures
"""Load HF model on the given device with a compatible attention impl."""
# -----------------------------------------------------------------------
from
transformers
import
AutoModel
,
BertModel
cls
=
BertModel
if
hf_spec
[
"model_cls"
]
==
"BertModel"
else
AutoModel
trust
=
hf_spec
.
get
(
"trust_remote_code"
,
False
)
# Flash / Triton kernels require GPU tensors; fall back to eager on CPU.
extra
=
{}
if
device
.
type
==
"cpu"
:
extra
[
"attn_implementation"
]
=
"eager"
model
=
cls
.
from_pretrained
(
model_name
,
trust_remote_code
=
trust
,
**
extra
,
).
to
(
device
)
model
.
eval
()
return
model
def
_load_projection_weight
(
model_name
:
str
,
hf_spec
:
dict
,
device
:
torch
.
device
):
"""Download and return the ColBERT linear projection weight."""
from
huggingface_hub
import
hf_hub_download
from
safetensors.torch
import
load_file
path
=
hf_hub_download
(
model_name
,
filename
=
hf_spec
[
"weights_file"
])
weights
=
load_file
(
path
)
return
weights
[
hf_spec
[
"weights_key"
]].
to
(
device
)
def
_compute_hf_colbert_embeddings
(
model
,
tokenizer
,
linear_weight
,
texts
,
device
):
"""Run HF model + projection and return L2-normalised token embeddings."""
import
torch.nn.functional
as
F
embeddings
=
[]
for
text
in
texts
:
inputs
=
tokenizer
(
text
,
return_tensors
=
"pt"
).
to
(
device
)
with
torch
.
no_grad
():
hidden
=
model
(
**
inputs
).
last_hidden_state
.
float
()
projected
=
F
.
linear
(
hidden
,
linear_weight
.
float
())
normalised
=
F
.
normalize
(
projected
,
p
=
2
,
dim
=-
1
)
embeddings
.
append
(
normalised
.
squeeze
(
0
).
cpu
())
return
embeddings
def
_assert_embeddings_close
(
vllm_outputs
,
hf_embeddings
):
"""Assert that vLLM and HuggingFace embeddings match."""
for
i
,
(
hf_emb
,
vllm_out
)
in
enumerate
(
zip
(
hf_embeddings
,
vllm_outputs
)):
vllm_emb
=
torch
.
as_tensor
(
vllm_out
).
float
()
assert
hf_emb
.
shape
==
vllm_emb
.
shape
,
(
f
"Shape mismatch for text
{
i
}
: HF
{
hf_emb
.
shape
}
vs vLLM
{
vllm_emb
.
shape
}
"
)
torch
.
testing
.
assert_close
(
vllm_emb
,
hf_emb
,
rtol
=
1e-2
,
atol
=
1e-2
,
msg
=
f
"Embedding mismatch for text
{
i
}
"
,
)
@
pytest
.
fixture
(
params
=
list
(
COLBERT_MODELS
.
keys
()),
scope
=
"module"
)
@
pytest
.
fixture
(
params
=
list
(
COLBERT_MODELS
.
keys
()),
scope
=
"module"
)
...
@@ -87,11 +165,6 @@ def colbert_extra_kwargs(colbert_spec):
...
@@ -87,11 +165,6 @@ def colbert_extra_kwargs(colbert_spec):
return
colbert_spec
[
"extra_kwargs"
]
return
colbert_spec
[
"extra_kwargs"
]
# -----------------------------------------------------------------------
# Tests
# -----------------------------------------------------------------------
def
test_colbert_token_embed
(
def
test_colbert_token_embed
(
vllm_runner
,
vllm_runner
,
colbert_model_name
,
colbert_model_name
,
...
@@ -111,7 +184,7 @@ def test_colbert_token_embed(
...
@@ -111,7 +184,7 @@ def test_colbert_token_embed(
outputs
=
vllm_model
.
token_embed
([
TEXTS_1
[
0
]])
outputs
=
vllm_model
.
token_embed
([
TEXTS_1
[
0
]])
assert
len
(
outputs
)
==
1
assert
len
(
outputs
)
==
1
emb
=
torch
.
tensor
(
outputs
[
0
])
emb
=
torch
.
as_
tensor
(
outputs
[
0
])
assert
emb
.
dim
()
==
2
assert
emb
.
dim
()
==
2
assert
emb
.
shape
[
1
]
==
colbert_dim
assert
emb
.
shape
[
1
]
==
colbert_dim
assert
emb
.
shape
[
0
]
>
1
assert
emb
.
shape
[
0
]
>
1
...
@@ -135,8 +208,8 @@ def test_colbert_late_interaction_1_to_1(
...
@@ -135,8 +208,8 @@ def test_colbert_late_interaction_1_to_1(
q_outputs
=
vllm_model
.
token_embed
([
TEXTS_1
[
0
]])
q_outputs
=
vllm_model
.
token_embed
([
TEXTS_1
[
0
]])
d_outputs
=
vllm_model
.
token_embed
([
TEXTS_2
[
0
]])
d_outputs
=
vllm_model
.
token_embed
([
TEXTS_2
[
0
]])
q_emb
=
torch
.
tensor
(
q_outputs
[
0
])
q_emb
=
torch
.
as_
tensor
(
q_outputs
[
0
])
d_emb
=
torch
.
tensor
(
d_outputs
[
0
])
d_emb
=
torch
.
as_
tensor
(
d_outputs
[
0
])
manual_score
=
compute_maxsim_score
(
q_emb
,
d_emb
).
item
()
manual_score
=
compute_maxsim_score
(
q_emb
,
d_emb
).
item
()
...
@@ -164,11 +237,11 @@ def test_colbert_late_interaction_1_to_N(
...
@@ -164,11 +237,11 @@ def test_colbert_late_interaction_1_to_N(
q_outputs
=
vllm_model
.
token_embed
([
TEXTS_1
[
0
]])
q_outputs
=
vllm_model
.
token_embed
([
TEXTS_1
[
0
]])
d_outputs
=
vllm_model
.
token_embed
(
TEXTS_2
)
d_outputs
=
vllm_model
.
token_embed
(
TEXTS_2
)
q_emb
=
torch
.
tensor
(
q_outputs
[
0
])
q_emb
=
torch
.
as_
tensor
(
q_outputs
[
0
])
manual_scores
=
[]
manual_scores
=
[]
for
d_out
in
d_outputs
:
for
d_out
in
d_outputs
:
d_emb
=
torch
.
tensor
(
d_out
)
d_emb
=
torch
.
as_
tensor
(
d_out
)
manual_scores
.
append
(
compute_maxsim_score
(
q_emb
,
d_emb
).
item
())
manual_scores
.
append
(
compute_maxsim_score
(
q_emb
,
d_emb
).
item
())
vllm_scores
=
vllm_model
.
score
(
TEXTS_1
[
0
],
TEXTS_2
)
vllm_scores
=
vllm_model
.
score
(
TEXTS_1
[
0
],
TEXTS_2
)
...
@@ -198,8 +271,8 @@ def test_colbert_late_interaction_N_to_N(
...
@@ -198,8 +271,8 @@ def test_colbert_late_interaction_N_to_N(
manual_scores
=
[]
manual_scores
=
[]
for
q_out
,
d_out
in
zip
(
q_outputs
,
d_outputs
):
for
q_out
,
d_out
in
zip
(
q_outputs
,
d_outputs
):
q_emb
=
torch
.
tensor
(
q_out
)
q_emb
=
torch
.
as_
tensor
(
q_out
)
d_emb
=
torch
.
tensor
(
d_out
)
d_emb
=
torch
.
as_
tensor
(
d_out
)
manual_scores
.
append
(
compute_maxsim_score
(
q_emb
,
d_emb
).
item
())
manual_scores
.
append
(
compute_maxsim_score
(
q_emb
,
d_emb
).
item
())
vllm_scores
=
vllm_model
.
score
(
TEXTS_1
,
TEXTS_2
)
vllm_scores
=
vllm_model
.
score
(
TEXTS_1
,
TEXTS_2
)
...
@@ -259,79 +332,16 @@ def test_colbert_embed_not_supported(
...
@@ -259,79 +332,16 @@ def test_colbert_embed_not_supported(
vllm_model
.
embed
([
TEXTS_1
[
0
]])
vllm_model
.
embed
([
TEXTS_1
[
0
]])
# -----------------------------------------------------------------------
@
pytest
.
mark
.
parametrize
(
"backend"
,
list
(
COLBERT_MODELS
.
keys
()))
# Per-model HuggingFace comparison tests
def
test_colbert_hf_comparison
(
vllm_runner
,
backend
):
# -----------------------------------------------------------------------
"""Test that vLLM ColBERT embeddings match HuggingFace for each backend."""
from
transformers
import
AutoTokenizer
def
_assert_embeddings_close
(
vllm_outputs
,
hf_embeddings
):
"""Assert that vLLM and HuggingFace embeddings match."""
for
i
,
(
hf_emb
,
vllm_out
)
in
enumerate
(
zip
(
hf_embeddings
,
vllm_outputs
)):
vllm_emb
=
torch
.
tensor
(
vllm_out
).
float
()
assert
hf_emb
.
shape
==
vllm_emb
.
shape
,
(
f
"Shape mismatch for text
{
i
}
: HF
{
hf_emb
.
shape
}
vs vLLM
{
vllm_emb
.
shape
}
"
)
torch
.
testing
.
assert_close
(
spec
=
COLBERT_MODELS
[
backend
]
vllm_emb
,
hf_spec
=
spec
[
"hf_comparison"
]
hf_emb
,
rtol
=
1e-2
,
atol
=
1e-2
,
msg
=
f
"Embedding mismatch for text
{
i
}
"
,
)
def
test_colbert_hf_comparison_bert
(
vllm_runner
):
"""Test that vLLM ColBERT produces same embeddings as HuggingFace (BERT)."""
import
torch.nn.functional
as
F
from
huggingface_hub
import
hf_hub_download
from
safetensors.torch
import
load_file
from
transformers
import
AutoTokenizer
,
BertModel
model_name
=
COLBERT_MODELS
[
"bert"
][
"model"
]
test_texts
=
[
TEXTS_1
[
0
],
TEXTS_2
[
0
]]
with
vllm_runner
(
model_name
,
runner
=
"pooling"
,
dtype
=
"float32"
,
max_model_len
=
512
,
enforce_eager
=
True
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
token_embed
(
test_texts
)
hf_tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
hf_bert
=
BertModel
.
from_pretrained
(
model_name
)
hf_bert
.
eval
()
weights_path
=
hf_hub_download
(
model_name
,
filename
=
"model.safetensors"
)
weights
=
load_file
(
weights_path
)
linear_weight
=
weights
[
"linear.weight"
]
# [96, 384]
hf_embeddings
=
[]
for
text
in
test_texts
:
inputs
=
hf_tokenizer
(
text
,
return_tensors
=
"pt"
)
with
torch
.
no_grad
():
outputs
=
hf_bert
(
**
inputs
)
hidden_states
=
outputs
.
last_hidden_state
token_emb
=
F
.
linear
(
hidden_states
,
linear_weight
)
token_emb
=
F
.
normalize
(
token_emb
,
p
=
2
,
dim
=-
1
)
hf_embeddings
.
append
(
token_emb
.
squeeze
(
0
).
float
())
_assert_embeddings_close
(
vllm_outputs
,
hf_embeddings
)
def
test_colbert_hf_comparison_modernbert
(
vllm_runner
):
"""Test that vLLM ColBERT produces same embeddings as HuggingFace
(ModernBERT)."""
import
torch.nn.functional
as
F
from
huggingface_hub
import
hf_hub_download
from
safetensors.torch
import
load_file
from
transformers
import
AutoModel
,
AutoTokenizer
spec
=
COLBERT_MODELS
[
"modernbert"
]
model_name
=
spec
[
"model"
]
model_name
=
spec
[
"model"
]
assert
isinstance
(
model_name
,
str
)
assert
isinstance
(
hf_spec
,
dict
)
test_texts
=
[
TEXTS_1
[
0
],
TEXTS_2
[
0
]]
test_texts
=
[
TEXTS_1
[
0
],
TEXTS_2
[
0
]]
with
vllm_runner
(
with
vllm_runner
(
...
@@ -344,73 +354,21 @@ def test_colbert_hf_comparison_modernbert(vllm_runner):
...
@@ -344,73 +354,21 @@ def test_colbert_hf_comparison_modernbert(vllm_runner):
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
token_embed
(
test_texts
)
vllm_outputs
=
vllm_model
.
token_embed
(
test_texts
)
hf_tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
)
device
=
torch
.
device
(
"cuda"
if
torch
.
cuda
.
is_available
()
else
"cpu"
)
hf_model
=
AutoModel
.
from_pretrained
(
model_name
)
hf_model
.
eval
()
# Load projection from sentence-transformers 1_Dense layer
dense_path
=
hf_hub_download
(
model_name
,
filename
=
"1_Dense/model.safetensors"
)
dense_weights
=
load_file
(
dense_path
)
linear_weight
=
dense_weights
[
"linear.weight"
]
# [128, 768]
hf_embeddings
=
[]
for
text
in
test_texts
:
inputs
=
hf_tokenizer
(
text
,
return_tensors
=
"pt"
)
with
torch
.
no_grad
():
outputs
=
hf_model
(
**
inputs
)
hidden_states
=
outputs
.
last_hidden_state
token_emb
=
F
.
linear
(
hidden_states
,
linear_weight
)
token_emb
=
F
.
normalize
(
token_emb
,
p
=
2
,
dim
=-
1
)
hf_embeddings
.
append
(
token_emb
.
squeeze
(
0
).
float
())
_assert_embeddings_close
(
vllm_outputs
,
hf_embeddings
)
def
test_colbert_hf_comparison_jina
(
vllm_runner
):
"""Test that vLLM ColBERT produces same embeddings as HuggingFace
(Jina XLM-RoBERTa)."""
import
torch.nn.functional
as
F
from
huggingface_hub
import
hf_hub_download
from
safetensors.torch
import
load_file
from
transformers
import
AutoModel
,
AutoTokenizer
spec
=
COLBERT_MODELS
[
"jina"
]
model_name
=
spec
[
"model"
]
test_texts
=
[
TEXTS_1
[
0
],
TEXTS_2
[
0
]]
with
vllm_runner
(
model_name
,
runner
=
"pooling"
,
dtype
=
"float32"
,
max_model_len
=
spec
[
"max_model_len"
],
enforce_eager
=
True
,
**
spec
[
"extra_kwargs"
],
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
token_embed
(
test_texts
)
hf_tokenizer
=
AutoTokenizer
.
from_pretrained
(
hf_tokenizer
=
AutoTokenizer
.
from_pretrained
(
model_name
,
model_name
,
trust_remote_code
=
True
,
trust_remote_code
=
hf_spec
.
get
(
"trust_remote_code"
,
False
)
,
)
)
hf_model
=
AutoModel
.
from_pretrained
(
hf_model
=
_load_hf_model
(
model_name
,
hf_spec
,
device
)
model_name
,
linear_weight
=
_load_projection_weight
(
model_name
,
hf_spec
,
device
)
trust_remote_code
=
True
,
hf_embeddings
=
_compute_hf_colbert_embeddings
(
hf_model
,
hf_tokenizer
,
linear_weight
,
test_texts
,
device
,
)
)
hf_model
.
eval
()
# Load projection from main checkpoint
weights_path
=
hf_hub_download
(
model_name
,
filename
=
"model.safetensors"
)
weights
=
load_file
(
weights_path
)
linear_weight
=
weights
[
"linear.weight"
]
# [128, 1024]
hf_embeddings
=
[]
for
text
in
test_texts
:
inputs
=
hf_tokenizer
(
text
,
return_tensors
=
"pt"
)
with
torch
.
no_grad
():
outputs
=
hf_model
(
**
inputs
)
hidden_states
=
outputs
.
last_hidden_state
token_emb
=
F
.
linear
(
hidden_states
.
float
(),
linear_weight
.
float
())
token_emb
=
F
.
normalize
(
token_emb
,
p
=
2
,
dim
=-
1
)
hf_embeddings
.
append
(
token_emb
.
squeeze
(
0
).
float
())
_assert_embeddings_close
(
vllm_outputs
,
hf_embeddings
)
_assert_embeddings_close
(
vllm_outputs
,
hf_embeddings
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment