Commit 7e63ef82 authored by zhuwenwen
Browse files

Merge tag 'v0.14.0' into v0.14.0-dev

parents 8cbcac5d b17039bc
......@@ -4,90 +4,93 @@ import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
from .mteb_embed_utils import mteb_test_embed_models
from .mteb_score_utils import mteb_test_rerank_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-base-en",
architecture="BertModel",
mteb_score=0.779336792,
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-base-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-en", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-small-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-en", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"BAAI/bge-large-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo("BAAI/bge-base-zh", architecture="BertModel", enable_test=False),
EmbedModelInfo("BAAI/bge-small-en", architecture="BertModel", enable_test=False),
EmbedModelInfo("BAAI/bge-small-zh", architecture="BertModel", enable_test=False),
EmbedModelInfo("BAAI/bge-large-en", architecture="BertModel", enable_test=False),
EmbedModelInfo("BAAI/bge-large-zh", architecture="BertModel", enable_test=False),
EmbedModelInfo(
"BAAI/bge-large-zh-noinstruct", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-base-en-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-base-zh-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-small-en-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-small-zh-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-large-en-v1.5", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-large-zh-v1.5", architecture="BertModel", enable_test=False
),
########## XLMRobertaModel
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-m3",
architecture="XLMRobertaModel",
mteb_score=0.787343078,
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
########## Qwen2Model
LASTPoolingEmbedModelInfo(
EmbedModelInfo(
"BAAI/bge-code-v1",
architecture="Qwen2Model",
mteb_score=0.75724465,
dtype="float32",
seq_pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True,
),
]
RERANK_MODELS = [
########## XLMRobertaForSequenceClassification
CLSPoolingRerankModelInfo(
RerankModelInfo(
"BAAI/bge-reranker-base",
architecture="XLMRobertaForSequenceClassification",
mteb_score=0.32398,
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingRerankModelInfo(
RerankModelInfo(
"BAAI/bge-reranker-large",
architecture="XLMRobertaForSequenceClassification",
enable_test=False,
),
CLSPoolingRerankModelInfo(
RerankModelInfo(
"BAAI/bge-reranker-v2-m3",
architecture="XLMRobertaForSequenceClassification",
enable_test=False,
......@@ -108,7 +111,5 @@ def test_embed_models_correctness(
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(
hf_runner, vllm_runner, model_info: RerankModelInfo
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(vllm_runner, model_info)
......@@ -9,40 +9,62 @@ import torch
from torch.utils.data import DataLoader
from tests.conftest import HfRunner
from tests.models.language.pooling_mteb_test.mteb_utils import (
VllmMtebCrossEncoder,
from tests.models.utils import RerankModelInfo
from .mteb_score_utils import (
MtebCrossEncoderMixin,
mteb_test_rerank_models,
)
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
RERANK_MODELS = [
LASTPoolingRerankModelInfo(
RerankModelInfo(
"BAAI/bge-reranker-v2-gemma",
architecture="GemmaForSequenceClassification",
mteb_score=0.33757,
hf_overrides={
"architectures": ["GemmaForSequenceClassification"],
"classifier_from_token": ["Yes"],
"method": "no_post_processing",
},
mteb_score=0.33757,
seq_pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
chat_template_name="bge-reranker-v2-gemma.jinja",
),
]
PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." # noqa: E501
class GemmaRerankerHfRunner(HfRunner):
class GemmaRerankerHfRunner(MtebCrossEncoderMixin, HfRunner):
def __init__(
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
) -> None:
from transformers import AutoModelForCausalLM, AutoTokenizer
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
HfRunner.__init__(
self,
model_name=model_name,
auto_cls=AutoModelForCausalLM,
dtype=dtype,
**kwargs,
)
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
self.yes_loc = self.tokenizer.convert_tokens_to_ids("Yes")
@torch.no_grad()
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
@torch.no_grad
def predict(
self,
inputs1: DataLoader[mteb.types.BatchedInput],
inputs2: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
queries = [text for batch in inputs1 for text in batch["text"]]
corpus = [text for batch in inputs2 for text in batch["text"]]
def get_inputs(pairs, tokenizer, prompt=None):
if prompt is None:
prompt = PROMPT
......@@ -87,8 +109,8 @@ class GemmaRerankerHfRunner(HfRunner):
)
scores = []
for query, doc, *_ in prompts:
pairs = [(query, doc)]
for query, document in zip(queries, corpus):
pairs = [(query, document)]
inputs = get_inputs(pairs, self.tokenizer)
inputs = inputs.to(self.model.device)
_n_tokens = inputs["input_ids"].shape[1]
......@@ -105,41 +127,10 @@ class GemmaRerankerHfRunner(HfRunner):
return torch.Tensor(scores)
class GemmaMtebEncoder(VllmMtebCrossEncoder):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.query_template = "A: {query}\n"
self.document_template = "B: {doc}\n{prompt}"
def predict(
self,
inputs1: DataLoader[mteb.types.BatchedInput],
inputs2: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
queries = [
self.query_template.format(query=text)
for batch in inputs1
for text in batch["text"]
]
corpus = [
self.document_template.format(doc=text, prompt=PROMPT)
for batch in inputs2
for text in batch["text"]
]
outputs = self.llm.score(
queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False
)
scores = np.array(outputs)
return scores
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(
GemmaRerankerHfRunner,
vllm_runner,
model_info,
vllm_mteb_encoder=GemmaMtebEncoder,
hf_runner=GemmaRerankerHfRunner,
)
......@@ -3,29 +3,34 @@
import pytest
from tests.models.utils import (
CLSPoolingRerankModelInfo,
LASTPoolingRerankModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_rerank_models
from .mteb_score_utils import mteb_test_rerank_models
RERANK_MODELS = [
CLSPoolingRerankModelInfo(
RerankModelInfo(
"cross-encoder/ms-marco-TinyBERT-L-2-v2",
mteb_score=0.32898,
architecture="BertForSequenceClassification",
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
mteb_score=0.32898,
),
LASTPoolingRerankModelInfo(
RerankModelInfo(
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls",
mteb_score=0.25736,
architecture="Qwen3ForSequenceClassification",
seq_pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
chat_template_name="qwen3_reranker.jinja",
mteb_score=0.33459,
),
]
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(
hf_runner, vllm_runner, model_info: RerankModelInfo
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(vllm_runner, model_info)
......@@ -5,36 +5,32 @@ import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
RerankModelInfo,
)
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
from .mteb_embed_utils import mteb_test_embed_models
from .mteb_score_utils import mteb_test_rerank_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"thenlper/gte-large",
mteb_score=0.76807651,
architecture="BertModel",
seq_pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-base", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-small", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo("thenlper/gte-base", architecture="BertModel", enable_test=False),
EmbedModelInfo("thenlper/gte-small", architecture="BertModel", enable_test=False),
EmbedModelInfo(
"thenlper/gte-large-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"thenlper/gte-base-zh", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo("thenlper/gte-base-zh", architecture="BertModel", enable_test=False),
EmbedModelInfo(
"thenlper/gte-small-zh", architecture="BertModel", enable_test=False
),
########### NewModel
......@@ -43,68 +39,90 @@ MODELS = [
# - whether to use token_type_embeddings
# - whether to use context expansion
# So only test one (the most widely used) model
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Alibaba-NLP/gte-multilingual-base",
architecture="GteNewModel",
mteb_score=0.775074696,
hf_overrides={"architectures": ["GteNewModel"]},
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Alibaba-NLP/gte-base-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Alibaba-NLP/gte-large-en-v1.5",
architecture="GteNewModel",
hf_overrides={"architectures": ["GteNewModel"]},
enable_test=False,
),
########### Qwen2ForCausalLM
LASTPoolingEmbedModelInfo(
EmbedModelInfo(
"Alibaba-NLP/gte-Qwen2-1.5B-instruct",
mteb_score=0.758473459018872,
architecture="Qwen2ForCausalLM",
seq_pooling_type="LAST",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
########## ModernBertModel
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Alibaba-NLP/gte-modernbert-base",
mteb_score=0.748193353,
architecture="ModernBertModel",
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
########## Qwen3ForCausalLM
LASTPoolingEmbedModelInfo(
EmbedModelInfo(
"Qwen/Qwen3-Embedding-0.6B",
mteb_score=0.771163695,
architecture="Qwen3ForCausalLM",
dtype="float32",
seq_pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
enable_test=True,
),
LASTPoolingEmbedModelInfo(
EmbedModelInfo(
"Qwen/Qwen3-Embedding-4B",
architecture="Qwen3ForCausalLM",
dtype="float32",
enable_test=False,
),
]
RERANK_MODELS = [
CLSPoolingRerankModelInfo(
RerankModelInfo(
# classifier_pooling: mean
"Alibaba-NLP/gte-reranker-modernbert-base",
mteb_score=0.33386,
architecture="ModernBertForSequenceClassification",
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingRerankModelInfo(
RerankModelInfo(
"Alibaba-NLP/gte-multilingual-reranker-base",
mteb_score=0.33062,
architecture="GteNewForSequenceClassification",
hf_overrides={"architectures": ["GteNewForSequenceClassification"]},
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
]
......@@ -123,7 +141,5 @@ def test_embed_models_correctness(
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(
hf_runner, vllm_runner, model_info: RerankModelInfo
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(vllm_runner, model_info)
......@@ -3,40 +3,44 @@
import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from tests.models.utils import EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
from .mteb_embed_utils import mteb_test_embed_models
MODELS = [
########## BertModel
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"intfloat/e5-small",
architecture="BertModel",
mteb_score=0.742285423,
seq_pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
"intfloat/e5-base", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
"intfloat/e5-large", architecture="BertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo("intfloat/e5-base", architecture="BertModel", enable_test=False),
EmbedModelInfo("intfloat/e5-large", architecture="BertModel", enable_test=False),
EmbedModelInfo(
"intfloat/multilingual-e5-small", architecture="BertModel", enable_test=False
),
########## XLMRobertaModel
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"intfloat/multilingual-e5-base",
architecture="XLMRobertaModel",
mteb_score=0.779325955,
seq_pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"intfloat/multilingual-e5-large",
architecture="XLMRobertaModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"intfloat/multilingual-e5-large-instruct",
architecture="XLMRobertaModel",
enable_test=False,
......
......@@ -10,30 +10,36 @@ from tests.models.language.pooling.embed_utils import (
matryoshka_fy,
)
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
CLSPoolingRerankModelInfo,
EmbedModelInfo,
RerankModelInfo,
)
from vllm import PoolingParams
from .mteb_utils import mteb_test_embed_models, mteb_test_rerank_models
from .mteb_embed_utils import mteb_test_embed_models
from .mteb_score_utils import mteb_test_rerank_models
EMBEDDING_MODELS = [
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"jinaai/jina-embeddings-v3",
mteb_score=0.824413164,
architecture="XLMRobertaModel",
is_matryoshka=True,
dtype="float32",
seq_pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
)
]
RERANK_MODELS = [
CLSPoolingRerankModelInfo(
RerankModelInfo(
"jinaai/jina-reranker-v2-base-multilingual",
mteb_score=0.33643,
architecture="XLMRobertaForSequenceClassification",
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
)
]
......@@ -65,10 +71,8 @@ def test_embed_models_correctness(
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(
hf_runner, vllm_runner, model_info: RerankModelInfo
) -> None:
mteb_test_rerank_models(hf_runner, vllm_runner, model_info)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(vllm_runner, model_info)
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
......
......@@ -2,13 +2,16 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any
import mteb
import numpy as np
import pytest
import torch
from torch.utils.data import DataLoader
from tests.conftest import HfRunner
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
from tests.models.utils import RerankModelInfo
from .mteb_utils import mteb_test_rerank_models
from .mteb_score_utils import MtebCrossEncoderMixin, mteb_test_rerank_models
mxbai_rerank_hf_overrides = {
"architectures": ["Qwen2ForSequenceClassification"],
......@@ -17,50 +20,73 @@ mxbai_rerank_hf_overrides = {
}
RERANK_MODELS = [
LASTPoolingRerankModelInfo(
RerankModelInfo(
"mixedbread-ai/mxbai-rerank-base-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
mteb_score=0.273,
seq_pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
chat_template_name="mxbai_rerank_v2.jinja",
mteb_score=0.33651,
enable_test=True,
),
LASTPoolingRerankModelInfo(
RerankModelInfo(
"mixedbread-ai/mxbai-rerank-large-v2",
architecture="Qwen2ForSequenceClassification",
hf_overrides=mxbai_rerank_hf_overrides,
chat_template_name="mxbai_rerank_v2.jinja",
enable_test=False,
),
]
class MxbaiRerankerHfRunner(HfRunner):
class MxbaiRerankerHfRunner(MtebCrossEncoderMixin, HfRunner):
def __init__(
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
) -> None:
from transformers import AutoModelForCausalLM, AutoTokenizer
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
HfRunner.__init__(
self,
model_name=model_name,
auto_cls=AutoModelForCausalLM,
dtype=dtype,
**kwargs,
)
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
self.yes_loc = self.tokenizer.convert_tokens_to_ids("1")
self.no_loc = self.tokenizer.convert_tokens_to_ids("0")
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
def process_inputs(pairs):
inputs = self.tokenizer(
pairs,
padding=False,
truncation="longest_first",
return_attention_mask=False,
@torch.no_grad
def predict(
self,
inputs1: DataLoader[mteb.types.BatchedInput],
inputs2: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
queries = [text for batch in inputs1 for text in batch["text"]]
corpus = [text for batch in inputs2 for text in batch["text"]]
tokenizer = self.tokenizer
prompts = []
for query, document in zip(queries, corpus):
conversation = [
{"role": "query", "content": query},
{"role": "document", "content": document},
]
prompt = tokenizer.apply_chat_template(
conversation=conversation,
tools=None,
chat_template=self.chat_template,
tokenize=False,
)
for i, ele in enumerate(inputs["input_ids"]):
inputs["input_ids"][i] = ele
inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt")
for key in inputs:
inputs[key] = inputs[key].to(self.model.device)
return inputs
@torch.no_grad()
prompts.append(prompt)
def compute_logits(inputs):
logits = self.model(**inputs).logits[:, -1, :]
yes_logits = logits[:, self.yes_loc]
......@@ -70,9 +96,9 @@ class MxbaiRerankerHfRunner(HfRunner):
return scores
scores = []
for query, doc, *_ in prompts:
pairs = [(query, doc)]
inputs = process_inputs(pairs)
for prompt in prompts:
inputs = tokenizer([prompt], return_tensors="pt")
inputs = self.wrap_device(inputs)
score = compute_logits(inputs)
scores.append(score[0].item())
return torch.Tensor(scores)
......@@ -80,4 +106,4 @@ class MxbaiRerankerHfRunner(HfRunner):
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info)
mteb_test_rerank_models(vllm_runner, model_info, hf_runner=MxbaiRerankerHfRunner)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from tests.models.language.pooling_mteb_test.mteb_embed_utils import (
mteb_test_embed_models,
)
from tests.models.language.pooling_mteb_test.mteb_score_utils import (
mteb_test_rerank_models,
)
from tests.models.utils import (
EmbedModelInfo,
RerankModelInfo,
)
# Embedding checkpoints that parametrize test_embed_models_mteb.
EMBEDDING_MODELS = [
    EmbedModelInfo(
        "nvidia/llama-nemotron-embed-1b-v2",
        architecture="LlamaBidirectionalModel",
        mteb_score=0.689164662128673,
        seq_pooling_type="MEAN",
        attn_type="encoder_only",
        is_prefix_caching_supported=False,
        is_chunked_prefill_supported=False,
    ),
]
# Reranker checkpoints that parametrize test_rerank_models_mteb.
RERANK_MODELS = [
    RerankModelInfo(
        "nvidia/llama-nemotron-rerank-1b-v2",
        architecture="LlamaBidirectionalForSequenceClassification",
        chat_template_name="nemotron-rerank.jinja",
        mteb_score=0.33994,
        seq_pooling_type="MEAN",
        attn_type="encoder_only",
        is_prefix_caching_supported=False,
        is_chunked_prefill_supported=False,
    ),
]
@pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) -> None:
    """Run the MTEB embedding check once per entry in EMBEDDING_MODELS.

    All work is delegated to ``mteb_test_embed_models``, which receives the
    two runner arguments and the parametrized ``model_info`` unchanged.
    """
    mteb_test_embed_models(hf_runner, vllm_runner, model_info)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
    """Run the MTEB rerank check once per entry in RERANK_MODELS.

    All work is delegated to ``mteb_test_rerank_models``, which receives the
    vLLM runner argument and the parametrized ``model_info`` unchanged.
    """
    mteb_test_rerank_models(vllm_runner, model_info)
......@@ -4,30 +4,38 @@
import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from tests.models.utils import EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
from .mteb_embed_utils import mteb_test_embed_models
MODELS = [
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"nomic-ai/nomic-embed-text-v1",
architecture="NomicBertModel",
mteb_score=0.737568559,
enable_test=True,
seq_pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"nomic-ai/nomic-embed-text-v1.5",
architecture="NomicBertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"nomic-ai/CodeRankEmbed", architecture="NomicBertModel", enable_test=False
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"nomic-ai/nomic-embed-text-v2-moe",
architecture="NomicBertModel",
mteb_score=0.715488912,
enable_test=True,
seq_pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
),
]
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
from typing import Any
import mteb
import numpy as np
import pytest
import torch
from torch.utils.data import DataLoader
from tests.conftest import HfRunner
from tests.models.utils import LASTPoolingRerankModelInfo, RerankModelInfo
from tests.models.utils import RerankModelInfo
from tests.utils import multi_gpu_test
from .mteb_utils import mteb_test_rerank_models
from .mteb_score_utils import MtebCrossEncoderMixin, mteb_test_rerank_models
qwen3_reranker_hf_overrides = {
"architectures": ["Qwen3ForSequenceClassification"],
......@@ -18,50 +22,74 @@ qwen3_reranker_hf_overrides = {
}
RERANK_MODELS = [
LASTPoolingRerankModelInfo(
RerankModelInfo(
"Qwen/Qwen3-Reranker-0.6B",
architecture="Qwen3ForSequenceClassification",
mteb_score=0.25736,
hf_overrides=qwen3_reranker_hf_overrides,
chat_template_name="qwen3_reranker.jinja",
seq_pooling_type="LAST",
attn_type="decoder",
is_prefix_caching_supported=True,
is_chunked_prefill_supported=True,
mteb_score=0.33459,
enable_test=True,
),
LASTPoolingRerankModelInfo(
RerankModelInfo(
"Qwen/Qwen3-Reranker-4B",
architecture="Qwen3ForSequenceClassification",
chat_template_name="qwen3_reranker.jinja",
hf_overrides=qwen3_reranker_hf_overrides,
enable_test=False,
),
]
class Qwen3RerankerHfRunner(HfRunner):
class Qwen3RerankerHfRunner(MtebCrossEncoderMixin, HfRunner):
def __init__(
self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any
) -> None:
from transformers import AutoModelForCausalLM, AutoTokenizer
super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM)
HfRunner.__init__(
self,
model_name=model_name,
auto_cls=AutoModelForCausalLM,
dtype=dtype,
**kwargs,
)
self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")
self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor:
def process_inputs(pairs):
inputs = self.tokenizer(
pairs,
padding=False,
truncation="longest_first",
return_attention_mask=False,
self.max_length = 40960
@torch.no_grad
def predict(
self,
inputs1: DataLoader[mteb.types.BatchedInput],
inputs2: DataLoader[mteb.types.BatchedInput],
*args,
**kwargs,
) -> np.ndarray:
queries = [text for batch in inputs1 for text in batch["text"]]
corpus = [text for batch in inputs2 for text in batch["text"]]
tokenizer = self.tokenizer
prompts = []
for query, document in zip(queries, corpus):
conversation = [
{"role": "query", "content": query},
{"role": "document", "content": document},
]
prompt = tokenizer.apply_chat_template(
conversation=conversation,
tools=None,
chat_template=self.chat_template,
tokenize=False,
)
for i, ele in enumerate(inputs["input_ids"]):
inputs["input_ids"][i] = ele
inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt")
for key in inputs:
inputs[key] = inputs[key].to(self.model.device)
return inputs
@torch.no_grad()
prompts.append(prompt)
def compute_logits(inputs):
batch_scores = self.model(**inputs).logits[:, -1, :]
true_vector = batch_scores[:, self.token_true_id]
......@@ -72,9 +100,9 @@ class Qwen3RerankerHfRunner(HfRunner):
return scores
scores = []
for query, doc, *_ in prompts:
pairs = [(query, doc)]
inputs = process_inputs(pairs)
for prompt in prompts:
inputs = tokenizer([prompt], return_tensors="pt")
inputs = self.wrap_device(inputs)
score = compute_logits(inputs)
scores.append(score[0].item())
return torch.Tensor(scores)
......@@ -82,7 +110,7 @@ class Qwen3RerankerHfRunner(HfRunner):
@pytest.mark.parametrize("model_info", RERANK_MODELS)
def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info)
mteb_test_rerank_models(vllm_runner, model_info, hf_runner=Qwen3RerankerHfRunner)
@pytest.mark.parametrize("model_info", RERANK_MODELS)
......@@ -95,5 +123,8 @@ def test_rerank_models_mteb_tp(vllm_runner, model_info: RerankModelInfo) -> None
}
mteb_test_rerank_models(
Qwen3RerankerHfRunner, vllm_runner, model_info, vllm_extra_kwargs
vllm_runner,
model_info,
vllm_extra_kwargs=vllm_extra_kwargs,
hf_runner=Qwen3RerankerHfRunner,
)
......@@ -4,62 +4,82 @@
import pytest
from tests.models.language.pooling.embed_utils import correctness_test_embed_models
from tests.models.utils import CLSPoolingEmbedModelInfo, EmbedModelInfo
from tests.models.utils import EmbedModelInfo
from .mteb_utils import mteb_test_embed_models
from .mteb_embed_utils import mteb_test_embed_models
MODELS = [
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-xs",
is_matryoshka=False,
architecture="BertModel",
mteb_score=0.714927797,
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-s",
is_matryoshka=False,
architecture="BertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m",
is_matryoshka=False,
architecture="BertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-long",
is_matryoshka=False,
architecture="NomicBertModel",
mteb_score=0.681146831,
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-l",
is_matryoshka=False,
architecture="BertModel",
enable_test=False,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-v1.5",
is_matryoshka=True,
architecture="BertModel",
mteb_score=0.649088363,
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-l-v2.0",
is_matryoshka=True,
architecture="XLMRobertaModel",
mteb_score=0.712258299,
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"Snowflake/snowflake-arctic-embed-m-v2.0",
is_matryoshka=True,
architecture="GteModel",
mteb_score=0.706622444,
seq_pooling_type="CLS",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
]
......
......@@ -3,27 +3,32 @@
import pytest
from tests.models.utils import (
CLSPoolingEmbedModelInfo,
EmbedModelInfo,
LASTPoolingEmbedModelInfo,
)
from .mteb_utils import mteb_test_embed_models
from .mteb_embed_utils import mteb_test_embed_models
# ST models with projector (Dense) layers
ST_PROJECTOR_MODELS = [
CLSPoolingEmbedModelInfo(
EmbedModelInfo(
"TencentBAC/Conan-embedding-v1",
architecture="BertModel",
mteb_score=0.688611955,
seq_pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
),
LASTPoolingEmbedModelInfo(
EmbedModelInfo(
"google/embeddinggemma-300m",
architecture="Gemma3TextModel",
mteb_score=0.7473819294684156,
seq_pooling_type="MEAN",
attn_type="encoder_only",
is_prefix_caching_supported=False,
is_chunked_prefill_supported=False,
enable_test=True,
dtype="float32",
),
]
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM tests."""
"""Pytest configuration for vLLM multimodal tests."""
import warnings
......@@ -9,20 +9,17 @@ import torch
from vllm.platforms import current_platform
def pytest_configure(config):
"""Disable Flash/MemEfficient SDP on ROCm to avoid HF
Transformers accuracy issues.
"""
def pytest_collection_modifyitems(config, items):
"""Configure ROCm-specific settings based on collected tests."""
if not current_platform.is_rocm():
return
skip_patterns = ["test_granite_speech.py"]
if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns):
# Skip disabling SDP for Granite Speech tests on ROCm
return
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
# accuracy issues
# accuracy issues: https://github.com/vllm-project/vllm/issues/30167
# TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
......
......@@ -123,10 +123,6 @@ VLM_TEST_SETTINGS = {
),
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
dtype="bfloat16",
marks=[
pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
],
),
"qwen2_5_vl": VLMTestInfo(
......@@ -176,6 +172,13 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner,
vllm_runner_kwargs={
"attention_config": {
"backend": "ROCM_AITER_FA",
},
}
if current_platform.is_rocm()
else None,
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[
pytest.mark.core_model,
......@@ -256,8 +259,19 @@ VLM_TEST_SETTINGS = {
image_size_factors=[(0.25, 0.2, 0.15)],
vllm_runner_kwargs={
"model_impl": "transformers",
# TODO: [ROCm] Revert this once issue #30167 is resolved
**(
{
"mm_processor_kwargs": {
"min_pixels": 256 * 28 * 28,
"max_pixels": 1280 * 28 * 28,
},
}
if current_platform.is_rocm()
else {}
),
},
marks=[large_gpu_mark(min_gb=32)],
marks=[large_gpu_mark(min_gb=80 if current_platform.is_rocm() else 32)],
),
#### Extended model tests
"aria": VLMTestInfo(
......@@ -498,6 +512,7 @@ VLM_TEST_SETTINGS = {
max_model_len=8192,
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
num_logprobs=10 if current_platform.is_rocm() else 5,
),
"intern_vl-hf": VLMTestInfo(
models=["OpenGVLab/InternVL3-1B-hf"],
......@@ -513,6 +528,34 @@ VLM_TEST_SETTINGS = {
use_tokenizer_eos=True,
auto_cls=AutoModelForImageTextToText,
),
"isaac": VLMTestInfo(
models=[
"PerceptronAI/Isaac-0.1",
"PerceptronAI/Isaac-0.2-2B-Preview",
],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: (
f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n"
),
img_idx_to_prompt=lambda idx: "<image>",
single_image_prompts=IMAGE_ASSETS.prompts(
{
"stop_sign": "<vlm_image>Please describe the image shortly.",
"cherry_blossom": "<vlm_image>Please infer the season with reason.",
}
),
multi_image_prompt=(
"Picture 1: <vlm_image>\n"
"Picture 2: <vlm_image>\n"
"Describe these two images with one paragraph respectively."
),
enforce_eager=False,
max_model_len=4096,
max_num_seqs=2,
hf_model_kwargs={"device_map": "auto"},
patch_hf_runner=model_utils.isaac_patch_hf_runner,
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
),
"kimi_vl": VLMTestInfo(
models=["moonshotai/Kimi-VL-A3B-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
......@@ -648,7 +691,17 @@ VLM_TEST_SETTINGS = {
hf_output_post_proc=model_utils.minimax_vl_01_hf_output,
patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner,
auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=80)],
marks=[
large_gpu_mark(min_gb=80),
# TODO: [ROCm] Fix pickle issue with ROCm spawn and tp>1
pytest.mark.skipif(
current_platform.is_rocm(),
reason=(
"ROCm: Model too large for single GPU; "
"multi-GPU blocked by HF _LazyConfigMapping pickle issue with spawn"
),
),
],
),
"molmo": VLMTestInfo(
models=["allenai/Molmo-7B-D-0924"],
......
......@@ -37,10 +37,12 @@ audio_lora_path = MODEL_NAME
models = [MODEL_NAME]
@pytest.fixture(autouse=True)
def set_attention_backend_for_rocm(monkeypatch):
@pytest.fixture
def granite_speech_attention_config():
"""Return attention config for Granite Speech tests on ROCm."""
if current_platform.is_rocm():
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
return {"backend": "ROCM_AITER_FA"}
return None
def run_test(
......@@ -55,6 +57,7 @@ def run_test(
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: str | None = None,
attention_config: dict | None = None,
):
"""Inference result should be the same between hf and vllm.
......@@ -82,6 +85,7 @@ def run_test(
enable_lora=True,
max_lora_rank=64,
enforce_eager=True,
attention_config=attention_config,
) as vllm_model:
lora_request = LoRARequest("audio", 1, audio_lora_path)
vllm_outputs_per_case = [
......@@ -133,6 +137,7 @@ def test_models(
vllm_runner,
model: str,
audio_assets: AudioTestAssets,
granite_speech_attention_config,
dtype: str,
max_model_len: int,
max_tokens: int,
......@@ -159,4 +164,5 @@ def test_models(
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
)
\ No newline at end of file
attention_config=granite_speech_attention_config,
)
......@@ -8,7 +8,7 @@ from PIL.Image import Image
from transformers import AutoProcessor
from vllm import LLM, EngineArgs, SamplingParams
from vllm.multimodal.utils import encode_image_base64
from vllm.multimodal.utils import encode_image_url
MODEL_NAME = "Kwai-Keye/Keye-VL-8B-Preview"
......@@ -31,10 +31,7 @@ def test_keye_vl(
question: str,
):
images = [asset.pil_image for asset in image_assets]
image_urls = [
f"data:image/jpeg;base64,{encode_image_base64(image)}" for image in images
]
image_urls = [encode_image_url(image) for image in images]
engine_args = EngineArgs(
model=MODEL_NAME,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
import pytest
from transformers import AutoModel
from tests.models.utils import check_logprobs_close
from vllm.assets.image import ImageAsset
from ....conftest import HfRunner, PromptImageInput, VllmRunner
from ....utils import create_new_process_for_each_test
IMAGE = ImageAsset("paper-11").pil_image_ext(ext="png").convert("RGB")
PROMPT = "</s><s><predict_bbox><predict_classes><output_markdown>"
def run_test(
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    inputs: Sequence[tuple[list[str], PromptImageInput]],
    model: str,
    *,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
) -> None:
    """Check that vLLM and HF produce close greedy logprobs for ``model``.

    Args:
        hf_runner: Context-manager factory for the HF reference runner.
        vllm_runner: Context-manager factory for the vLLM runner.
        inputs: Test cases; each one is a ``(prompts, images)`` pair that is
            forwarded as-is to both runners.
        model: Model identifier handed to both runners.
        dtype: Dtype string handed to both runners.
        max_tokens: Generation length passed to both generate calls.
        num_logprobs: Number of logprobs requested from both runners.
    """
    # Collect the vLLM outputs, one entry per test case.
    vllm_results = []
    with vllm_runner(
        model,
        dtype=dtype,
        max_num_seqs=64,
        limit_mm_per_prompt={"image": 1},
        trust_remote_code=True,
    ) as vllm_model:
        for case_prompts, case_images in inputs:
            vllm_results.append(
                vllm_model.generate_greedy_logprobs(
                    case_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                )
            )

    # Collect the HF reference outputs for the same cases.
    hf_results = []
    with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model:
        for case_prompts, case_images in inputs:
            hf_results.append(
                hf_model.generate_greedy_logprobs_limit(
                    case_prompts,
                    max_tokens,
                    num_logprobs=num_logprobs,
                    images=case_images,
                    use_cache=False,  # HF Nemotron Parse crashes here without this
                )
            )

    # Compare case by case, HF first and vLLM second.
    for hf_case, vllm_case in zip(hf_results, vllm_results):
        check_logprobs_close(
            outputs_0_lst=hf_case,
            outputs_1_lst=vllm_case,
            name_0="hf",
            name_1="vllm",
        )
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("num_logprobs", [5])
@create_new_process_for_each_test("spawn")
def test_models(
    hf_runner, vllm_runner, model: str, dtype: str, num_logprobs: int
) -> None:
    """Run the HF-vs-vLLM logprob comparison on one batch of ten identical requests."""
    # A single test case: the same prompt and image repeated ten times.
    repeat = 10
    single_case = ([PROMPT] * repeat, [IMAGE] * repeat)

    run_test(
        hf_runner,
        vllm_runner,
        inputs=[single_case],
        model=model,
        dtype=dtype,
        max_tokens=100,
        num_logprobs=num_logprobs,
    )
......@@ -269,7 +269,7 @@ def run_embedding_input_test(
"""Inference result should be the same between
original image/video input and image/video embeddings input.
"""
from transformers import AutoProcessor # noqa: F401
from transformers import AutoProcessor
processor = AutoProcessor.from_pretrained(model)
......
......@@ -14,10 +14,10 @@ import pytest
from transformers import AutoProcessor
from vllm import LLM, EngineArgs, SamplingParams
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.multimodal.utils import encode_image_base64
from vllm.multimodal.utils import encode_image_url
from vllm.multimodal.video import sample_frames_from_video
from vllm.platforms import current_platform
from vllm.v1.attention.backends.registry import AttentionBackendEnum
from ....utils import create_new_process_for_each_test
from ...utils import dummy_hf_overrides
......@@ -178,8 +178,7 @@ def build_dots_ocr_prompt(images, config):
"""Build Dots.OCR specific prompt with OCR instructions."""
# Use only stop_sign image for Dots.OCR
image = images[0] # Already filtered to stop_sign
image_url = f"data:image/jpeg;base64,{encode_image_base64(image)}"
image_url = encode_image_url(image)
placeholders = [{"type": "image_url", "image_url": {"url": image_url}}]
messages = [
......@@ -204,9 +203,7 @@ def build_processor_prompt(images, config):
config["model_name"], trust_remote_code=True
)
image_urls = [
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
]
image_urls = [encode_image_url(img) for img in images]
placeholders = [{"type": "image", "image": url} for url in image_urls]
messages = [
{
......@@ -225,9 +222,7 @@ def build_processor_prompt(images, config):
def build_ovis_prompt(images, config):
"""Build Ovis2.5 specific prompt with custom format."""
image_urls = [
f"data:image/jpeg;base64,{encode_image_base64(img)}" for img in images
]
image_urls = [encode_image_url(img) for img in images]
placeholders = "\n".join(
f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
......
......@@ -111,4 +111,5 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]
assert choice.message.content == "In the first audio clip, you hear a brief"
assert choice.finish_reason == "length"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment