Unverified Commit afb4429b authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[CI/Build] Reorganize models tests (#17459)


Signed-off-by: default avatarDarkLight1337 <tlleungac@connect.ust.hk>
parent aa4502e7
......@@ -8,7 +8,7 @@ import pytest
from vllm.config import PoolerConfig
from vllm.platforms import current_platform
from ..utils import check_embeddings_close
from ...utils import check_embeddings_close
@pytest.mark.parametrize(
......
......@@ -7,11 +7,10 @@ from array import array
import openai
import pytest
import pytest_asyncio
from scipy.spatial.distance import cosine
import vllm
import vllm.config
from vllm import LLM, SamplingParams
from vllm.config import ModelConfig
from vllm.utils import STR_BACKEND_ENV_VAR
from ....utils import RemoteOpenAIServer
......@@ -31,73 +30,45 @@ def _arr(arr):
return array("i", arr)
def test_find_array(monkeypatch: pytest.MonkeyPatch):
# GritLM embedding implementation is only supported by XFormers backend.
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
from vllm.model_executor.models.gritlm import GritLMPooler
# Create an LLM object to get the model config.
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
pooler = GritLMPooler(model_config=llm.llm_engine.model_config)
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
with pytest.raises(ValueError):
pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
@pytest.fixture(scope="module")
def server_embedding():
# GritLM embedding implementation is only supported by XFormers backend.
args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
with pytest.MonkeyPatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.fixture(scope="module")
def server_generate():
args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
with pytest.MonkeyPatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
def test_find_array():
from vllm.model_executor.models.gritlm import GritLMPooler
model_config = ModelConfig(
MODEL_NAME,
task="embed",
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
dtype="bfloat16",
seed=0,
)
pooler = GritLMPooler(model_config=model_config)
@pytest_asyncio.fixture
async def client_embedding(server_embedding: RemoteOpenAIServer):
async with server_embedding.get_async_client() as async_client:
yield async_client
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
@pytest_asyncio.fixture
async def client_generate(server_generate: RemoteOpenAIServer):
async with server_generate.get_async_client() as async_client:
yield async_client
with pytest.raises(ValueError):
pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
def run_llm_encode(
llm: vllm.LLM,
llm: LLM,
queries: list[str],
instruction: str,
) -> list[float]:
outputs = llm.encode([instruction + q for q in queries], )
) -> list[list[float]]:
outputs = llm.embed([instruction + q for q in queries])
return [output.outputs.embedding for output in outputs]
async def run_client_embeddings(
client: vllm.LLM,
client: openai.AsyncOpenAI,
queries: list[str],
instruction: str,
) -> list[float]:
) -> list[list[float]]:
outputs = await client.embeddings.create(
model=MODEL_NAME,
input=[instruction + q for q in queries],
......@@ -132,7 +103,7 @@ def get_test_data():
return queries, q_instruction, documents, d_instruction
def validate_embed_output(q_rep: list[float], d_rep: list[float]):
def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]):
cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0])
assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001)
......@@ -143,70 +114,100 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]):
assert math.isclose(cosine_sim_q1_d0, 0.120, abs_tol=0.001)
cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1])
assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001)
assert math.isclose(cosine_sim_q1_d1, 0.534, abs_tol=0.001)
def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch):
def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch,
vllm_runner):
# GritLM embedding implementation is only supported by XFormers backend.
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
queries, q_instruction, documents, d_instruction = get_test_data()
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
with vllm_runner(
MODEL_NAME,
task="embed",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.model
d_rep = run_llm_encode(
llm,
documents,
d_instruction,
)
q_rep = run_llm_encode(
llm,
queries,
q_instruction,
)
validate_embed_output(q_rep, d_rep)
@pytest.mark.asyncio
async def test_gritlm_api_server_embedding():
queries, q_instruction, documents, d_instruction = get_test_data()
# GritLM embedding implementation is only supported by XFormers backend.
args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
env_dict = {STR_BACKEND_ENV_VAR: "XFORMERS"}
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server:
client_embedding = server.get_async_client()
d_rep = run_llm_encode(
llm,
d_rep = await run_client_embeddings(
client_embedding,
documents,
d_instruction,
)
q_rep = run_llm_encode(
llm,
q_rep = await run_client_embeddings(
client_embedding,
queries,
q_instruction,
)
validate_embed_output(q_rep, d_rep)
@pytest.mark.asyncio
async def test_gritlm_api_server_embedding(
client_embedding: openai.AsyncOpenAI, ):
queries, q_instruction, documents, d_instruction = get_test_data()
validate_embed_output(q_rep, d_rep)
d_rep = await run_client_embeddings(
client_embedding,
documents,
d_instruction,
)
q_rep = await run_client_embeddings(
client_embedding,
queries,
q_instruction,
)
validate_embed_output(q_rep, d_rep)
def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
# GritLM embedding implementation is only supported by XFormers backend.
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
def test_gritlm_offline_gen():
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
with vllm_runner(
MODEL_NAME,
task="generate",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.model
llm = vllm.LLM(MODEL_NAME, max_model_len=MAX_MODEL_LEN)
sampling_params = vllm.SamplingParams(temperature=0.0, max_tokens=256)
outputs = llm.generate(input, sampling_params=sampling_params)
sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
outputs = llm.generate(input, sampling_params=sampling_params)
assert outputs[0].outputs[0].text == "The capital of France is Paris."
assert outputs[0].outputs[0].text == "The capital of France is Paris."
@pytest.mark.asyncio
async def test_gritlm_api_server_gen(client_generate: openai.AsyncOpenAI):
async def test_gritlm_api_server_generate():
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
outputs = await client_generate.completions.create(
model=MODEL_NAME,
prompt=input,
max_tokens=256,
temperature=0.0,
)
# GritLM embedding implementation is only supported by XFormers backend.
args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
env_dict = {"VLLM_USE_V1": "0", STR_BACKEND_ENV_VAR: "XFORMERS"}
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server:
client_generate = server.get_async_client()
outputs = await client_generate.completions.create(
model=MODEL_NAME,
prompt=input,
max_tokens=256,
temperature=0.0,
)
assert outputs.choices[0].text == "The capital of France is Paris."
......@@ -8,9 +8,10 @@ import math
import pytest
from tests.models.embedding.utils import check_embeddings_close, matryoshka_fy
from vllm import PoolingParams
from ...utils import check_embeddings_close, matryoshka_fy
SCORING_MODELS = [
"jinaai/jina-reranker-v2-base-multilingual", # Roberta
]
......
......@@ -5,9 +5,7 @@ Run `pytest tests/models/embedding/language/test_snowflake_arctic_embed.py`.
"""
import pytest
from tests.models.embedding.utils import EmbedModelInfo
from ..utils import check_embeddings_close
from ...utils import EmbedModelInfo, check_embeddings_close
EMBEDDING_PROMPTS = [
'what is snowflake?', 'Where can I get the best tacos?', 'The Data Cloud!',
......
......@@ -267,6 +267,7 @@ VLM_TEST_SETTINGS = {
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
max_model_len=4096,
max_num_seqs=2,
dtype="bfloat16",
auto_cls=AutoModelForImageTextToText,
vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
patch_hf_runner=model_utils.gemma3_patch_hf_runner,
......@@ -423,6 +424,8 @@ VLM_TEST_SETTINGS = {
get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
# FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
marks=[pytest.mark.skip("HF import fails")],
),
"minicpmo_26": VLMTestInfo(
models=["openbmb/MiniCPM-o-2_6"],
......@@ -434,6 +437,8 @@ VLM_TEST_SETTINGS = {
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
# FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
marks=[pytest.mark.skip("HF import fails")],
),
"minicpmv_26": VLMTestInfo(
models=["openbmb/MiniCPM-V-2_6"],
......@@ -445,6 +450,8 @@ VLM_TEST_SETTINGS = {
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
# FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
marks=[pytest.mark.skip("HF import fails")],
),
"minimax_vl_01": VLMTestInfo(
models=["MiniMaxAI/MiniMax-VL-01"],
......
......@@ -16,6 +16,7 @@ INTERLEAVED_PROMPT = base_prompt("<image><video><image>\n")
NONINTERLEAVED_PROMPT = base_prompt("<image><image><video>\n")
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [128])
......
......@@ -17,7 +17,8 @@ from vllm.sequence import SampleLogprobs
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
_ImageAssets)
from ....quantization.utils import is_quant_method_supported
from ....utils import large_gpu_test
from ....utils import (create_new_process_for_each_test, large_gpu_test,
multi_gpu_test)
from ...utils import check_logprobs_close
_LIMIT_IMAGE_PER_PROMPT = 3
......@@ -393,6 +394,37 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
)
@create_new_process_for_each_test()
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_distributed(
hf_runner,
vllm_runner,
image_assets,
distributed_executor_backend,
model,
dtype,
max_tokens,
num_logprobs,
) -> None:
run_test(
hf_runner,
vllm_runner,
image_assets,
model=model,
size_factors=[0.25, 0.5, 1.0],
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend,
)
@large_gpu_test(min_gb=48)
@pytest.mark.core_model
@pytest.mark.parametrize("model", models)
......
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling.
Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`.
"""
from typing import Optional
import pytest
from vllm import LLM, SamplingParams
from vllm import SamplingParams
from vllm.assets.audio import AudioAsset
from ....conftest import VllmRunner
from ....utils import create_new_process_for_each_test, multi_gpu_test
PROMPTS = [
......@@ -92,6 +89,7 @@ EXPECTED = {
def run_test(
vllm_runner: type[VllmRunner],
model: str,
*,
tensor_parallel_size: int,
......@@ -100,38 +98,52 @@ def run_test(
prompt_list = PROMPTS * 10
expected_list = EXPECTED[model] * 10
llm = LLM(
model=model,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
)
with vllm_runner(
model,
max_model_len=448,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
llm = vllm_model.model
sampling_params = SamplingParams(
temperature=0,
top_p=1.0,
max_tokens=200,
)
sampling_params = SamplingParams(
temperature=0,
top_p=1.0,
max_tokens=200,
)
outputs = llm.generate(prompt_list, sampling_params)
outputs = llm.generate(prompt_list, sampling_params)
for output, expected in zip(outputs, expected_list):
print(output.outputs[0].text)
assert output.outputs[0].text == expected
@create_new_process_for_each_test()
@create_new_process_for_each_test("spawn")
@pytest.mark.core_model
@pytest.mark.parametrize(
"model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
def test_models(model) -> None:
run_test(model, tensor_parallel_size=1)
def test_models(vllm_runner, model) -> None:
run_test(
vllm_runner,
model,
tensor_parallel_size=1,
)
@create_new_process_for_each_test("spawn")
@multi_gpu_test(num_gpus=2)
@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
def test_models_distributed(model, distributed_executor_backend) -> None:
run_test(model,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend)
def test_models_distributed(
vllm_runner,
model,
distributed_executor_backend,
) -> None:
run_test(
vllm_runner,
model,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend,
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment