Unverified Commit afb4429b authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[CI/Build] Reorganize models tests (#17459)


Signed-off-by: default avatarDarkLight1337 <tlleungac@connect.ust.hk>
parent aa4502e7
...@@ -8,7 +8,7 @@ import pytest ...@@ -8,7 +8,7 @@ import pytest
from vllm.config import PoolerConfig from vllm.config import PoolerConfig
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ..utils import check_embeddings_close from ...utils import check_embeddings_close
@pytest.mark.parametrize( @pytest.mark.parametrize(
......
...@@ -7,11 +7,10 @@ from array import array ...@@ -7,11 +7,10 @@ from array import array
import openai import openai
import pytest import pytest
import pytest_asyncio
from scipy.spatial.distance import cosine from scipy.spatial.distance import cosine
import vllm from vllm import LLM, SamplingParams
import vllm.config from vllm.config import ModelConfig
from vllm.utils import STR_BACKEND_ENV_VAR from vllm.utils import STR_BACKEND_ENV_VAR
from ....utils import RemoteOpenAIServer from ....utils import RemoteOpenAIServer
...@@ -31,73 +30,45 @@ def _arr(arr): ...@@ -31,73 +30,45 @@ def _arr(arr):
return array("i", arr) return array("i", arr)
def test_find_array(monkeypatch: pytest.MonkeyPatch): def test_find_array():
# GritLM embedding implementation is only supported by XFormers backend. from vllm.model_executor.models.gritlm import GritLMPooler
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
from vllm.model_executor.models.gritlm import GritLMPooler
# Create an LLM object to get the model config.
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN)
pooler = GritLMPooler(model_config=llm.llm_engine.model_config)
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
with pytest.raises(ValueError):
pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
@pytest.fixture(scope="module")
def server_embedding():
# GritLM embedding implementation is only supported by XFormers backend.
args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
with pytest.MonkeyPatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
@pytest.fixture(scope="module")
def server_generate():
args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
with pytest.MonkeyPatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
yield remote_server
model_config = ModelConfig(
MODEL_NAME,
task="embed",
tokenizer=MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
dtype="bfloat16",
seed=0,
)
pooler = GritLMPooler(model_config=model_config)
@pytest_asyncio.fixture arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
async def client_embedding(server_embedding: RemoteOpenAIServer):
async with server_embedding.get_async_client() as async_client:
yield async_client
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=0) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=1) == 3
assert pooler._find_array(arr, _arr([3, 4, 5]), start_idx=5) == -1
assert pooler._find_array(arr, _arr([3, 5]), start_idx=0) == -1
@pytest_asyncio.fixture with pytest.raises(ValueError):
async def client_generate(server_generate: RemoteOpenAIServer): pooler._find_array(arr, _arr([3, 4, 5]), start_idx=-1)
async with server_generate.get_async_client() as async_client:
yield async_client
def run_llm_encode( def run_llm_encode(
llm: vllm.LLM, llm: LLM,
queries: list[str], queries: list[str],
instruction: str, instruction: str,
) -> list[float]: ) -> list[list[float]]:
outputs = llm.encode([instruction + q for q in queries], ) outputs = llm.embed([instruction + q for q in queries])
return [output.outputs.embedding for output in outputs] return [output.outputs.embedding for output in outputs]
async def run_client_embeddings( async def run_client_embeddings(
client: vllm.LLM, client: openai.AsyncOpenAI,
queries: list[str], queries: list[str],
instruction: str, instruction: str,
) -> list[float]: ) -> list[list[float]]:
outputs = await client.embeddings.create( outputs = await client.embeddings.create(
model=MODEL_NAME, model=MODEL_NAME,
input=[instruction + q for q in queries], input=[instruction + q for q in queries],
...@@ -132,7 +103,7 @@ def get_test_data(): ...@@ -132,7 +103,7 @@ def get_test_data():
return queries, q_instruction, documents, d_instruction return queries, q_instruction, documents, d_instruction
def validate_embed_output(q_rep: list[float], d_rep: list[float]): def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]):
cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0]) cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0])
assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001) assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001)
...@@ -143,70 +114,100 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]): ...@@ -143,70 +114,100 @@ def validate_embed_output(q_rep: list[float], d_rep: list[float]):
assert math.isclose(cosine_sim_q1_d0, 0.120, abs_tol=0.001) assert math.isclose(cosine_sim_q1_d0, 0.120, abs_tol=0.001)
cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1]) cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1])
assert math.isclose(cosine_sim_q1_d1, 0.532, abs_tol=0.001) assert math.isclose(cosine_sim_q1_d1, 0.534, abs_tol=0.001)
def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch): def test_gritlm_offline_embedding(monkeypatch: pytest.MonkeyPatch,
vllm_runner):
# GritLM embedding implementation is only supported by XFormers backend. # GritLM embedding implementation is only supported by XFormers backend.
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS") m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
queries, q_instruction, documents, d_instruction = get_test_data() queries, q_instruction, documents, d_instruction = get_test_data()
llm = vllm.LLM(MODEL_NAME, task="embed", max_model_len=MAX_MODEL_LEN) with vllm_runner(
MODEL_NAME,
task="embed",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.model
d_rep = run_llm_encode(
llm,
documents,
d_instruction,
)
q_rep = run_llm_encode(
llm,
queries,
q_instruction,
)
validate_embed_output(q_rep, d_rep)
@pytest.mark.asyncio
async def test_gritlm_api_server_embedding():
queries, q_instruction, documents, d_instruction = get_test_data()
# GritLM embedding implementation is only supported by XFormers backend.
args = ["--task", "embed", "--max_model_len", str(MAX_MODEL_LEN)]
env_dict = {STR_BACKEND_ENV_VAR: "XFORMERS"}
with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server:
client_embedding = server.get_async_client()
d_rep = run_llm_encode( d_rep = await run_client_embeddings(
llm, client_embedding,
documents, documents,
d_instruction, d_instruction,
) )
q_rep = run_llm_encode( q_rep = await run_client_embeddings(
llm, client_embedding,
queries, queries,
q_instruction, q_instruction,
) )
validate_embed_output(q_rep, d_rep) validate_embed_output(q_rep, d_rep)
@pytest.mark.asyncio
async def test_gritlm_api_server_embedding(
client_embedding: openai.AsyncOpenAI, ):
queries, q_instruction, documents, d_instruction = get_test_data()
d_rep = await run_client_embeddings(
client_embedding,
documents,
d_instruction,
)
q_rep = await run_client_embeddings(
client_embedding,
queries,
q_instruction,
)
validate_embed_output(q_rep, d_rep) def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
# GritLM embedding implementation is only supported by XFormers backend.
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
m.setenv(STR_BACKEND_ENV_VAR, "XFORMERS")
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
def test_gritlm_offline_gen(): with vllm_runner(
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n" MODEL_NAME,
task="generate",
max_model_len=MAX_MODEL_LEN,
) as vllm_model:
llm = vllm_model.model
llm = vllm.LLM(MODEL_NAME, max_model_len=MAX_MODEL_LEN) sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
sampling_params = vllm.SamplingParams(temperature=0.0, max_tokens=256) outputs = llm.generate(input, sampling_params=sampling_params)
outputs = llm.generate(input, sampling_params=sampling_params)
assert outputs[0].outputs[0].text == "The capital of France is Paris." assert outputs[0].outputs[0].text == "The capital of France is Paris."
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_gritlm_api_server_gen(client_generate: openai.AsyncOpenAI): async def test_gritlm_api_server_generate():
input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n" input = "<|user|>\nWhat is the capital of France?\n<|assistant|>\n"
outputs = await client_generate.completions.create( # GritLM embedding implementation is only supported by XFormers backend.
model=MODEL_NAME, args = ["--task", "generate", "--max_model_len", str(MAX_MODEL_LEN)]
prompt=input, env_dict = {"VLLM_USE_V1": "0", STR_BACKEND_ENV_VAR: "XFORMERS"}
max_tokens=256,
temperature=0.0, with RemoteOpenAIServer(MODEL_NAME, args, env_dict=env_dict) as server:
) client_generate = server.get_async_client()
outputs = await client_generate.completions.create(
model=MODEL_NAME,
prompt=input,
max_tokens=256,
temperature=0.0,
)
assert outputs.choices[0].text == "The capital of France is Paris." assert outputs.choices[0].text == "The capital of France is Paris."
...@@ -8,9 +8,10 @@ import math ...@@ -8,9 +8,10 @@ import math
import pytest import pytest
from tests.models.embedding.utils import check_embeddings_close, matryoshka_fy
from vllm import PoolingParams from vllm import PoolingParams
from ...utils import check_embeddings_close, matryoshka_fy
SCORING_MODELS = [ SCORING_MODELS = [
"jinaai/jina-reranker-v2-base-multilingual", # Roberta "jinaai/jina-reranker-v2-base-multilingual", # Roberta
] ]
......
...@@ -5,9 +5,7 @@ Run `pytest tests/models/embedding/language/test_snowflake_arctic_embed.py`. ...@@ -5,9 +5,7 @@ Run `pytest tests/models/embedding/language/test_snowflake_arctic_embed.py`.
""" """
import pytest import pytest
from tests.models.embedding.utils import EmbedModelInfo from ...utils import EmbedModelInfo, check_embeddings_close
from ..utils import check_embeddings_close
EMBEDDING_PROMPTS = [ EMBEDDING_PROMPTS = [
'what is snowflake?', 'Where can I get the best tacos?', 'The Data Cloud!', 'what is snowflake?', 'Where can I get the best tacos?', 'The Data Cloud!',
......
...@@ -267,6 +267,7 @@ VLM_TEST_SETTINGS = { ...@@ -267,6 +267,7 @@ VLM_TEST_SETTINGS = {
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501 multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
dtype="bfloat16",
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}}, vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
patch_hf_runner=model_utils.gemma3_patch_hf_runner, patch_hf_runner=model_utils.gemma3_patch_hf_runner,
...@@ -423,6 +424,8 @@ VLM_TEST_SETTINGS = { ...@@ -423,6 +424,8 @@ VLM_TEST_SETTINGS = {
get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id], get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner, patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
# FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
marks=[pytest.mark.skip("HF import fails")],
), ),
"minicpmo_26": VLMTestInfo( "minicpmo_26": VLMTestInfo(
models=["openbmb/MiniCPM-o-2_6"], models=["openbmb/MiniCPM-o-2_6"],
...@@ -434,6 +437,8 @@ VLM_TEST_SETTINGS = { ...@@ -434,6 +437,8 @@ VLM_TEST_SETTINGS = {
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner, patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
# FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
marks=[pytest.mark.skip("HF import fails")],
), ),
"minicpmv_26": VLMTestInfo( "minicpmv_26": VLMTestInfo(
models=["openbmb/MiniCPM-V-2_6"], models=["openbmb/MiniCPM-V-2_6"],
...@@ -445,6 +450,8 @@ VLM_TEST_SETTINGS = { ...@@ -445,6 +450,8 @@ VLM_TEST_SETTINGS = {
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501 get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(['<|im_end|>', '<|endoftext|>']), # noqa: E501
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner, patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
# FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
marks=[pytest.mark.skip("HF import fails")],
), ),
"minimax_vl_01": VLMTestInfo( "minimax_vl_01": VLMTestInfo(
models=["MiniMaxAI/MiniMax-VL-01"], models=["MiniMaxAI/MiniMax-VL-01"],
......
...@@ -16,6 +16,7 @@ INTERLEAVED_PROMPT = base_prompt("<image><video><image>\n") ...@@ -16,6 +16,7 @@ INTERLEAVED_PROMPT = base_prompt("<image><video><image>\n")
NONINTERLEAVED_PROMPT = base_prompt("<image><image><video>\n") NONINTERLEAVED_PROMPT = base_prompt("<image><image><video>\n")
@pytest.mark.core_model
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["float16"]) @pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("max_tokens", [128])
......
...@@ -17,7 +17,8 @@ from vllm.sequence import SampleLogprobs ...@@ -17,7 +17,8 @@ from vllm.sequence import SampleLogprobs
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
_ImageAssets) _ImageAssets)
from ....quantization.utils import is_quant_method_supported from ....quantization.utils import is_quant_method_supported
from ....utils import large_gpu_test from ....utils import (create_new_process_for_each_test, large_gpu_test,
multi_gpu_test)
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
_LIMIT_IMAGE_PER_PROMPT = 3 _LIMIT_IMAGE_PER_PROMPT = 3
...@@ -393,6 +394,37 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, ...@@ -393,6 +394,37 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
) )
@create_new_process_for_each_test()
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_distributed(
hf_runner,
vllm_runner,
image_assets,
distributed_executor_backend,
model,
dtype,
max_tokens,
num_logprobs,
) -> None:
run_test(
hf_runner,
vllm_runner,
image_assets,
model=model,
size_factors=[0.25, 0.5, 1.0],
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend,
)
@large_gpu_test(min_gb=48) @large_gpu_test(min_gb=48)
@pytest.mark.core_model @pytest.mark.core_model
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("model", models)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling.
Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`.
"""
from typing import Optional from typing import Optional
import pytest import pytest
from vllm import LLM, SamplingParams from vllm import SamplingParams
from vllm.assets.audio import AudioAsset from vllm.assets.audio import AudioAsset
from ....conftest import VllmRunner
from ....utils import create_new_process_for_each_test, multi_gpu_test from ....utils import create_new_process_for_each_test, multi_gpu_test
PROMPTS = [ PROMPTS = [
...@@ -92,6 +89,7 @@ EXPECTED = { ...@@ -92,6 +89,7 @@ EXPECTED = {
def run_test( def run_test(
vllm_runner: type[VllmRunner],
model: str, model: str,
*, *,
tensor_parallel_size: int, tensor_parallel_size: int,
...@@ -100,38 +98,52 @@ def run_test( ...@@ -100,38 +98,52 @@ def run_test(
prompt_list = PROMPTS * 10 prompt_list = PROMPTS * 10
expected_list = EXPECTED[model] * 10 expected_list = EXPECTED[model] * 10
llm = LLM( with vllm_runner(
model=model, model,
tensor_parallel_size=tensor_parallel_size, max_model_len=448,
distributed_executor_backend=distributed_executor_backend, tensor_parallel_size=tensor_parallel_size,
) distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
llm = vllm_model.model
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0, temperature=0,
top_p=1.0, top_p=1.0,
max_tokens=200, max_tokens=200,
) )
outputs = llm.generate(prompt_list, sampling_params) outputs = llm.generate(prompt_list, sampling_params)
for output, expected in zip(outputs, expected_list): for output, expected in zip(outputs, expected_list):
print(output.outputs[0].text) print(output.outputs[0].text)
assert output.outputs[0].text == expected assert output.outputs[0].text == expected
@create_new_process_for_each_test() @create_new_process_for_each_test("spawn")
@pytest.mark.core_model @pytest.mark.core_model
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"]) "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"])
def test_models(model) -> None: def test_models(vllm_runner, model) -> None:
run_test(model, tensor_parallel_size=1) run_test(
vllm_runner,
model,
tensor_parallel_size=1,
)
@create_new_process_for_each_test("spawn")
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
@pytest.mark.core_model @pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"]) @pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
def test_models_distributed(model, distributed_executor_backend) -> None: def test_models_distributed(
run_test(model, vllm_runner,
tensor_parallel_size=2, model,
distributed_executor_backend=distributed_executor_backend) distributed_executor_backend,
) -> None:
run_test(
vllm_runner,
model,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend,
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment