Commit 4eabe123 authored by zhuwenwen
Browse files

Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori

parents 45840cd2 58738772
# SPDX-License-Identifier: Apache-2.0
from unittest.mock import MagicMock
import pytest
from tests.entrypoints.openai.tool_parsers.utils import (
run_tool_extraction, run_tool_extraction_streaming)
from vllm.entrypoints.openai.protocol import FunctionCall
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
# Fixtures for the Llama 4 pythonic tool parser tests. Each *_OUTPUT string is
# raw model text (a bracketed list of Python-style calls) and the matching
# *_CALL is the FunctionCall it is compared against, with the arguments given
# as a JSON string. The cases mirror the pythonic parser tests but use the
# Llama4-specific format.

# Single call with two string keyword arguments.
SIMPLE_FUNCTION_OUTPUT = "[get_weather(city='LA', metric='C')]"
SIMPLE_FUNCTION_CALL = FunctionCall(
name="get_weather",
arguments='{"city": "LA", "metric": "C"}',
)
# Covers str, int, nested dict, None, bool and list argument values; the
# expected JSON uses null/true for None/True.
MORE_TYPES_FUNCTION_OUTPUT = ("[register_user(name='Doe', "
"age=9, "
"address={'city': 'LA', 'state': 'CA'}, "
"role=None, "
"passed_test=True, "
"aliases=['John', 'Johnny'])]")
MORE_TYPES_FUNCTION_CALL = FunctionCall(
name="register_user",
arguments='{"name": "Doe", '
'"age": 9, '
'"address": {"city": "LA", "state": "CA"}, '
'"role": null, '
'"passed_test": true, '
'"aliases": ["John", "Johnny"]}',
)
# A call without arguments is expected to yield an empty JSON object.
PARAMETERLESS_FUNCTION_OUTPUT = "[get_weather()]"
PARAMETERLESS_FUNCTION_CALL = FunctionCall(
name="get_weather",
arguments='{}',
)
# Empty dict literal as an argument value.
EMPTY_DICT_FUNCTION_OUTPUT = "[do_something_cool(additional_data={})]"
EMPTY_DICT_FUNCTION_CALL = FunctionCall(
name="do_something_cool",
arguments='{"additional_data": {}}',
)
# Empty list literal as an argument value.
EMPTY_LIST_FUNCTION_OUTPUT = "[do_something_cool(steps=[])]"
EMPTY_LIST_FUNCTION_CALL = FunctionCall(
name="do_something_cool",
arguments='{"steps": []}',
)
# Raw string: the model text keeps the backslashes, i.e. it contains \' and \"
# inside single-quoted Python strings. The expected JSON carries a plain
# apostrophe and JSON-escaped double quotes.
ESCAPED_STRING_FUNCTION_OUTPUT = (
r"[get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')]")
ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
name="get_weather",
arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
)
# The simple call wrapped in <|python_start|> ... <|python_end|> tags.
PYTHON_TAG_FUNCTION_OUTPUT = (
"<|python_start|>[get_weather(city='LA', metric='C')]<|python_end|>")
@pytest.mark.parametrize("streaming", [True, False])
def test_no_tool_call(streaming: bool):
    """Plain text is returned unchanged as content with no tool calls.

    Runs once through the streaming path and once through the
    non-streaming path of ``run_tool_extraction``.
    """
    plain_text = "How can I help you today?"
    parser_cls = ToolParserManager.get_tool_parser("llama4_pythonic")
    # The tokenizer is a bare MagicMock; this test only passes it through
    # to the parser constructor.
    parser: ToolParser = parser_cls(MagicMock())

    content, tool_calls = run_tool_extraction(parser,
                                              plain_text,
                                              streaming=streaming)

    assert content == plain_text
    assert len(tool_calls) == 0
# Two parallel calls behind an opening <|python_start|> tag, with no closing
# tag and no space after the comma that separates the calls.
test_str = ("<|python_start|>"
            "[get_weather(city='LA', metric='C'),"
            "register_user(name='Doe', age=9)]")

# Second call expected by every parallel-call case below.
_REGISTER_USER_CALL = FunctionCall(name="register_user",
                                   arguments='{"name": "Doe", "age": 9}')

# Each case is (streaming, model_output, expected_tool_calls). Every input is
# exercised in both streaming and non-streaming mode, and every id is unique
# so a failure can be selected and reported unambiguously.
TEST_CASES = [
    # The simple case must feed SIMPLE_FUNCTION_OUTPUT; the escaped-string
    # input has its own dedicated cases further down.
    pytest.param(True,
                 SIMPLE_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL],
                 id="simple_streaming"),
    pytest.param(False,
                 SIMPLE_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL],
                 id="simple_nonstreaming"),
    pytest.param(True,
                 MORE_TYPES_FUNCTION_OUTPUT, [MORE_TYPES_FUNCTION_CALL],
                 id="more_types_streaming"),
    pytest.param(False,
                 MORE_TYPES_FUNCTION_OUTPUT, [MORE_TYPES_FUNCTION_CALL],
                 id="more_types_nonstreaming"),
    pytest.param(True,
                 PARAMETERLESS_FUNCTION_OUTPUT, [PARAMETERLESS_FUNCTION_CALL],
                 id="parameterless_streaming"),
    pytest.param(False,
                 PARAMETERLESS_FUNCTION_OUTPUT, [PARAMETERLESS_FUNCTION_CALL],
                 id="parameterless_nonstreaming"),
    pytest.param(True,
                 EMPTY_DICT_FUNCTION_OUTPUT, [EMPTY_DICT_FUNCTION_CALL],
                 id="empty_dict_streaming"),
    pytest.param(False,
                 EMPTY_DICT_FUNCTION_OUTPUT, [EMPTY_DICT_FUNCTION_CALL],
                 id="empty_dict_nonstreaming"),
    pytest.param(True,
                 EMPTY_LIST_FUNCTION_OUTPUT, [EMPTY_LIST_FUNCTION_CALL],
                 id="empty_list_streaming"),
    pytest.param(False,
                 EMPTY_LIST_FUNCTION_OUTPUT, [EMPTY_LIST_FUNCTION_CALL],
                 id="empty_list_nonstreaming"),
    pytest.param(True,
                 ESCAPED_STRING_FUNCTION_OUTPUT,
                 [ESCAPED_STRING_FUNCTION_CALL],
                 id="escaped_string_streaming"),
    pytest.param(False,
                 ESCAPED_STRING_FUNCTION_OUTPUT,
                 [ESCAPED_STRING_FUNCTION_CALL],
                 id="escaped_string_nonstreaming"),
    # Parallel calls without any python tag and without optional whitespace.
    pytest.param(
        True,
        "[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]",
        [SIMPLE_FUNCTION_CALL, _REGISTER_USER_CALL],
        id="parallel_calls_streaming"),
    pytest.param(
        False,
        "[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]",
        [SIMPLE_FUNCTION_CALL, _REGISTER_USER_CALL],
        id="parallel_calls_nonstreaming"),
    pytest.param(True,
                 PYTHON_TAG_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL],
                 id="python_tag_streaming"),
    pytest.param(False,
                 PYTHON_TAG_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL],
                 id="python_tag_nonstreaming"),
    # Parallel calls behind an opening python tag; these ids are distinct
    # from the untagged parallel-call cases above.
    pytest.param(True,
                 test_str, [SIMPLE_FUNCTION_CALL, _REGISTER_USER_CALL],
                 id="python_tag_parallel_calls_streaming"),
    pytest.param(False,
                 "<|python_start|>[get_weather(city='LA', metric='C'), " +
                 "register_user(name='Doe', age=9)]",
                 [SIMPLE_FUNCTION_CALL, _REGISTER_USER_CALL],
                 id="python_tag_parallel_calls_nonstreaming"),
]
@pytest.mark.parametrize("streaming, model_output, expected_tool_calls",
                         TEST_CASES)
def test_tool_call(streaming: bool, model_output: str,
                   expected_tool_calls: list[FunctionCall]):
    """The parser extracts exactly the expected function calls, in order.

    Args:
        streaming: Whether ``run_tool_extraction`` uses its streaming path.
        model_output: Raw model text to parse.
        expected_tool_calls: The function calls the parse must yield.
    """
    parser_cls = ToolParserManager.get_tool_parser("llama4_pythonic")
    parser: ToolParser = parser_cls(MagicMock())

    # Only the extracted calls are checked; the content part is ignored.
    _, extracted = run_tool_extraction(parser,
                                       model_output,
                                       streaming=streaming)

    assert len(extracted) == len(expected_tool_calls)
    for got, want in zip(extracted, expected_tool_calls):
        assert got.type == "function"
        assert got.function == want
def test_streaming_tool_call_with_large_steps():
    """Three tagged tool calls arriving in one streaming delta are all parsed.

    The whole model output is delivered as a single delta, so the helper's
    one-tool-per-delta assertion is switched off.
    """
    parser_cls = ToolParserManager.get_tool_parser("llama4_pythonic")
    parser: ToolParser = parser_cls(MagicMock())

    whole_output = ("<|python_start|>[get_weather(city='LA', metric='C'), "
                    "get_weather(), "
                    "do_something_cool(steps=[])]<|python_end|>")
    reconstructor = run_tool_extraction_streaming(
        parser, [whole_output], assert_one_tool_per_delta=False)

    assert reconstructor.other_content == ""
    assert len(reconstructor.tool_calls) == 3

    wanted = (SIMPLE_FUNCTION_CALL, PARAMETERLESS_FUNCTION_CALL,
              EMPTY_LIST_FUNCTION_CALL)
    for tool_call, expected_call in zip(reconstructor.tool_calls, wanted):
        assert tool_call.function == expected_call
...@@ -148,6 +148,11 @@ def test_paged_attention( ...@@ -148,6 +148,11 @@ def test_paged_attention(
or (version == "rocm" and head_size not in (64, 128))): or (version == "rocm" and head_size not in (64, 128))):
pytest.skip() pytest.skip()
if (version == "rocm" and current_platform.is_navi()
and (kv_cache_dtype == "fp8" or head_size != 128
or block_size != 16 or use_alibi)):
pytest.skip()
global PARTITION_SIZE global PARTITION_SIZE
current_platform.seed_everything(seed) current_platform.seed_everything(seed)
...@@ -275,6 +280,7 @@ def test_paged_attention( ...@@ -275,6 +280,7 @@ def test_paged_attention(
scale, scale,
block_tables, block_tables,
seq_lens, seq_lens,
None,
block_size, block_size,
max_seq_len, max_seq_len,
alibi_slopes, alibi_slopes,
...@@ -286,7 +292,7 @@ def test_paged_attention( ...@@ -286,7 +292,7 @@ def test_paged_attention(
opcheck(torch.ops._rocm_C.paged_attention, opcheck(torch.ops._rocm_C.paged_attention,
(output, exp_sums, max_logits, tmp_output, query, (output, exp_sums, max_logits, tmp_output, query,
key_cache, value_cache, num_kv_heads, scale, block_tables, key_cache, value_cache, num_kv_heads, scale, block_tables,
seq_lens, block_size, max_seq_len, alibi_slopes, seq_lens, None, block_size, max_seq_len, alibi_slopes,
kv_cache_dtype, k_scale, v_scale), kv_cache_dtype, k_scale, v_scale),
cond=(head_size == HEAD_SIZES[0] cond=(head_size == HEAD_SIZES[0]
and block_size == BLOCK_SIZES[0])) and block_size == BLOCK_SIZES[0]))
......
...@@ -575,3 +575,21 @@ def test_moe_align_block_size_opcheck(): ...@@ -575,3 +575,21 @@ def test_moe_align_block_size_opcheck():
opcheck(torch.ops._moe_C.moe_align_block_size, opcheck(torch.ops._moe_C.moe_align_block_size,
(topk_ids, num_experts, block_size, sorted_ids, expert_ids, (topk_ids, num_experts, block_size, sorted_ids, expert_ids,
num_tokens_post_pad)) num_tokens_post_pad))
@pytest.mark.parametrize("m", [1, 33, 64, 222])
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("k", [128, 511, 1024])
@pytest.mark.parametrize("dtype",
                         [torch.float32, torch.float16, torch.bfloat16])
@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
def test_moe_sum(m: int, topk: int, k: int, dtype: torch.dtype):
    """Compare the custom ``moe_sum`` op against ``Tensor.sum`` over dim 1.

    A random ``(m, topk, k)`` tensor is reduced into an ``(m, k)`` output
    buffer by the op and checked against the torch reference; the op is then
    passed through ``opcheck`` with the same arguments.
    """
    stacked = torch.randn((m, topk, k), device="cuda", dtype=dtype)
    out = torch.empty((m, k), device="cuda", dtype=dtype)

    reference = stacked.sum(dim=1)

    # The op writes its result into the preallocated `out` buffer.
    torch.ops._moe_C.moe_sum(stacked, out)

    torch.testing.assert_close(out, reference, atol=2e-2, rtol=0)

    opcheck(torch.ops._moe_C.moe_sum, (stacked, out))
...@@ -13,7 +13,7 @@ import torch ...@@ -13,7 +13,7 @@ import torch
from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
from vllm.model_executor.layers.fused_moe.layer import determine_expert_map from vllm.model_executor.layers.fused_moe.layer import determine_expert_map
from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
moe_permute, moe_unpermute) moe_permute, moe_permute_unpermute_supported, moe_unpermute)
from vllm.platforms import current_platform from vllm.platforms import current_platform
NUM_EXPERTS = [16, 64] NUM_EXPERTS = [16, 64]
...@@ -167,6 +167,8 @@ def torch_unpermute(permuted_hidden_states: torch.Tensor, ...@@ -167,6 +167,8 @@ def torch_unpermute(permuted_hidden_states: torch.Tensor,
def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int, def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int,
n_expert: int, ep_size: int, dtype: torch.dtype, n_expert: int, ep_size: int, dtype: torch.dtype,
align_block_size: Optional[int]): align_block_size: Optional[int]):
if not moe_permute_unpermute_supported():
pytest.skip("moe_permute_unpermute is not supported on this platform.")
fill_invalid_expert = 0 fill_invalid_expert = 0
ep_rank = np.random.randint(0, ep_size) ep_rank = np.random.randint(0, ep_size)
expert_map = None expert_map = None
......
...@@ -36,16 +36,16 @@ vllm_config.scheduler_config.max_model_len = 8192 ...@@ -36,16 +36,16 @@ vllm_config.scheduler_config.max_model_len = 8192
# Test configurations # Test configurations
DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32] DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32]
NUM_TOKENS = [7, 83, 2048] NUM_TOKENS = [7, 2050]
D = [512, 4096, 5120, 13824] D = [512, 4096, 5120, 13824]
GROUP_SIZE = [64, 128, 256, 512] GROUP_SIZE = [64, 128, 512]
M = [1, 7, 8, 83, 84, 512, 2048, 4096] M = [1, 7, 8, 83, 84, 4096]
N = [128, 512, 1024, 4096, 7168, 7748, 13824] N = [128, 512, 7168, 7748, 13824]
K = [256, 4096, 5120, 3884, 13824, 16384] K = [256, 3884, 4096, 13824, 16384]
# Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8 # Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8
# and its hidden size is 7168. # and its hidden size is 7168.
M_moe = [1, 2, 7, 83, 128, 512, 2048] M_moe = [1, 2, 7, 83, 128, 2048]
M_moe_dg = [128, 192, 512, 1335, 2048] M_moe_dg = [128, 192, 1335, 2048]
N_moe = [128, 256, 1024, 4608] # [13824] N_moe = [128, 256, 1024, 4608] # [13824]
K_moe = [256, 512, 7168] # [13824] K_moe = [256, 512, 7168] # [13824]
BLOCK_SIZE = [[128, 128]] BLOCK_SIZE = [[128, 128]]
......
...@@ -8,7 +8,6 @@ from gguf import GGMLQuantizationType, GGUFReader, ReaderTensor, dequantize ...@@ -8,7 +8,6 @@ from gguf import GGMLQuantizationType, GGUFReader, ReaderTensor, dequantize
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
import vllm._custom_ops as ops import vllm._custom_ops as ops
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe import fused_experts
from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf
from vllm.platforms import current_platform from vllm.platforms import current_platform
...@@ -35,11 +34,11 @@ def get_gguf_MoE_tensors( ...@@ -35,11 +34,11 @@ def get_gguf_MoE_tensors(
return GGUFReader(sample_file).tensors return GGUFReader(sample_file).tensors
DTYPES = [torch.half, torch.bfloat16, torch.float32] DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32]
# Hidden_size for testing, must match the sample file in HF repo, # Hidden_size for testing, must match the sample file in HF repo,
# we have `hidden_size = 256, 1024` for test in HF repo currently. # we have `hidden_size = 256, 1024` for test in HF repo currently.
HIDDEN_SIZES = [256, 1024] HIDDEN_SIZES = [256, 1024]
NUM_TOKENS = [7, 83, 128, 2048] # Arbitrary values for testing NUM_TOKENS = [7, 2050] # Arbitrary values for testing
SEEDS = [0] SEEDS = [0]
QUANT_TYPES = [ QUANT_TYPES = [
# i-matrix # i-matrix
...@@ -176,12 +175,11 @@ def test_moe(num_tokens: int, hidden_size: int, dtype: torch.dtype, ...@@ -176,12 +175,11 @@ def test_moe(num_tokens: int, hidden_size: int, dtype: torch.dtype,
w2_dequant = torch.tensor(dequantize(w2.data, quant_type), w2_dequant = torch.tensor(dequantize(w2.data, quant_type),
device="cuda").to(dtype) device="cuda").to(dtype)
act = SiluAndMul()
output = _fused_moe_gguf(x, torch.tensor(w13.data, device="cuda"), output = _fused_moe_gguf(x, torch.tensor(w13.data, device="cuda"),
torch.tensor(w2.data, torch.tensor(w2.data,
device="cuda"), topk_weights, device="cuda"), topk_weights,
topk_ids, quant_type, quant_type, act) topk_ids, quant_type, quant_type, "silu")
ref_output = fused_experts(x, w13_dequant, w2_dequant, topk_weights, ref_output = fused_experts(x, w13_dequant, w2_dequant, topk_weights,
topk_ids).reshape(output.shape) topk_ids).reshape(output.shape)
......
...@@ -13,8 +13,13 @@ from vllm.platforms import current_platform ...@@ -13,8 +13,13 @@ from vllm.platforms import current_platform
device = "cuda" device = "cuda"
# Load the Triton scaled-mm module by its dotted path and bind its
# `triton_scaled_mm` function to a plain name for the tests in this file.
triton_scaled_mm_module = importlib.import_module(
"vllm.model_executor.layers.quantization.compressed_tensors."
"triton_scaled_mm")
triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
def scaled_mm_torch(a: torch.Tensor,
def torch_scaled_mm(a: torch.Tensor,
b: torch.Tensor, b: torch.Tensor,
scale_a: torch.Tensor, scale_a: torch.Tensor,
scale_b: torch.Tensor, scale_b: torch.Tensor,
...@@ -101,21 +106,8 @@ def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a, ...@@ -101,21 +106,8 @@ def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a,
if use_bias: if use_bias:
bias = torch.rand((N, ), device=device, dtype=out_dtype) bias = torch.rand((N, ), device=device, dtype=out_dtype)
triton_scaled_mm_module = importlib.import_module(
"vllm.model_executor.layers.quantization.compressed_tensors."
"triton_scaled_mm")
triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
c_check = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias) c_check = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
a_cpu = a.cpu() c_actual = torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
b_cpu = b.cpu()
scale_a_cpu = scale_a.cpu()
scale_b_cpu = scale_b.cpu()
bias_cpu = None if bias is None else bias.cpu()
c_actual = scaled_mm_torch(a_cpu, b_cpu, scale_a_cpu, scale_b_cpu,
out_dtype, bias_cpu)
c_check_cpu = c_check.cpu() torch.testing.assert_close(c_check, c_actual, rtol=1e-1, atol=1e-1)
torch.testing.assert_close(c_check_cpu, c_actual, rtol=1e-1, atol=1e-1)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import subprocess
import sys
from typing import Union
import pytest import pytest
import ray import ray
import vllm import vllm
from vllm import LLM
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
from ..utils import create_new_process_for_each_test, multi_gpu_test from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test
MODEL_PATH = "meta-llama/Llama-2-7b-hf" MODEL_PATH = "meta-llama/Llama-2-7b-hf"
...@@ -36,7 +41,10 @@ def v1(run_with_both_engines_lora): ...@@ -36,7 +41,10 @@ def v1(run_with_both_engines_lora):
pass pass
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: def do_sample(llm: vllm.LLM,
lora_path: str,
lora_id: int,
tensorizer_config_dict: Union[dict, None] = None) -> list[str]:
prompts = [ prompts = [
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501
...@@ -45,10 +53,23 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: ...@@ -45,10 +53,23 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]", # noqa: E501
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501
] ]
sampling_params = vllm.SamplingParams(temperature=0, sampling_params = vllm.SamplingParams(temperature=0,
max_tokens=256, max_tokens=256,
skip_special_tokens=False, skip_special_tokens=False,
stop=["[/assistant]"]) stop=["[/assistant]"])
if tensorizer_config_dict is not None:
outputs = llm.generate(
prompts,
sampling_params,
lora_request=LoRARequest(
str(lora_id),
lora_id,
lora_path,
tensorizer_config_dict=tensorizer_config_dict)
if lora_id else None)
else:
outputs = llm.generate( outputs = llm.generate(
prompts, prompts,
sampling_params, sampling_params,
...@@ -64,18 +85,32 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: ...@@ -64,18 +85,32 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return generated_texts return generated_texts
def generate_and_test(llm, sql_lora_files): def generate_and_test(llm,
sql_lora_files,
tensorizer_config_dict: Union[dict, None] = None):
print("lora adapter created") print("lora adapter created")
assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT assert do_sample(llm,
sql_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=0) == EXPECTED_NO_LORA_OUTPUT
print("lora 1") print("lora 1")
assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT assert do_sample(llm,
sql_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=1) == EXPECTED_LORA_OUTPUT
print("no lora") print("no lora")
assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT assert do_sample(llm,
sql_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=0) == EXPECTED_NO_LORA_OUTPUT
print("lora 2") print("lora 2")
assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT assert do_sample(llm,
sql_lora_files,
tensorizer_config_dict=tensorizer_config_dict,
lora_id=2) == EXPECTED_LORA_OUTPUT
print("removing lora") print("removing lora")
...@@ -153,3 +188,64 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files): ...@@ -153,3 +188,64 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
enable_chunked_prefill=True, enable_chunked_prefill=True,
) )
generate_and_test(llm, sql_lora_files) generate_and_test(llm, sql_lora_files)
@multi_gpu_test(num_gpus=2)
@create_new_process_for_each_test()
def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
                                            sql_lora_huggingface_id):
    """Serialize model and LoRA with tensorizer, reload them, then sample.

    The example tensorizer script is run in a subprocess to write the
    serialized artifacts under ``tmp_path``. The model is then loaded with
    ``load_format="tensorizer"`` and sampled without a LoRA (``lora_id=0``)
    and with one (``lora_id=1``), and both outputs are compared against the
    expected texts.

    Args:
        tmp_path: Directory the serialized artifacts are written to.
        sql_lora_files: LoRA path handed to ``do_sample``.
        sql_lora_huggingface_id: LoRA identifier passed to the script's
            ``--lora-path`` option.

    Raises:
        subprocess.CalledProcessError: If the serialization script exits
            with a non-zero status (its stdout/stderr are printed first).
    """
    # One source of truth for the tensor-parallel degree and model reference,
    # so the serialize step and the load step cannot drift apart. The
    # `num_gpus` in the decorator above must be kept equal to `tp_size`.
    tp_size = 2
    model_name = "model-rank-%03d.tensors"
    model_ref = MODEL_PATH
    lora_path = sql_lora_huggingface_id
    suffix = "test"

    # Run the tensorizing of the LoRA adapter and the model in a subprocess
    # to guarantee cleanup
    serialize_cmd = [
        sys.executable,
        f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py",
        "--model", model_ref,
        "--lora-path", lora_path,
        "--tensor-parallel-size", str(tp_size),
        "serialize",
        "--serialized-directory", str(tmp_path),
        "--suffix", suffix,
    ]
    try:
        result = subprocess.run(serialize_cmd,
                                check=True,
                                capture_output=True,
                                text=True)
    except subprocess.CalledProcessError as e:
        # Surface the script's output before re-raising.
        print("Tensorizing failed.")
        print("STDOUT:\n", e.stdout)
        print("STDERR:\n", e.stderr)
        raise

    print("STDOUT:\n", result.stdout)

    model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
    tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
    # Look for the LoRA in the same directory as the tensorized model
    # (presumably where the script writes the adapter -- TODO confirm).
    tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir

    loaded_vllm_model = LLM(model=model_ref,
                            load_format="tensorizer",
                            enable_lora=True,
                            enforce_eager=True,
                            model_loader_extra_config=tensorizer_config,
                            max_num_seqs=13,
                            tensor_parallel_size=tp_size,
                            max_loras=2)

    tensorizer_config_dict = tensorizer_config.to_dict()

    print("lora adapter created")
    assert do_sample(loaded_vllm_model,
                     sql_lora_files,
                     tensorizer_config_dict=tensorizer_config_dict,
                     lora_id=0) == EXPECTED_NO_LORA_OUTPUT

    print("lora 1")
    assert do_sample(loaded_vllm_model,
                     sql_lora_files,
                     tensorizer_config_dict=tensorizer_config_dict,
                     lora_id=1) == EXPECTED_LORA_OUTPUT
...@@ -69,7 +69,7 @@ def test_lora_functions_sync(): ...@@ -69,7 +69,7 @@ def test_lora_functions_sync():
run_check(llm.add_lora, make_lora_request(12), [12, 9, 10, 11]) run_check(llm.add_lora, make_lora_request(12), [12, 9, 10, 11])
run_check(llm.add_lora, make_lora_request(13), [12, 13, 10, 11]) run_check(llm.add_lora, make_lora_request(13), [12, 13, 10, 11])
# Remove all LoRAs # Remove all LoRAs.
run_check(llm.remove_lora, 13, [12, 10, 11]) run_check(llm.remove_lora, 13, [12, 10, 11])
run_check(llm.remove_lora, 12, [10, 11]) run_check(llm.remove_lora, 12, [10, 11])
run_check(llm.remove_lora, 11, [10]) run_check(llm.remove_lora, 11, [10])
......
...@@ -31,7 +31,7 @@ HYBRID_MODELS = [ ...@@ -31,7 +31,7 @@ HYBRID_MODELS = [
# not compatible with pip-compile. # not compatible with pip-compile.
"pfnet/plamo-2-1b", "pfnet/plamo-2-1b",
"Zyphra/Zamba2-1.2B-instruct", "Zyphra/Zamba2-1.2B-instruct",
"hmellor/bamba-tiny-random", "hmellor/tiny-random-BambaForCausalLM",
] ]
# Avoid OOM # Avoid OOM
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import math
from collections.abc import Sequence from collections.abc import Sequence
import mteb import mteb
...@@ -115,4 +114,4 @@ def mteb_test_embed_models(hf_runner, ...@@ -115,4 +114,4 @@ def mteb_test_embed_models(hf_runner,
print("SentenceTransformer:", model_dtype, st_main_score) print("SentenceTransformer:", model_dtype, st_main_score)
print("Difference:", st_main_score - vllm_main_score) print("Difference:", st_main_score - vllm_main_score)
assert math.isclose(st_main_score, vllm_main_score, rel_tol=MTEB_EMBED_TOL) assert st_main_score == pytest.approx(vllm_main_score, rel=MTEB_EMBED_TOL)
...@@ -15,13 +15,12 @@ from ...utils import check_embeddings_close ...@@ -15,13 +15,12 @@ from ...utils import check_embeddings_close
marks=[pytest.mark.core_model, pytest.mark.cpu_model]), marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
pytest.param("sentence-transformers/all-MiniLM-L12-v2"), pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
pytest.param("intfloat/multilingual-e5-small"), pytest.param("intfloat/multilingual-e5-small"),
pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"), pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
# [Decoder-only] # [Decoder-only]
pytest.param("BAAI/bge-multilingual-gemma2", pytest.param("BAAI/bge-multilingual-gemma2",
marks=[pytest.mark.core_model]), marks=[pytest.mark.core_model]),
pytest.param("intfloat/e5-mistral-7b-instruct", pytest.param("intfloat/e5-mistral-7b-instruct",
marks=[pytest.mark.core_model, pytest.mark.cpu_model]), marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"),
# [Cross-Encoder] # [Cross-Encoder]
pytest.param("sentence-transformers/stsb-roberta-base-v2"), pytest.param("sentence-transformers/stsb-roberta-base-v2"),
...@@ -47,9 +46,6 @@ def test_models( ...@@ -47,9 +46,6 @@ def test_models(
vllm_extra_kwargs["override_pooler_config"] = \ vllm_extra_kwargs["override_pooler_config"] = \
PoolerConfig(pooling_type="MEAN") PoolerConfig(pooling_type="MEAN")
if model == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
vllm_extra_kwargs["hf_overrides"] = {"is_causal": True}
# The example_prompts has ending "\n", for example: # The example_prompts has ending "\n", for example:
# "Write a short story about a robot that dreams for the first time.\n" # "Write a short story about a robot that dreams for the first time.\n"
# sentence_transformers will strip the input texts, see: # sentence_transformers will strip the input texts, see:
......
...@@ -2,7 +2,6 @@ ...@@ -2,7 +2,6 @@
from __future__ import annotations from __future__ import annotations
import importlib.util import importlib.util
import math
from array import array from array import array
import openai import openai
...@@ -104,16 +103,16 @@ def get_test_data(): ...@@ -104,16 +103,16 @@ def get_test_data():
def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]): def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]):
cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0]) cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0])
assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001) assert cosine_sim_q0_d0 == pytest.approx(0.609, abs=0.001)
cosine_sim_q0_d1 = 1 - cosine(q_rep[0], d_rep[1]) cosine_sim_q0_d1 = 1 - cosine(q_rep[0], d_rep[1])
assert math.isclose(cosine_sim_q0_d1, 0.101, abs_tol=0.001) assert cosine_sim_q0_d1 == pytest.approx(0.101, abs=0.001)
cosine_sim_q1_d0 = 1 - cosine(q_rep[1], d_rep[0]) cosine_sim_q1_d0 = 1 - cosine(q_rep[1], d_rep[0])
assert math.isclose(cosine_sim_q1_d0, 0.120, abs_tol=0.001) assert cosine_sim_q1_d0 == pytest.approx(0.120, abs=0.001)
cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1]) cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1])
assert math.isclose(cosine_sim_q1_d1, 0.534, abs_tol=0.001) assert cosine_sim_q1_d1 == pytest.approx(0.534, abs=0.001)
def test_gritlm_offline_embedding(vllm_runner): def test_gritlm_offline_embedding(vllm_runner):
......
...@@ -45,9 +45,6 @@ MODELS = [ ...@@ -45,9 +45,6 @@ MODELS = [
EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct", EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
architecture="Qwen2ForCausalLM", architecture="Qwen2ForCausalLM",
enable_test=True), enable_test=True),
EmbedModelInfo("Alibaba-NLP/gte-Qwen2-7B-instruct",
architecture="Qwen2ForCausalLM",
enable_test=False),
########## ModernBertModel ########## ModernBertModel
EmbedModelInfo("Alibaba-NLP/gte-modernbert-base", EmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
architecture="ModernBertModel", architecture="ModernBertModel",
...@@ -58,14 +55,9 @@ MODELS = [ ...@@ -58,14 +55,9 @@ MODELS = [
@pytest.mark.parametrize("model_info", MODELS) @pytest.mark.parametrize("model_info", MODELS)
def test_models_mteb(hf_runner, vllm_runner, def test_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None: model_info: EmbedModelInfo) -> None:
pytest.skip("Skipping mteb test.")
from .mteb_utils import mteb_test_embed_models from .mteb_utils import mteb_test_embed_models
vllm_extra_kwargs: dict[str, Any] = {} vllm_extra_kwargs: dict[str, Any] = {}
if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
vllm_extra_kwargs["hf_overrides"] = {"is_causal": True}
if model_info.architecture == "GteNewModel": if model_info.architecture == "GteNewModel":
vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]} vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
...@@ -83,9 +75,6 @@ def test_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo, ...@@ -83,9 +75,6 @@ def test_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo,
example_prompts = [str(s).strip() for s in example_prompts] example_prompts = [str(s).strip() for s in example_prompts]
vllm_extra_kwargs: dict[str, Any] = {} vllm_extra_kwargs: dict[str, Any] = {}
if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
vllm_extra_kwargs["hf_overrides"] = {"is_causal": True}
if model_info.architecture == "GteNewModel": if model_info.architecture == "GteNewModel":
vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]} vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import math
import pytest import pytest
from vllm import PoolingParams from vllm import PoolingParams
...@@ -60,7 +58,7 @@ def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str): ...@@ -60,7 +58,7 @@ def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str):
assert len(vllm_outputs) == 1 assert len(vllm_outputs) == 1
assert len(hf_outputs) == 1 assert len(hf_outputs) == 1
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
...@@ -78,8 +76,8 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str): ...@@ -78,8 +76,8 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str):
assert len(vllm_outputs) == 10 assert len(vllm_outputs) == 10
assert len(hf_outputs) == 10 assert len(hf_outputs) == 10
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
@pytest.fixture(scope="module", params=EMBEDDING_MODELS) @pytest.fixture(scope="module", params=EMBEDDING_MODELS)
......
...@@ -23,7 +23,6 @@ MODELS = [ ...@@ -23,7 +23,6 @@ MODELS = [
@pytest.mark.parametrize("model_info", MODELS) @pytest.mark.parametrize("model_info", MODELS)
def test_models_mteb(hf_runner, vllm_runner, def test_models_mteb(hf_runner, vllm_runner,
model_info: EmbedModelInfo) -> None: model_info: EmbedModelInfo) -> None:
pytest.skip("Skipping mteb test.")
from .mteb_utils import mteb_test_embed_models from .mteb_utils import mteb_test_embed_models
mteb_test_embed_models(hf_runner, vllm_runner, model_info) mteb_test_embed_models(hf_runner, vllm_runner, model_info)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import math
import pytest import pytest
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
...@@ -45,7 +43,7 @@ def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name): ...@@ -45,7 +43,7 @@ def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name):
assert len(vllm_outputs) == 1 assert len(vllm_outputs) == 1
assert len(hf_outputs) == 1 assert len(hf_outputs) == 1
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name): def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
...@@ -64,8 +62,8 @@ def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name): ...@@ -64,8 +62,8 @@ def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
assert len(vllm_outputs) == 2 assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2 assert len(hf_outputs) == 2
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name): def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
...@@ -84,8 +82,8 @@ def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name): ...@@ -84,8 +82,8 @@ def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
assert len(vllm_outputs) == 2 assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2 assert len(hf_outputs) == 2
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
@pytest.fixture(scope="module", params=EMBEDDING_MODELS) @pytest.fixture(scope="module", params=EMBEDDING_MODELS)
...@@ -112,7 +110,7 @@ def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name): ...@@ -112,7 +110,7 @@ def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name):
assert len(vllm_outputs) == 1 assert len(vllm_outputs) == 1
assert len(hf_outputs) == 1 assert len(hf_outputs) == 1
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name): def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
...@@ -140,8 +138,8 @@ def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name): ...@@ -140,8 +138,8 @@ def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
assert len(vllm_outputs) == 2 assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2 assert len(hf_outputs) == 2
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name): def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
...@@ -169,5 +167,5 @@ def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name): ...@@ -169,5 +167,5 @@ def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
assert len(vllm_outputs) == 2 assert len(vllm_outputs) == 2
assert len(hf_outputs) == 2 assert len(hf_outputs) == 2
assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01) assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01) assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
...@@ -46,7 +46,6 @@ def test_models_mteb( ...@@ -46,7 +46,6 @@ def test_models_mteb(
vllm_runner, vllm_runner,
model_info: EmbedModelInfo, model_info: EmbedModelInfo,
) -> None: ) -> None:
pytest.skip("Skipping mteb test.")
from .mteb_utils import mteb_test_embed_models from .mteb_utils import mteb_test_embed_models
mteb_test_embed_models(hf_runner, vllm_runner, model_info) mteb_test_embed_models(hf_runner, vllm_runner, model_info)
......
...@@ -349,6 +349,17 @@ VLM_TEST_SETTINGS = { ...@@ -349,6 +349,17 @@ VLM_TEST_SETTINGS = {
use_tokenizer_eos=True, use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner, patch_hf_runner=model_utils.internvl_patch_hf_runner,
), ),
"intern_vl-video": VLMTestInfo(
models=[
"OpenGVLab/InternVL3-1B",
],
test_type=VLMTestType.VIDEO,
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
video_idx_to_prompt=lambda idx: "<video>",
max_model_len=8192,
use_tokenizer_eos=True,
patch_hf_runner=model_utils.internvl_patch_hf_runner,
),
"kimi_vl": VLMTestInfo( "kimi_vl": VLMTestInfo(
models=["moonshotai/Kimi-VL-A3B-Instruct"], models=["moonshotai/Kimi-VL-A3B-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
......
...@@ -4,6 +4,7 @@ import pytest ...@@ -4,6 +4,7 @@ import pytest
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset from vllm.assets.video import VideoAsset
from vllm.multimodal.image import convert_image_mode
models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"] models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]
...@@ -26,8 +27,9 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None: ...@@ -26,8 +27,9 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
give the same result. give the same result.
""" """
image_cherry = ImageAsset("cherry_blossom").pil_image.convert("RGB") image_cherry = convert_image_mode(
image_stop = ImageAsset("stop_sign").pil_image.convert("RGB") ImageAsset("cherry_blossom").pil_image, "RGB")
image_stop = convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB")
images = [image_cherry, image_stop] images = [image_cherry, image_stop]
video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment