Commit ec5e299c authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.7.3' into v0.7.3-dev

parents 47bd229c ed6e9075
# SPDX-License-Identifier: Apache-2.0
from itertools import accumulate, product
from typing import Dict, List, Optional
from typing import Callable, Dict, List, Optional
import pytest
import torch
......@@ -24,7 +24,21 @@ CUDA_DEVICES = [
]
def _get_flat_tensor_shape(batch_size: int, seq_len: int, num_heads: int,
head_size: int) -> tuple[int, ...]:
return (batch_size, seq_len, num_heads * head_size)
def _get_batch_tensor_shape(batch_size: int, seq_len: int, num_heads: int,
head_size: int) -> tuple[int, ...]:
return (batch_size, seq_len, num_heads, head_size)
TENSORS_SHAPES_FN = [_get_batch_tensor_shape, _get_flat_tensor_shape]
@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
@pytest.mark.parametrize("tensor_shape_fn", TENSORS_SHAPES_FN)
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("seq_len", SEQ_LENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
......@@ -36,6 +50,7 @@ CUDA_DEVICES = [
@torch.inference_mode()
def test_rotary_embedding(
is_neox_style: bool,
tensor_shape_fn: Callable[[int, int, int, int], tuple[int]],
batch_size: int,
seq_len: int,
num_heads: int,
......@@ -55,13 +70,11 @@ def test_rotary_embedding(
if rotary_dim is None:
rotary_dim = head_size
rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
rope = rope.to(dtype=dtype)
rope = rope.to(dtype=dtype, device=torch.get_default_device())
positions = torch.randint(0, max_position, (batch_size, seq_len))
query = torch.randn(batch_size,
seq_len,
num_heads * head_size,
dtype=dtype)
query_shape = tensor_shape_fn(batch_size, seq_len, num_heads, head_size)
query = torch.randn(query_shape, dtype=dtype)
key = torch.randn_like(query)
# NOTE(woosuk): The reference implementation should be executed first
......@@ -80,6 +93,7 @@ def test_rotary_embedding(
@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
@pytest.mark.parametrize("tensor_shape_fn", TENSORS_SHAPES_FN)
@pytest.mark.parametrize("batch_size", BATCH_SIZES)
@pytest.mark.parametrize("seq_len", SEQ_LENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
......@@ -91,6 +105,7 @@ def test_rotary_embedding(
@torch.inference_mode()
def test_batched_rotary_embedding(
is_neox_style: bool,
tensor_shape_fn: Callable[[int, int, int, int], tuple[int]],
batch_size: int,
seq_len: int,
num_heads: int,
......@@ -110,13 +125,11 @@ def test_batched_rotary_embedding(
"rope_type": "linear",
"factor": (1, )
})
rope = rope.to(dtype=dtype)
rope = rope.to(dtype=dtype, device=torch.get_default_device())
positions = torch.randint(0, max_position, (batch_size, seq_len))
query = torch.randn(batch_size,
seq_len,
num_heads * head_size,
dtype=dtype)
query_shape = tensor_shape_fn(batch_size, seq_len, num_heads, head_size)
query = torch.randn(query_shape, dtype=dtype)
key = torch.randn_like(query)
# NOTE(woosuk): The reference implementation should be executed first
......@@ -171,7 +184,7 @@ def test_batched_rotary_embedding_multi_lora(
"rope_type": "linear",
"factor": tuple(scaling_factors)
})
rope = rope.to(dtype=dtype)
rope = rope.to(dtype=dtype, device=torch.get_default_device())
positions = torch.randint(0, max_position, (batch_size, seq_len))
query = torch.randn(batch_size,
......
......@@ -41,7 +41,7 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position,
is_neox_style, rotary_dim, head_size,
seq_len):
batch_size = 1
base = 0
base = 10000
num_heads = 7
rot = RotaryEmbedding(head_size, rotary_dim, max_position, base,
is_neox_style, torch.float32)
......
......@@ -273,6 +273,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv(
seq_lens: List[Tuple[int, int]], num_heads: Tuple[int, int],
head_size: int, dtype: torch.dtype, block_size: int,
soft_cap: Optional[float]) -> None:
pytest.skip("TODO: fix the accuracy issue")
torch.set_default_device("cuda")
current_platform.seed_everything(0)
num_seqs = len(seq_lens)
......@@ -384,6 +385,7 @@ def test_flashinfer_decode_with_paged_fp8_kv(
block_size: int,
soft_cap: Optional[float],
) -> None:
pytest.skip("TODO: fix the accuracy issue")
# test doesn't work for num_heads = (16,16)
torch.set_default_device("cuda")
current_platform.seed_everything(0)
......
......@@ -30,7 +30,7 @@ def setup_servers():
"-m",
"vllm.entrypoints.openai.api_server",
"--model",
os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3.1-8B-Instruct"),
os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
"--port",
"8100",
"--gpu-memory-utilization",
......@@ -51,7 +51,7 @@ def setup_servers():
"-m",
"vllm.entrypoints.openai.api_server",
"--model",
os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3.1-8B-Instruct"),
os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
"--port",
"8200",
"--gpu-memory-utilization",
......@@ -102,8 +102,7 @@ def test_disaggregated_prefilling(prompt):
response = requests.post("http://localhost:8100/v1/completions",
headers={"Content-Type": "application/json"},
json={
"model":
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.2-1B-Instruct",
"prompt": prompt,
"max_tokens": 1,
"temperature": 0
......@@ -114,8 +113,7 @@ def test_disaggregated_prefilling(prompt):
response = requests.post("http://localhost:8200/v1/completions",
headers={"Content-Type": "application/json"},
json={
"model":
"meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.2-1B-Instruct",
"prompt": prompt,
"max_tokens": 10,
"temperature": 0
......
......@@ -245,6 +245,11 @@ def qwen2vl_lora_files():
return snapshot_download(repo_id="jeeejeee/qwen2-vl-lora-pokemon")
@pytest.fixture(scope="session")
def qwen25vl_lora_files():
return snapshot_download(repo_id="jeeejeee/qwen25-vl-lora-pokemon")
@pytest.fixture(scope="session")
def tinyllama_lora_files():
# return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
......@@ -324,3 +329,20 @@ def llama_2_7b_engine_extra_embeddings():
def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
yield (llama_2_7b_engine_extra_embeddings.model_executor.driver_worker.
model_runner.model)
@pytest.fixture(params=[True, False])
def run_with_both_engines_lora(request, monkeypatch):
# Automatically runs tests twice, once with V1 and once without
use_v1 = request.param
# Tests decorated with `@skip_v1` are only run without v1
skip_v1 = request.node.get_closest_marker("skip_v1")
if use_v1:
if skip_v1:
pytest.skip("Skipping test on vllm V1")
monkeypatch.setenv('VLLM_USE_V1', '1')
else:
monkeypatch.setenv('VLLM_USE_V1', '0')
yield
# SPDX-License-Identifier: Apache-2.0
import asyncio
import time
from pathlib import Path
from typing import List
import pytest
from huggingface_hub import snapshot_download
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.inputs import TextPrompt
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams
from vllm.utils import merge_async_iterators
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
LORA_MODULE_DOWNLOAD_PATH = None # Populated by download_and_prepare_lora_module() #noqa
LORA_RANK = 8
DEFAULT_MAX_LORAS = 16 * 3
def download_and_prepare_lora_module():
"""
Request submission is expensive when the LoRA adapters have their own
tokenizers. This is because, for each request with a new LoRA adapter ID,
the front-end loads the tokenizer from disk.
In this test, as we are comparing request processing times, we want to
minimize any extra activity. To this effect, we download the LoRA
adapter and remove all the tokenizer files, so the engine will default
to the base model tokenizer.
"""
global LORA_MODULE_DOWNLOAD_PATH
LORA_MODULE_HF_PATH = "yard1/llama-2-7b-sql-lora-test"
LORA_MODULE_DOWNLOAD_PATH = snapshot_download(repo_id=LORA_MODULE_HF_PATH)
tokenizer_files = [
'added_tokens.json', 'tokenizer_config.json', 'tokenizer.json',
'tokenizer.model'
]
for tokenizer_file in tokenizer_files:
del_path = Path(LORA_MODULE_DOWNLOAD_PATH) / tokenizer_file
del_path.unlink(missing_ok=True)
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def get_lora_requests() -> List[LoRARequest]:
lora_requests: List[LoRARequest] = [
LoRARequest(lora_name=f"{i}",
lora_int_id=i,
lora_path=LORA_MODULE_DOWNLOAD_PATH)
for i in range(1, DEFAULT_MAX_LORAS + 1)
]
return lora_requests
async def requests_processing_time(llm,
lora_requests: List[LoRARequest]) -> float:
sampling_params = SamplingParams(n=1,
temperature=0.0,
top_p=1.0,
ignore_eos=True,
max_tokens=1)
generators = []
start = time.perf_counter()
for lora_request in lora_requests:
lora_int_id = lora_request.lora_int_id
generator = llm.generate(
prompt=TextPrompt(prompt=f"hello {lora_int_id}",
multi_modal_data=None), # type: ignore
sampling_params=sampling_params,
lora_request=lora_request,
request_id=f"test{lora_int_id}")
generators.append(generator)
all_gens = merge_async_iterators(*generators)
async for i, res in all_gens:
pass
end = time.perf_counter()
return end - start
@pytest.mark.asyncio
async def test_add_lora():
"""
The add_lora function is used to pre-load some LoRA adapters into the
engine in anticipation of future requests using these adapters. To test
this functionality, we use the async engine to process some requests - We
do it twice, once with add_lora() pre-loading and once without.
We measure the request processing time in both cases and expect the time
to be lesser in the case with add_lora() calls.
"""
download_and_prepare_lora_module()
lora_requests: List[LoRARequest] = get_lora_requests()
max_loras = len(set([lr.lora_int_id for lr in lora_requests]))
# Create engine in eager-mode. Due to high max_loras, the CI can
# OOM during cuda-graph capture.
engine_args = AsyncEngineArgs(
model=MODEL_PATH,
enable_lora=True,
max_loras=max_loras,
max_lora_rank=LORA_RANK,
max_model_len=128,
gpu_memory_utilization=0.8, #avoid OOM
enforce_eager=True)
# The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
# environment variable. reload vllm.enging.async_llm_engine as
# vllm.engine.async_llm_engine.AsyncLLMEgnine changes depending on the
# env var.
import importlib
import vllm.engine.async_llm_engine
importlib.reload(vllm.engine.async_llm_engine)
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
# split lora_requests into 3 parts
part_size = len(lora_requests) // 3
dummy_run_requests = lora_requests[:part_size]
warmup_run_requests = lora_requests[part_size:part_size * 2]
cold_run_requests = lora_requests[part_size * 2:]
async with build_async_engine_client_from_engine_args(engine_args) as llm:
# Dummy run - So any 1-time functionality like triton kernel compilation
# is complete here.
await requests_processing_time(llm, dummy_run_requests)
# Run with warmup
for lr in warmup_run_requests:
await llm.add_lora(lr)
# Wait for the add_lora function to complete on the server side.
await asyncio.sleep(30)
time_with_add_lora = await requests_processing_time(
llm, warmup_run_requests)
# Run without any warmup
time_cold_start = await requests_processing_time(
llm, cold_run_requests)
print(f"time hot-start {time_with_add_lora} vs "
f"time cold-start {time_cold_start} ")
assert time_with_add_lora < time_cold_start, (
f"time_with_add_lora={time_with_add_lora}, "
f"time_cold_start={time_cold_start}"
"The engine request processing time with LoRA pre-loading "
"must be less than the version that does on-demand LoRA loading.")
......@@ -45,6 +45,14 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
return generated_texts
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def test_baichuan_lora(baichuan_lora_files):
llm = vllm.LLM(MODEL_PATH,
max_model_len=1024,
......
......@@ -3,6 +3,8 @@
from typing import List
import os
import pytest
import vllm
from tests.utils import fork_new_process_for_each_test
from vllm.lora.request import LoRARequest
......@@ -49,6 +51,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
return generated_texts
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.mark.skip_v1
@fork_new_process_for_each_test
def test_chatglm3_lora(chatglm3_lora_files):
llm = vllm.LLM(MODEL_PATH,
......@@ -68,6 +79,7 @@ def test_chatglm3_lora(chatglm3_lora_files):
assert output2[i] == EXPECTED_LORA_OUTPUT[i]
@pytest.mark.skip_v1
@multi_gpu_test(num_gpus=4)
@fork_new_process_for_each_test
def test_chatglm3_lora_tp4(chatglm3_lora_files):
......@@ -89,6 +101,7 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
assert output2[i] == EXPECTED_LORA_OUTPUT[i]
@pytest.mark.skip_v1
@multi_gpu_test(num_gpus=4)
@fork_new_process_for_each_test
def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
......
......@@ -36,6 +36,14 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
return generated_texts
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.mark.xfail(current_platform.is_rocm(),
reason="There can be output mismatch on ROCm")
def test_gemma_lora(gemma_lora_files):
......
......@@ -2,6 +2,7 @@
from typing import List
import pytest
import ray
import vllm
......@@ -74,6 +75,14 @@ def generate_and_test(llm, sql_lora_files):
print("removing lora")
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@fork_new_process_for_each_test
def test_llama_lora(sql_lora_files):
......@@ -86,6 +95,9 @@ def test_llama_lora(sql_lora_files):
generate_and_test(llm, sql_lora_files)
# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks
# used by the engine yet.
@pytest.mark.skip_v1
@fork_new_process_for_each_test
def test_llama_lora_warmup(sql_lora_files):
"""Test that the LLM initialization works with a warmup LORA path and
......
......@@ -32,6 +32,17 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
return generated_texts
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
# Skipping for V1 for now as we are hitting,
# "Head size 80 is not supported by FlashAttention." error.
@pytest.mark.skip_v1
@pytest.mark.parametrize("lora_bias", [True])
@pytest.mark.parametrize("fully_sharded", [True, False])
def test_lora_bias(lora_bias_files: str, lora_bias: bool, fully_sharded: bool):
......
......@@ -606,20 +606,26 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device):
assert isinstance(model.get_submodule("gate_up_proj"),
MergedColumnParallelLinearWithLoRA)
# Verify packed lora is correct
model_lora_clone = model_lora.clone(1)
model_lora_clone1 = model_lora1.clone(1)
assert manager.add_adapter(model_lora)
assert manager.add_adapter(model_lora1)
assert model_lora.get_lora("gate_proj") is None
assert model_lora.get_lora("up_proj") is None
assert model_lora1.get_lora("up_proj") is None
packed_lora = model_lora.get_lora("gate_up_proj")
assert packed_lora and isinstance(packed_lora, PackedLoRALayerWeights)
torch.testing.assert_close(packed_lora.lora_a[0],
model_lora.get_lora("gate_proj").lora_a)
model_lora_clone.get_lora("gate_proj").lora_a)
torch.testing.assert_close(packed_lora.lora_b[0],
model_lora.get_lora("gate_proj").lora_b)
model_lora_clone.get_lora("gate_proj").lora_b)
torch.testing.assert_close(packed_lora.lora_a[1],
model_lora.get_lora("up_proj").lora_a)
model_lora_clone.get_lora("up_proj").lora_a)
torch.testing.assert_close(packed_lora.lora_b[1],
model_lora.get_lora("up_proj").lora_b)
model_lora_clone.get_lora("up_proj").lora_b)
packed_lora1 = model_lora1.get_lora("gate_up_proj")
assert packed_lora1 and isinstance(packed_lora1, PackedLoRALayerWeights)
......@@ -627,6 +633,6 @@ def test_packed_loras(dist_init, dummy_model_gate_up, device):
assert packed_lora1.lora_a[0] is None
assert packed_lora1.lora_b[0] is None
torch.testing.assert_close(packed_lora1.lora_a[1],
model_lora1.get_lora("up_proj").lora_a)
model_lora_clone1.get_lora("up_proj").lora_a)
torch.testing.assert_close(packed_lora1.lora_b[1],
model_lora1.get_lora("up_proj").lora_b)
model_lora_clone1.get_lora("up_proj").lora_b)
......@@ -3,6 +3,8 @@
from typing import List
import os
import pytest
import vllm
from vllm.lora.request import LoRARequest
from ..utils import models_path_prefix
......@@ -50,6 +52,17 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
return generated_texts
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
# Skipping for V1 for now as we are hitting,
# "Head size 80 is not supported by FlashAttention." error.
@pytest.mark.skip_v1
def test_phi2_lora(phi2_lora_files):
# We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
# Otherwise, the lora-test will fail due to CUDA OOM.
......
# SPDX-License-Identifier: Apache-2.0
from threading import Lock
from typing import List
import pytest
import torch
import vllm.lora.ops.triton_ops # noqa: F401
from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
bgmv_shrink, sgmv_expand,
sgmv_expand_slice, sgmv_shrink)
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.platforms import current_platform
from .utils import (PunicaTensors, assert_close, generate_data,
generate_data_for_expand_nslices,
generate_data_for_nslices)
# Utility shrink and expand operations used as reference implementations.
def sgmv_shrink_for_nslices(
nslices: int, inputs_tensor: torch.Tensor,
lora_weights_lst: List[torch.Tensor], out_tensor: torch.Tensor,
b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor,
prompt_lora_mapping: torch.Tensor, batches: int, max_seq_length: int,
num_tokens: int, scaling: float):
"""
Wrapper around sgmv_shrink that handles any nslices.
"""
for index in range(nslices):
sgmv_shrink(
inputs_tensor,
lora_weights_lst[index],
out_tensor[index],
b_seq_start_loc,
seq_len_tensor,
prompt_lora_mapping,
batches,
max_seq_length,
num_tokens,
scaling,
)
def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
inputs_tensor: torch.Tensor,
lora_weights_lst: List[torch.Tensor],
out_tensor: torch.Tensor,
b_seq_start_loc: torch.Tensor,
seq_len_tensor: torch.Tensor,
prompt_lora_mapping: torch.Tensor, batches: int,
max_seq_length: int, num_tokens: int,
add_inputs: bool) -> None:
"""
Wrapper around sgmv_expand that handles any nslices.
"""
if nslices == 1:
# Verify the torch's sgmv_expand op
sgmv_expand(
inputs_tensor[0],
lora_weights_lst[0],
out_tensor,
b_seq_start_loc,
seq_len_tensor,
prompt_lora_mapping,
batches,
max_seq_length,
num_tokens,
add_inputs=add_inputs,
)
else:
slice_offset = 0
for index in range(nslices):
lora_weights = lora_weights_lst[index]
sgmv_expand_slice(
inputs_tensor[index],
lora_weights,
out_tensor,
b_seq_start_loc,
seq_len_tensor,
prompt_lora_mapping,
batches,
max_seq_length,
num_tokens,
slice_offset,
hidden_size,
add_inputs=add_inputs,
)
slice_offset += hidden_size
_dict_lock = Lock()
def check_sgmv_shrink(batches: int, num_loras: int, rank: int,
hidden_size: int, nslices: int, dtype: torch.dtype,
device: str, seq_length: int, scaling: float):
"""
Compare outputs of vllm.sgmv_shrink kernel against a reference
implementation.
"""
data: PunicaTensors = generate_data_for_nslices(
batches,
hidden_size,
num_loras,
rank,
seq_length,
nslices,
dtype,
"shrink",
device,
)
max_seq_length, token_nums = data.meta()
# Preventing cache error pointer.
with _dict_lock:
_LORA_A_PTR_DICT.clear()
torch.ops.vllm.sgmv_shrink(
data.inputs_tensor,
data.lora_weights,
data.our_out_tensor,
data.b_seq_start_loc,
data.seq_len_tensor,
data.prompt_lora_mapping,
batches,
max_seq_length,
token_nums,
scaling,
)
sgmv_shrink_for_nslices(
nslices,
data.inputs_tensor,
data.lora_weights,
data.ref_out_tensor,
data.b_seq_start_loc,
data.seq_len_tensor,
data.prompt_lora_mapping,
batches,
max_seq_length,
token_nums,
scaling,
)
assert_close(data.our_out_tensor, data.ref_out_tensor)
def check_sgmv_expand(batches: int, num_loras: int, rank: int,
hidden_size: int, nslices: int, dtype: torch.dtype,
device: str, seq_length: int, add_inputs: bool):
"""
Compare outputs of vllm.sgmv_expand kernel against a reference
implementation.
"""
data: PunicaTensors = generate_data_for_nslices(
batches,
hidden_size,
num_loras,
rank,
seq_length,
nslices,
dtype,
"expand",
device,
)
max_seq_length, token_nums = data.meta()
with _dict_lock:
_LORA_B_PTR_DICT.clear()
torch.ops.vllm.sgmv_expand(
data.inputs_tensor,
data.lora_weights,
data.our_out_tensor,
data.b_seq_start_loc,
data.seq_len_tensor,
data.prompt_lora_mapping,
batches,
max_seq_length,
token_nums,
offset_start=0,
add_inputs=add_inputs,
)
sgmv_expand_for_nslices(nslices,
hidden_size,
data.inputs_tensor,
data.lora_weights,
data.ref_out_tensor,
data.b_seq_start_loc,
data.seq_len_tensor,
data.prompt_lora_mapping,
batches,
max_seq_length,
token_nums,
add_inputs=add_inputs)
assert_close(data.our_out_tensor, data.ref_out_tensor)
def check_bgmv_shrink(batches: int, num_loras: int, rank: int,
hidden_size: int, dtype: torch.dtype, device: str,
scaling: float):
"""
Compare vllm.bgmv_shrink against a reference implementation.
"""
seq_length = 1
data: PunicaTensors = generate_data(
batches,
hidden_size,
num_loras,
rank,
seq_length,
dtype,
"shrink",
device,
)
torch.ops.vllm.bgmv_shrink(
data.inputs_tensor,
data.lora_weights,
data.our_out_tensor,
data.token_lora_mapping,
scaling,
)
bgmv_shrink(
data.inputs_tensor,
data.lora_weights,
data.ref_out_tensor,
data.token_lora_mapping,
scaling,
)
data.ref_out_tensor = data.ref_out_tensor.to(torch.float32)
assert_close(data.our_out_tensor, data.ref_out_tensor)
def check_bgmv_expand(batches: int, num_loras: int, rank: int,
hidden_size: int, dtype: torch.dtype, device: str,
add_inputs: bool):
"""
Compare vllm.bgmv_expand against a reference implementation.
"""
seq_length = 1
data: PunicaTensors = generate_data(
batches,
hidden_size,
num_loras,
rank,
seq_length,
dtype,
"expand",
device,
)
torch.ops.vllm.bgmv_expand(
data.inputs_tensor,
data.lora_weights,
data.our_out_tensor,
data.token_lora_mapping,
add_inputs=add_inputs,
)
bgmv_expand(
data.inputs_tensor,
data.lora_weights,
data.ref_out_tensor,
data.token_lora_mapping,
add_inputs=add_inputs,
)
assert_close(data.our_out_tensor, data.ref_out_tensor)
def check_bgmv_expand_slice(batches: int, num_loras: int, rank: int,
hidden_size: int, nslices: int, dtype: torch.dtype,
device: str, add_inputs: bool):
"""
Compare vllm.bgmv_expand_slice against a reference implementation.
"""
seq_length = 1
data: PunicaTensors = generate_data_for_expand_nslices(
batches,
hidden_size,
num_loras,
rank,
seq_length,
dtype,
nslices,
device,
)
slice_offset = 0
for index in range(nslices):
torch.ops.vllm.bgmv_expand_slice(
data.inputs_tensor,
data.lora_weights[index],
data.our_out_tensor,
data.token_lora_mapping,
slice_offset,
slice_size=hidden_size,
add_inputs=add_inputs,
)
bgmv_expand_slice(
data.inputs_tensor,
data.lora_weights[index],
data.ref_out_tensor,
data.token_lora_mapping,
slice_offset,
slice_size=hidden_size,
add_inputs=add_inputs,
)
slice_offset += hidden_size
assert_close(data.our_out_tensor, data.ref_out_tensor)
# Tests
# We test the punica kernels along 2 verticals mainly.
# 1. Variations in hidden_dim size
# 2. Variations in all other parameters like (batch_size, max_rank, num_loras
# etc.)
# We have collected the hidden_sizes included in the LoRA models
# currently supported by vLLM. It tests whether the corresponding Triton
# kernel can run normally when tensor parallelism is set to
# [1, 2, 4, 8, 16, 32, 64].
HIDDEN_SIZES = [
128,
256,
512,
896,
1024,
1152,
1216,
1280,
1536,
1664,
2048,
2240,
2304,
2368,
2432,
2560,
2752,
3072,
3328,
3456,
3584,
3712,
4096,
4480,
4608,
4736,
4864,
5120,
5504,
5632,
5888,
6144,
6400,
6848,
6912,
7168,
7424,
8192,
8960,
9216,
9472,
10240,
11008,
11264,
13824,
14336,
14784,
14848,
15360,
18944,
22016,
22528,
24576,
27392,
27648,
29568,
29696,
32000,
32256,
32512,
32768,
33024,
36864,
43264,
49152,
49408,
60544,
60672,
64000,
64256,
102400,
102656,
128000,
128256,
]
#The size of TP
divisibility = [1, 2, 8, 16, 64]
all_hidden_size = []
for div in divisibility:
for hidden_size in HIDDEN_SIZES:
all_hidden_size.append(hidden_size // div)
HIDDEN_SIZES = list(set(all_hidden_size))
# Test params that focuses on hidden_size variation.
hs_test_params = {
"hidden_sizes": HIDDEN_SIZES,
"batches": [4],
"num_loras": [4],
"max_ranks": [32],
}
# General tests params that tests for variations in all dimensions
# except hidden_size.
test_params = {
"hidden_sizes": [2049],
"batches": [1, 4, 16, 32],
"num_loras": [1, 8, 32, 128],
"max_ranks": [1, 4, 8, 16, 32, 64, 128, 256],
}
DTYPES = [torch.float16, torch.bfloat16]
DEVICES = [f"cuda:{0}"]
SEED = [0]
@pytest.mark.parametrize("batches", test_params['batches'])
@pytest.mark.parametrize("num_loras", test_params['num_loras'])
@pytest.mark.parametrize("rank", test_params['max_ranks'])
@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
@pytest.mark.parametrize("nslices", [1, 2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
def test_punica_sgmv(
batches: int,
num_loras: int,
rank: int,
hidden_size: int,
nslices: int,
dtype: torch.dtype,
device: str,
seed: int,
op_type: str,
):
torch.set_default_device(device)
current_platform.seed_everything(seed)
if op_type == "shrink":
check_sgmv_shrink(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
scaling=0.5)
else:
check_sgmv_expand(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
add_inputs=True)
@pytest.mark.parametrize("batches", hs_test_params['batches'])
@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
@pytest.mark.parametrize("nslices", [1, 2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
def test_punica_sgmv_hidden_size(
batches: int,
num_loras: int,
rank: int,
hidden_size: int,
nslices: int,
dtype: torch.dtype,
device: str,
seed: int,
op_type: str,
):
torch.set_default_device(device)
current_platform.seed_everything(seed)
if op_type == "shrink":
check_sgmv_shrink(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
scaling=0.5)
else:
check_sgmv_expand(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
add_inputs=True)
@pytest.mark.parametrize("batches", test_params['batches'])
@pytest.mark.parametrize("num_loras", test_params['num_loras'])
@pytest.mark.parametrize("rank", test_params['max_ranks'])
@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
def test_punica_bgmv(
batches: int,
num_loras: int,
rank: int,
hidden_size: int,
dtype: torch.dtype,
device: str,
seed: int,
op_type: str,
):
torch.set_default_device(device)
current_platform.seed_everything(seed)
if op_type == "shrink":
check_bgmv_shrink(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
dtype=dtype,
device=device,
scaling=0.5)
else:
check_bgmv_expand(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
dtype=dtype,
device=device,
add_inputs=True)
@pytest.mark.parametrize("batches", hs_test_params['batches'])
@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
def test_punica_bgmv_hidden_size(
batches: int,
num_loras: int,
rank: int,
hidden_size: int,
dtype: torch.dtype,
device: str,
seed: int,
op_type: str,
):
torch.set_default_device(device)
current_platform.seed_everything(seed)
if op_type == "shrink":
check_bgmv_shrink(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
dtype=dtype,
device=device,
scaling=0.5)
else:
check_bgmv_expand(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
dtype=dtype,
device=device,
add_inputs=True)
@pytest.mark.parametrize("batches", test_params['batches'])
@pytest.mark.parametrize("num_loras", test_params['num_loras'])
@pytest.mark.parametrize("rank", test_params['max_ranks'])
@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
@pytest.mark.parametrize("nslices", [2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
def test_punica_bgmv_expand_nslices(batches: int, num_loras: int, rank: int,
hidden_size: int, nslices: int,
dtype: torch.dtype, device: str,
seed: int):
torch.set_default_device(device)
current_platform.seed_everything(seed)
check_bgmv_expand_slice(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
add_inputs=True)
@pytest.mark.parametrize("batches", hs_test_params['batches'])
@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
@pytest.mark.parametrize("nslices", [2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
def test_punica_bgmv_expand_nslices_hidden_size(batches: int, num_loras: int,
rank: int, hidden_size: int,
nslices: int,
dtype: torch.dtype,
device: str, seed: int):
torch.set_default_device(device)
current_platform.seed_everything(seed)
check_bgmv_expand_slice(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
add_inputs=True)
# SPDX-License-Identifier: Apache-2.0
"""
This script is mainly used to tests various hidden_sizes. We have collected the
hidden_sizes included in the LoRA models currently supported by vLLM. It tests
whether the corresponding Triton kernel can run normally when tensor parallelism
is set to [1, 2, 4, 8, 16, 32, 64].
"""
from threading import Lock
import pytest
import torch
import vllm.lora.ops.triton_ops # noqa: F401
from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
bgmv_shrink, sgmv_expand,
sgmv_expand_slice, sgmv_shrink)
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.platforms import current_platform
from .utils import (assert_close, generate_data,
generate_data_for_expand_nslices,
generate_data_for_nslices)
HIDDEN_SIZES = [
128,
256,
512,
896,
1024,
1152,
1216,
1280,
1536,
1664,
2048,
2240,
2304,
2368,
2432,
2560,
2752,
3072,
3328,
3456,
3584,
3712,
4096,
4480,
4608,
4736,
4864,
5120,
5504,
5632,
5888,
6144,
6400,
6848,
6912,
7168,
7424,
8192,
8960,
9216,
9472,
10240,
11008,
11264,
13824,
14336,
14784,
14848,
15360,
18944,
22016,
22528,
24576,
27392,
27648,
29568,
29696,
32000,
32256,
32512,
32768,
33024,
36864,
43264,
49152,
49408,
60544,
60672,
64000,
64256,
102400,
102656,
128000,
128256,
]
#The size of TP
divisibility = [1, 2, 8, 16, 64]
all_hidden_size = []
for div in divisibility:
for hidden_size in HIDDEN_SIZES:
all_hidden_size.append(hidden_size // div)
HIDDEN_SIZES = list(set(all_hidden_size))
BATCHES = [4]
NUM_LORA = [4]
DTYPES = [torch.float16, torch.bfloat16]
MAX_RANKS = [32]
SCALES = [0.5]
SEED = [0]
DEVICES = [f"cuda:{0}"]
_dict_lock = Lock()
@pytest.mark.parametrize("batches", BATCHES)
@pytest.mark.parametrize("num_loras", NUM_LORA)
@pytest.mark.parametrize("rank", MAX_RANKS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("scaling", SCALES)
@pytest.mark.parametrize("nslices", [1, 2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", DEVICES)
def test_punica_sgmv(
batches: int,
num_loras: int,
rank: int,
hidden_size: int,
scaling: float,
nslices: int,
dtype: torch.dtype,
op_type: str,
seed: int,
device: str,
):
torch.set_default_device(device)
current_platform.seed_everything(seed)
seq_length = 128
(
inputs_tensor,
lora_weights_lst,
our_out_tensor,
ref_out_tensor,
b_seq_start_loc,
lora_indices_tensor,
seq_len_tensor,
indices,
) = generate_data_for_nslices(
batches,
hidden_size,
num_loras,
rank,
seq_length,
nslices,
dtype,
op_type,
device,
)
max_seq_length = seq_len_tensor.max()
token_nums = seq_len_tensor.sum().item()
if isinstance(max_seq_length, tuple):
max_seq_length = max_seq_length[0].item()
else:
max_seq_length = max_seq_length.item()
if op_type == "shrink":
# Preventing cache error pointer.
with _dict_lock:
_LORA_A_PTR_DICT.clear()
torch.ops.vllm.sgmv_shrink(
inputs_tensor,
lora_weights_lst,
our_out_tensor,
b_seq_start_loc,
seq_len_tensor,
lora_indices_tensor,
batches,
max_seq_length,
token_nums,
scaling,
)
for index in range(nslices):
sgmv_shrink(
inputs_tensor,
lora_weights_lst[index],
ref_out_tensor[index],
b_seq_start_loc,
seq_len_tensor,
lora_indices_tensor,
batches,
max_seq_length,
token_nums,
scaling,
)
else:
with _dict_lock:
_LORA_B_PTR_DICT.clear()
torch.ops.vllm.sgmv_expand(
inputs_tensor,
lora_weights_lst,
our_out_tensor,
b_seq_start_loc,
seq_len_tensor,
lora_indices_tensor,
batches,
max_seq_length,
token_nums,
offset_start=0,
add_inputs=True,
)
if nslices == 1:
# Verify the torch's sgmv_expand op
sgmv_expand(
inputs_tensor[0],
lora_weights_lst[0],
ref_out_tensor,
b_seq_start_loc,
seq_len_tensor,
lora_indices_tensor,
batches,
max_seq_length,
token_nums,
add_inputs=True,
)
else:
slice_offset = 0
for index in range(nslices):
lora_weights = lora_weights_lst[index]
sgmv_expand_slice(
inputs_tensor[index],
lora_weights,
ref_out_tensor,
b_seq_start_loc,
seq_len_tensor,
lora_indices_tensor,
batches,
max_seq_length,
token_nums,
slice_offset,
hidden_size,
add_inputs=True,
)
slice_offset += hidden_size
assert_close(our_out_tensor, ref_out_tensor)
@pytest.mark.parametrize("batches", BATCHES)
@pytest.mark.parametrize("num_loras", NUM_LORA)
@pytest.mark.parametrize("rank", MAX_RANKS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("scaling", SCALES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", DEVICES)
def test_punica_bgmv(
batches: int,
num_loras: int,
rank: int,
hidden_size: int,
scaling: float,
dtype: torch.dtype,
op_type: str,
seed: int,
device: str,
):
torch.set_default_device(device)
current_platform.seed_everything(seed)
seq_length = 1
(
inputs_tensor,
lora_weights,
our_out_tensor,
ref_out_tensor,
b_seq_start_loc,
lora_indices_tensor,
seq_len_tensor,
indices,
) = generate_data(
batches,
hidden_size,
num_loras,
rank,
seq_length,
dtype,
op_type,
device,
)
if op_type == "shrink":
torch.ops.vllm.bgmv_shrink(
inputs_tensor,
lora_weights,
our_out_tensor,
indices,
scaling,
)
bgmv_shrink(
inputs_tensor,
lora_weights,
ref_out_tensor,
indices,
scaling,
)
else:
torch.ops.vllm.bgmv_expand(
inputs_tensor,
lora_weights,
our_out_tensor,
indices,
add_inputs=True,
)
bgmv_expand(
inputs_tensor,
lora_weights,
ref_out_tensor,
indices,
add_inputs=True,
)
if op_type == "shrink":
ref_out_tensor = ref_out_tensor.to(torch.float32)
assert_close(our_out_tensor, ref_out_tensor)
@pytest.mark.parametrize("batches", BATCHES)
@pytest.mark.parametrize("num_loras", NUM_LORA)
@pytest.mark.parametrize("rank", MAX_RANKS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("nslices", [2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", DEVICES)
def test_punica_bgmv_expand_nslices(
batches: int,
num_loras: int,
rank: int,
hidden_size: int,
nslices: int,
dtype: torch.dtype,
seed: int,
device: str,
):
torch.set_default_device(device)
current_platform.seed_everything(seed)
seq_length = 1
(
inputs_tensor,
lora_weights_lst,
our_outputs,
ref_outputs,
b_seq_start_loc,
lora_indices_tensor,
seq_len_tensor,
indices,
) = generate_data_for_expand_nslices(
batches,
hidden_size,
num_loras,
rank,
seq_length,
dtype,
nslices,
device,
)
slice_offset = 0
for index in range(nslices):
lora_weights = lora_weights_lst[index]
torch.ops.vllm.bgmv_expand_slice(
inputs_tensor,
lora_weights,
our_outputs,
indices,
slice_offset,
slice_size=hidden_size,
add_inputs=True,
)
bgmv_expand_slice(
inputs_tensor,
lora_weights,
ref_outputs,
indices,
slice_offset,
slice_size=hidden_size,
add_inputs=True,
)
slice_offset += hidden_size
assert_close(our_outputs, ref_outputs)
......@@ -73,6 +73,14 @@ def do_sample(llm: vllm.LLM,
return generated_texts
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("tp_size", [1])
def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
......
# SPDX-License-Identifier: Apache-2.0
from typing import List
from dataclasses import dataclass
from typing import Dict, List, Optional
import pytest
from packaging.version import Version
from transformers import __version__ as TRANSFORMERS_VERSION
import vllm
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform
MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
PROMPT_TEMPLATE = (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
"\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
"What is in the image?<|im_end|>\n"
"<|im_start|>assistant\n")
@dataclass
class TestConfig:
model_path: str
lora_path: str
max_num_seqs: int = 2
max_loras: int = 2
max_lora_rank: int = 16
max_model_len: int = 4096
mm_processor_kwargs: Optional[Dict[str, int]] = None
def __post_init__(self):
if self.mm_processor_kwargs is None:
self.mm_processor_kwargs = {
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
}
class Qwen2VLTester:
"""Test helper for Qwen2 VL models with LoRA"""
PROMPT_TEMPLATE = (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
"\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
"What is in the image?<|im_end|>\n"
"<|im_start|>assistant\n")
def __init__(self, config: TestConfig):
self.config = config
self.llm = self._initialize_llm()
def _initialize_llm(self) -> vllm.LLM:
"""Initialize the LLM with given configuration"""
return vllm.LLM(
model=self.config.model_path,
max_num_seqs=self.config.max_num_seqs,
enable_lora=True,
max_loras=self.config.max_loras,
max_lora_rank=self.config.max_lora_rank,
trust_remote_code=True,
mm_processor_kwargs=self.config.mm_processor_kwargs,
max_model_len=self.config.max_model_len,
)
def run_test(self,
images: List[ImageAsset],
expected_outputs: List[str],
lora_id: Optional[int] = None,
temperature: float = 0,
max_tokens: int = 5) -> List[str]:
sampling_params = vllm.SamplingParams(
temperature=temperature,
max_tokens=max_tokens,
)
inputs = [{
"prompt": self.PROMPT_TEMPLATE,
"multi_modal_data": {
"image": asset.pil_image
},
} for asset in images]
lora_request = LoRARequest(str(lora_id), lora_id,
self.config.lora_path)
outputs = self.llm.generate(inputs,
sampling_params,
lora_request=lora_request)
generated_texts = [
output.outputs[0].text.strip() for output in outputs
]
IMAGE_ASSETS = [
# Validate outputs
for generated, expected in zip(generated_texts, expected_outputs):
assert expected.startswith(
generated), f"Generated text {generated} doesn't "
f"match expected pattern {expected}"
return generated_texts
TEST_IMAGES = [
ImageAsset("stop_sign"),
ImageAsset("cherry_blossom"),
]
# After fine-tuning with LoRA, all generated content should start begin `A`.
EXPECTED_OUTPUT = [
EXPECTED_OUTPUTS = [
"A red stop sign stands prominently in the foreground, with a traditional Chinese gate and a black SUV in the background, illustrating a blend of modern and cultural elements.", # noqa: E501
"A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.", # noqa: E501
]
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
sampling_params = vllm.SamplingParams(
temperature=0,
max_tokens=5,
)
inputs = [{
"prompt": PROMPT_TEMPLATE,
"multi_modal_data": {
"image": asset.pil_image
},
} for asset in IMAGE_ASSETS]
outputs = llm.generate(
inputs,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None,
)
# Print the outputs.
generated_texts: List[str] = []
for output in outputs:
generated_text = output.outputs[0].text.strip()
generated_texts.append(generated_text)
print(f"Generated text: {generated_text!r}")
return generated_texts
QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="Qwen2-VL dependency xformers incompatible with ROCm")
def test_qwen2vl_lora(qwen2vl_lora_files):
llm = vllm.LLM(
MODEL_PATH,
max_num_seqs=2,
enable_lora=True,
max_loras=2,
max_lora_rank=16,
trust_remote_code=True,
mm_processor_kwargs={
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
},
max_model_len=4096,
)
output1 = do_sample(llm, qwen2vl_lora_files, lora_id=1)
for i in range(len(EXPECTED_OUTPUT)):
assert EXPECTED_OUTPUT[i].startswith(output1[i])
output2 = do_sample(llm, qwen2vl_lora_files, lora_id=2)
for i in range(len(EXPECTED_OUTPUT)):
assert EXPECTED_OUTPUT[i].startswith(output2[i])
"""Test Qwen 2.0 VL model with LoRA"""
config = TestConfig(model_path=QWEN2VL_MODEL_PATH,
lora_path=qwen2vl_lora_files)
tester = Qwen2VLTester(config)
# Test with different LoRA IDs
for lora_id in [1, 2]:
tester.run_test(TEST_IMAGES,
expected_outputs=EXPECTED_OUTPUTS,
lora_id=lora_id)
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="Qwen2.5-VL dependency xformers incompatible with ROCm",
)
@pytest.mark.skipif(
Version(TRANSFORMERS_VERSION) < Version("4.49.0"),
reason="Qwen2.5-VL require transformers version no lower than 4.49.0",
)
def test_qwen25vl_lora(qwen25vl_lora_files):
"""Test Qwen 2.5 VL model with LoRA"""
config = TestConfig(model_path=QWEN25VL_MODEL_PATH,
lora_path=qwen25vl_lora_files)
tester = Qwen2VLTester(config)
# Test with different LoRA IDs
for lora_id in [1, 2]:
tester.run_test(TEST_IMAGES,
expected_outputs=EXPECTED_OUTPUTS,
lora_id=lora_id)
# SPDX-License-Identifier: Apache-2.0
from typing import Dict, List, Optional
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union
import torch
......@@ -106,6 +107,31 @@ def assert_close(a, b):
torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
@dataclass
class PunicaTensors:
inputs_tensor: torch.Tensor
lora_weights: Union[torch.Tensor, List[torch.Tensor]]
our_out_tensor: torch.Tensor
ref_out_tensor: torch.Tensor
b_seq_start_loc: torch.Tensor
prompt_lora_mapping: torch.Tensor
seq_len_tensor: torch.Tensor
token_lora_mapping: torch.Tensor
def meta(self) -> Tuple[int, int]:
"""
Infer max_seq_length and token_nums from the tensors
and return them.
"""
max_seq_length = self.seq_len_tensor.max()
token_nums = self.seq_len_tensor.sum().item()
if isinstance(max_seq_length, tuple):
max_seq_length = max_seq_length[0].item()
else:
max_seq_length = max_seq_length.item()
return max_seq_length, token_nums
def generate_data(
batches,
hidden_size,
......@@ -115,7 +141,7 @@ def generate_data(
dtype,
op_type,
device,
):
) -> PunicaTensors:
seq_len_tensor = torch.randint(seq_length, seq_length + 1,
(batches, )).to(device)
b_seq_start_loc = torch.cumsum(
......@@ -164,7 +190,8 @@ def generate_data(
indices[current_offset:current_offset +
seq_len_tensor[b_id]].copy_(lora_index)
current_offset += seq_len_tensor[b_id].item()
return (
return PunicaTensors(
inputs_tensor,
lora_weights,
our_out_tensor,
......@@ -185,7 +212,7 @@ def generate_data_for_expand_nslices(
dtype,
nslices,
device,
):
) -> PunicaTensors:
seq_len_tensor = torch.randint(seq_length, seq_length + 1,
(batches, )).to(device)
b_seq_start_loc = torch.cumsum(
......@@ -222,7 +249,7 @@ def generate_data_for_expand_nslices(
current_offset += seq_len_tensor[b_id].item()
lora_indices_tensor = lora_indices_tensor.to(device)
return (
return PunicaTensors(
inputs_tensor,
lora_weights_lst,
our_out_tensor,
......@@ -244,7 +271,7 @@ def generate_data_for_nslices(
dtype,
op_type,
device,
):
) -> PunicaTensors:
seq_len_tensor = torch.randint(seq_length, seq_length + 1,
(batches, )).to(device)
b_seq_start_loc = torch.cumsum(
......@@ -302,7 +329,7 @@ def generate_data_for_nslices(
current_offset += seq_len_tensor[b_id].item()
lora_indices_tensor = lora_indices_tensor.to(device)
return (
return PunicaTensors(
inputs_tensor,
lora_weights_lst,
our_out_tensor,
......
......@@ -9,6 +9,7 @@ import ray
from prometheus_client import REGISTRY
from vllm import EngineArgs, LLMEngine
from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
......@@ -18,10 +19,14 @@ import vllm.envs as envs
from ..utils import models_path_prefix
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
MODELS = [
os.path.join(models_path_prefix, "facebook/opt-125m"),
os.path.join(models_path_prefix, "distilbert/distilgpt2"),
]
RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [("float" if envs.VLLM_USE_TRITON_FLASH_ATTN else "half")])
......@@ -145,8 +150,9 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
metrics_tag_content = stat_logger.labels["model_name"]
if served_model_name is None or served_model_name == []:
assert metrics_tag_content == model, (
f"Metrics tag model_name is wrong! expect: {model!r}\n"
actual_model_name = f"{MODEL_WEIGHTS_S3_BUCKET}/{model.split('/')[-1]}"
assert metrics_tag_content == actual_model_name, (
f"Metrics tag model_name is wrong! expect: {actual_model_name!r}\n"
f"actual: {metrics_tag_content!r}")
else:
assert metrics_tag_content == served_model_name[0], (
......@@ -174,7 +180,8 @@ async def test_async_engine_log_metrics_regression(
"""
engine_args = AsyncEngineArgs(model=model,
dtype=dtype,
disable_log_stats=disable_log_stats)
disable_log_stats=disable_log_stats,
load_format=RUNAI_STREAMER_LOAD_FORMAT)
async_engine = AsyncLLMEngine.from_engine_args(engine_args)
for i, prompt in enumerate(example_prompts):
results = async_engine.generate(
......@@ -203,7 +210,8 @@ def test_engine_log_metrics_regression(
) -> None:
engine_args = EngineArgs(model=model,
dtype=dtype,
disable_log_stats=disable_log_stats)
disable_log_stats=disable_log_stats,
load_format=RUNAI_STREAMER_LOAD_FORMAT)
engine = LLMEngine.from_engine_args(engine_args)
for i, prompt in enumerate(example_prompts):
engine.add_request(
......@@ -287,7 +295,8 @@ def test_metric_spec_decode_interval(
gpu_memory_utilization=0.4,
speculative_model=model,
num_speculative_tokens=k,
enforce_eager=True)
enforce_eager=True,
load_format=RUNAI_STREAMER_LOAD_FORMAT)
engine = LLMEngine.from_engine_args(engine_args)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment