Commit afd0da21 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.7.1' into v0.7.1-dev

parents 1a11f127 4f4d427a
...@@ -20,7 +20,7 @@ def test_run(my_rank, buffer, device): ...@@ -20,7 +20,7 @@ def test_run(my_rank, buffer, device):
assert buffer.buffer_size == 0 assert buffer.buffer_size == 0
assert len(buffer.buffer) == 0 assert len(buffer.buffer) == 0
print("My rank: %d, device: %s" % (my_rank, device)) print(f"My rank: {my_rank}, device: {device}")
# insert # insert
tokens = torch.tensor([1, 2, 3]).to(device) tokens = torch.tensor([1, 2, 3]).to(device)
...@@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device): ...@@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device):
assert buffer.buffer_size == 0 assert buffer.buffer_size == 0
assert len(buffer.buffer) == 0 assert len(buffer.buffer) == 0
print("My rank: %d, Test run passed!" % (my_rank)) print(f"My rank: {my_rank}, Test run passed!")
def stress_test(my_rank, buf, device): def stress_test(my_rank, buf, device):
...@@ -94,7 +94,7 @@ def stress_test(my_rank, buf, device): ...@@ -94,7 +94,7 @@ def stress_test(my_rank, buf, device):
assert torch.allclose(k, k_) assert torch.allclose(k, k_)
assert torch.allclose(v, v_) assert torch.allclose(v, v_)
assert torch.allclose(h, h_) assert torch.allclose(h, h_)
print('Rank %d done' % my_rank) print(f"Rank {my_rank} done")
torch.distributed.barrier() torch.distributed.barrier()
if my_rank == 0: if my_rank == 0:
...@@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device): ...@@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device):
else: else:
torch.distributed.send(torch.tensor([n]), 0) torch.distributed.send(torch.tensor([n]), 0)
print("My rank: %d, Passed stress test!" % (my_rank)) print(f"My rank: {my_rank}, Passed stress test!")
if __name__ == "__main__": if __name__ == "__main__":
...@@ -122,7 +122,7 @@ if __name__ == "__main__": ...@@ -122,7 +122,7 @@ if __name__ == "__main__":
rank=my_rank, rank=my_rank,
) )
print("initialized! My rank is %d" % my_rank) print(f"initialized! My rank is {my_rank}")
config = KVTransferConfig( config = KVTransferConfig(
kv_connector='PyNcclConnector', kv_connector='PyNcclConnector',
......
...@@ -22,13 +22,13 @@ def test_run(my_rank, pipe): ...@@ -22,13 +22,13 @@ def test_run(my_rank, pipe):
x2 = pipe.recv_tensor() x2 = pipe.recv_tensor()
print(f"rank {my_rank} received x2 = ", x2) print(f"rank {my_rank} received x2 = ", x2)
y2 = pipe.recv_tensor() y2 = pipe.recv_tensor()
print(f"rank {my_rank} received y2 = ", x2) print(f"rank {my_rank} received y2 = ", y2)
else: else:
x2 = pipe.recv_tensor() x2 = pipe.recv_tensor()
print(f"rank {my_rank} received x2 = ", x2) print(f"rank {my_rank} received x2 = ", x2)
y2 = pipe.recv_tensor() y2 = pipe.recv_tensor()
print(f"rank {my_rank} received y2 = ", x2) print(f"rank {my_rank} received y2 = ", y2)
pipe.send_tensor(x) pipe.send_tensor(x)
print(f"rank {my_rank} sent tensor x") print(f"rank {my_rank} sent tensor x")
pipe.send_tensor(y) pipe.send_tensor(y)
......
...@@ -5,6 +5,7 @@ from unittest.mock import MagicMock, patch ...@@ -5,6 +5,7 @@ from unittest.mock import MagicMock, patch
import pytest import pytest
import os import os
import safetensors
import torch import torch
import torch.nn as nn import torch.nn as nn
...@@ -22,6 +23,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor ...@@ -22,6 +23,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.sampler import Sampler from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader import get_model from vllm.model_executor.model_loader import get_model
from vllm.platforms import current_platform
from ..utils import models_path_prefix from ..utils import models_path_prefix
...@@ -67,13 +69,16 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool): ...@@ -67,13 +69,16 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
@pytest.fixture @pytest.fixture
def dist_init(): def dist_init():
temp_file = tempfile.mkstemp()[1] temp_file = tempfile.mkstemp()[1]
init_distributed_environment(
world_size=1, backend = "nccl"
rank=0, if current_platform.is_cpu():
distributed_init_method=f"file://{temp_file}", backend = "gloo"
local_rank=0,
backend="nccl", init_distributed_environment(world_size=1,
) rank=0,
distributed_init_method=f"file://{temp_file}",
local_rank=0,
backend=backend)
initialize_model_parallel(1, 1) initialize_model_parallel(1, 1)
yield yield
cleanup_dist_env_and_memory(shutdown_ray=True) cleanup_dist_env_and_memory(shutdown_ray=True)
...@@ -83,13 +88,15 @@ def dist_init(): ...@@ -83,13 +88,15 @@ def dist_init():
def dist_init_torch_only(): def dist_init_torch_only():
if torch.distributed.is_initialized(): if torch.distributed.is_initialized():
return return
backend = "nccl"
if current_platform.is_cpu():
backend = "gloo"
temp_file = tempfile.mkstemp()[1] temp_file = tempfile.mkstemp()[1]
torch.distributed.init_process_group( torch.distributed.init_process_group(world_size=1,
backend="nccl", rank=0,
world_size=1, init_method=f"file://{temp_file}",
rank=0, backend=backend)
init_method=f"file://{temp_file}",
)
@pytest.fixture @pytest.fixture
...@@ -173,6 +180,29 @@ def mixtral_lora_files_all_target_modules(): ...@@ -173,6 +180,29 @@ def mixtral_lora_files_all_target_modules():
return snapshot_download(repo_id="dyang415/mixtral-lora-v0") return snapshot_download(repo_id="dyang415/mixtral-lora-v0")
@pytest.fixture(scope="session")
def jamba_lora_files():
    """Download the Jamba LoRA adapter and strip weights not needed for serving.

    Some of the adapters ship with extra (non-LoRA) weights, so the
    ``adapter_model.safetensors`` file in the downloaded snapshot is rewritten
    in place to keep only tensors whose key contains ``"lora"``.

    Returns:
        The local directory returned by ``snapshot_download`` for the adapter.
    """

    def remove_unnecessary_weights(path):
        """Rewrite ``<path>/adapter_model.safetensors`` keeping LoRA keys only."""
        # Import the submodule explicitly: a bare ``import safetensors`` does
        # not guarantee that ``safetensors.torch`` is loaded.
        from safetensors.torch import load_file, save_file

        # Use the ``path`` argument rather than reaching for the enclosing
        # ``adapter_path`` variable, so the helper works for whatever
        # directory it is handed.
        lora_path = f"{path}/adapter_model.safetensors"
        tensors = load_file(lora_path)
        lora_tensors = {
            key: tensor
            for key, tensor in tensors.items() if "lora" in key
        }
        save_file(lora_tensors, lora_path)

    adapter_path = snapshot_download(
        repo_id=
        "hf-100/Jamba-1.5-mini-Spellbound-StoryWriter-0.1-6583896-ckpt53-lora")
    remove_unnecessary_weights(adapter_path)
    return adapter_path
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def gemma_lora_files(): def gemma_lora_files():
# return snapshot_download(repo_id="wskwon/gemma-7b-test-lora") # return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
......
from typing import List
import pytest
import torch
import vllm
from vllm.lora.request import LoRARequest
MODEL_PATH = "ai21labs/AI21-Jamba-1.5-Mini"
MAX_TOKENS = 40
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
              prompts: List[str]) -> List[str]:
    """Generate completions for ``prompts`` and return their stripped texts.

    Sampling uses ``temperature=0`` and at most ``MAX_TOKENS`` new tokens.
    A ``LoRARequest`` built from ``lora_id`` and ``lora_path`` is attached
    only when ``lora_id`` is truthy; otherwise no LoRA request is passed.

    Returns:
        The stripped text of the first candidate of each output, in the
        order the outputs were returned by ``llm.generate``.
    """
    params = vllm.SamplingParams(temperature=0, max_tokens=MAX_TOKENS)

    request = None
    if lora_id:
        request = LoRARequest(str(lora_id), lora_id, lora_path)

    results = llm.generate(prompts, params, lora_request=request)

    # Echo every prompt/completion pair so failures are easy to inspect.
    texts: List[str] = []
    for result in results:
        source_prompt = result.prompt
        text = result.outputs[0].text.strip()
        texts.append(text)
        print(f"Prompt: {source_prompt!r}, Generated text: {text!r}")
    return texts
@pytest.mark.parametrize("tp_size", [4])
def test_jamba_lora(jamba_lora_files, tp_size):
    """Run the Jamba LoRA adapter with tensor parallelism and compare output.

    The test is skipped when fewer than ``tp_size`` CUDA devices are visible.
    """
    visible_gpus = torch.cuda.device_count()
    if visible_gpus < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

    expected_jamba_output = [
        """Once upon a time, in a lush green meadow, there lived a sheep named Clara and a goat named Billy. Clara was a gentle creature, always nibbling on the soft grass and humming"""  # noqa: E501
    ]
    story_prompts = ["Write a story about a sheep and a goat."]

    engine = vllm.LLM(
        MODEL_PATH,
        enable_lora=True,
        max_num_seqs=16,
        max_loras=4,
        distributed_executor_backend="ray",
        tensor_parallel_size=tp_size,
    )

    generated = do_sample(engine,
                          jamba_lora_files,
                          lora_id=1,
                          prompts=story_prompts)
    assert generated == expected_jamba_output
...@@ -48,10 +48,14 @@ TOLERANCES = { ...@@ -48,10 +48,14 @@ TOLERANCES = {
torch.float32: (5e-3, 5e-3), torch.float32: (5e-3, 5e-3),
torch.bfloat16: (3e-2, 2e-2), torch.bfloat16: (3e-2, 2e-2),
} }
# TODO: Modify this based on platform
DEVICES = [ pytestmark = pytest.mark.skipif(
not (current_platform.is_cuda_alike() or current_platform.is_cpu()),
reason="Backend not supported")
DEVICES = ([
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
] ] if current_platform.is_cuda_alike() else ["cpu"])
#For GPU, we will launch different triton kernels between the prefill and decode #For GPU, we will launch different triton kernels between the prefill and decode
# stages, so we need to verify this. prefill stage(True) or decode stage(False) # stages, so we need to verify this. prefill stage(True) or decode stage(False)
...@@ -198,6 +202,10 @@ def check_punica_wrapper(punica_wrapper) -> bool: ...@@ -198,6 +202,10 @@ def check_punica_wrapper(punica_wrapper) -> bool:
from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU from vllm.lora.punica_wrapper.punica_gpu import PunicaWrapperGPU
return type(punica_wrapper) is PunicaWrapperGPU return type(punica_wrapper) is PunicaWrapperGPU
elif current_platform.is_cpu():
from vllm.lora.punica_wrapper.punica_cpu import PunicaWrapperCPU
return type(punica_wrapper) is PunicaWrapperCPU
else: else:
return False return False
...@@ -211,7 +219,8 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: ...@@ -211,7 +219,8 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
# For multi-GPU testing of Triton kernel, we must explicitly set the CUDA # For multi-GPU testing of Triton kernel, we must explicitly set the CUDA
# device, see: https://github.com/triton-lang/triton/issues/2925 # device, see: https://github.com/triton-lang/triton/issues/2925
# Same below. # Same below.
torch.cuda.set_device(device) if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
torch.set_default_device(device) torch.set_default_device(device)
max_loras = 8 max_loras = 8
...@@ -313,7 +322,9 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: ...@@ -313,7 +322,9 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
def test_embeddings_with_new_embeddings(dist_init, num_loras, device, def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
vocab_size, stage) -> None: vocab_size, stage) -> None:
torch.cuda.set_device(device) if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
torch.set_default_device(device) torch.set_default_device(device)
max_loras = 8 max_loras = 8
punica_wrapper = get_punica_wrapper(8192, 256, device) punica_wrapper = get_punica_wrapper(8192, 256, device)
...@@ -450,7 +461,9 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device, ...@@ -450,7 +461,9 @@ def test_embeddings_with_new_embeddings(dist_init, num_loras, device,
def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
stage) -> None: stage) -> None:
torch.cuda.set_device(device) if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
torch.set_default_device(device) torch.set_default_device(device)
max_loras = 8 max_loras = 8
punica_wrapper = get_punica_wrapper(8192, 256, device) punica_wrapper = get_punica_wrapper(8192, 256, device)
...@@ -582,7 +595,9 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, ...@@ -582,7 +595,9 @@ def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size,
def test_linear_replicated(dist_init, num_loras, device, stage, def test_linear_replicated(dist_init, num_loras, device, stage,
bias_enabled) -> None: bias_enabled) -> None:
torch.cuda.set_device(device) if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
torch.set_default_device(device) torch.set_default_device(device)
punica_wrapper = get_punica_wrapper(8192, 256, device) punica_wrapper = get_punica_wrapper(8192, 256, device)
assert check_punica_wrapper(punica_wrapper) assert check_punica_wrapper(punica_wrapper)
...@@ -695,7 +710,9 @@ def test_linear_replicated(dist_init, num_loras, device, stage, ...@@ -695,7 +710,9 @@ def test_linear_replicated(dist_init, num_loras, device, stage,
def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
device, stage, bias_enabled) -> None: device, stage, bias_enabled) -> None:
torch.cuda.set_device(device) if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
torch.set_default_device(device) torch.set_default_device(device)
punica_wrapper = get_punica_wrapper(8192, 256, device) punica_wrapper = get_punica_wrapper(8192, 256, device)
assert check_punica_wrapper(punica_wrapper) assert check_punica_wrapper(punica_wrapper)
...@@ -818,7 +835,9 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, ...@@ -818,7 +835,9 @@ def test_linear_parallel(dist_init, num_loras, orientation, fully_shard,
def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
device, stage, bias_enabled) -> None: device, stage, bias_enabled) -> None:
torch.cuda.set_device(device) if current_platform.is_cuda_alike():
torch.cuda.set_device(device)
torch.set_default_device(device) torch.set_default_device(device)
punica_wrapper = get_punica_wrapper(8192, 256, device) punica_wrapper = get_punica_wrapper(8192, 256, device)
assert check_punica_wrapper(punica_wrapper) assert check_punica_wrapper(punica_wrapper)
...@@ -971,6 +990,8 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, ...@@ -971,6 +990,8 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard,
@pytest.mark.parametrize("rotary_dim", [None, 32]) @pytest.mark.parametrize("rotary_dim", [None, 32])
@pytest.mark.parametrize("head_size", [32, 108]) @pytest.mark.parametrize("head_size", [32, 108])
@pytest.mark.parametrize("seq_len", [11, 1024]) @pytest.mark.parametrize("seq_len", [11, 1024])
@pytest.mark.skipif(not current_platform.is_cuda_alike(),
reason="Only CUDA backends are supported")
def test_rotary_embedding_long_context(dist_init, num_loras, device, def test_rotary_embedding_long_context(dist_init, num_loras, device,
scaling_factors, max_position, scaling_factors, max_position,
is_neox_style, rotary_dim, head_size, is_neox_style, rotary_dim, head_size,
......
...@@ -3,6 +3,7 @@ from typing import List ...@@ -3,6 +3,7 @@ from typing import List
import pytest import pytest
from vllm.lora.models import LoRAModel from vllm.lora.models import LoRAModel
from vllm.lora.peft_helper import PEFTHelper
from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
from vllm.model_executor.models.utils import WeightsMapper from vllm.model_executor.models.utils import WeightsMapper
...@@ -30,11 +31,14 @@ def test_load_checkpoints( ...@@ -30,11 +31,14 @@ def test_load_checkpoints(
else: else:
expected_lora_modules.append(module) expected_lora_modules.append(module)
if lora_name == "baichuan7B": if lora_name == "baichuan7B":
peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files,
max_position_embeddings=4096)
# For the baichuan7B model, load its LoRA, # For the baichuan7B model, load its LoRA,
# and the test should pass. # and the test should pass.
LoRAModel.from_local_checkpoint( LoRAModel.from_local_checkpoint(
baichuan_lora_files, baichuan_lora_files,
expected_lora_modules, expected_lora_modules,
peft_helper=peft_helper,
lora_model_id=1, lora_model_id=1,
device="cpu", device="cpu",
embedding_modules=embedding_modules, embedding_modules=embedding_modules,
...@@ -43,9 +47,12 @@ def test_load_checkpoints( ...@@ -43,9 +47,12 @@ def test_load_checkpoints(
# Test that the target_modules contain prefix # Test that the target_modules contain prefix
# such as "model.layers.0.self_atten.W_pack", and # such as "model.layers.0.self_atten.W_pack", and
# the test should pass. # the test should pass.
peft_helper = PEFTHelper.from_local_dir(baichuan_zero_lora_files,
max_position_embeddings=4096)
LoRAModel.from_local_checkpoint( LoRAModel.from_local_checkpoint(
baichuan_zero_lora_files, baichuan_zero_lora_files,
expected_lora_modules, expected_lora_modules,
peft_helper=peft_helper,
lora_model_id=1, lora_model_id=1,
device="cpu", device="cpu",
embedding_modules=embedding_modules, embedding_modules=embedding_modules,
...@@ -53,9 +60,12 @@ def test_load_checkpoints( ...@@ -53,9 +60,12 @@ def test_load_checkpoints(
elif lora_name == "baichuan7B-zero-regex": elif lora_name == "baichuan7B-zero-regex":
# Test that the `target_modules` in the form of regular expressions, # Test that the `target_modules` in the form of regular expressions,
# such as `model\\..*(W_pack|o_proj)`, and the test should pass. # such as `model\\..*(W_pack|o_proj)`, and the test should pass.
peft_helper = PEFTHelper.from_local_dir(baichuan_regex_lora_files,
max_position_embeddings=4096)
LoRAModel.from_local_checkpoint( LoRAModel.from_local_checkpoint(
baichuan_regex_lora_files, baichuan_regex_lora_files,
expected_lora_modules, expected_lora_modules,
peft_helper=peft_helper,
lora_model_id=1, lora_model_id=1,
device="cpu", device="cpu",
embedding_modules=embedding_modules, embedding_modules=embedding_modules,
...@@ -64,10 +74,13 @@ def test_load_checkpoints( ...@@ -64,10 +74,13 @@ def test_load_checkpoints(
# For the baichuan7B model, load chatglm3-6b's LoRA, # For the baichuan7B model, load chatglm3-6b's LoRA,
# and the test should raise the following error. # and the test should raise the following error.
expected_error = "Please verify that the loaded LoRA module is correct" # noqa: E501 expected_error = "Please verify that the loaded LoRA module is correct" # noqa: E501
peft_helper = PEFTHelper.from_local_dir(chatglm3_lora_files,
max_position_embeddings=4096)
with pytest.raises(ValueError, match=expected_error): with pytest.raises(ValueError, match=expected_error):
LoRAModel.from_local_checkpoint( LoRAModel.from_local_checkpoint(
chatglm3_lora_files, chatglm3_lora_files,
expected_lora_modules, expected_lora_modules,
peft_helper=peft_helper,
lora_model_id=1, lora_model_id=1,
device="cpu", device="cpu",
embedding_modules=embedding_modules, embedding_modules=embedding_modules,
...@@ -94,9 +107,12 @@ def test_lora_weights_mapping(baichuan_lora_files): ...@@ -94,9 +107,12 @@ def test_lora_weights_mapping(baichuan_lora_files):
".layers.": ".baichuan_layers.", ".layers.": ".baichuan_layers.",
}, },
) )
peft_helper = PEFTHelper.from_local_dir(baichuan_lora_files,
max_position_embeddings=4096)
lora_model = LoRAModel.from_local_checkpoint( lora_model = LoRAModel.from_local_checkpoint(
baichuan_lora_files, baichuan_lora_files,
expected_lora_modules, expected_lora_modules,
peft_helper=peft_helper,
lora_model_id=1, lora_model_id=1,
device="cpu", device="cpu",
embedding_modules=embedding_modules, embedding_modules=embedding_modules,
......
...@@ -3,6 +3,7 @@ from typing import List ...@@ -3,6 +3,7 @@ from typing import List
import pytest import pytest
from vllm.lora.models import LoRAModel from vllm.lora.models import LoRAModel
from vllm.lora.peft_helper import PEFTHelper
from vllm.lora.utils import get_adapter_absolute_path from vllm.lora.utils import get_adapter_absolute_path
from vllm.model_executor.models.llama import LlamaForCausalLM from vllm.model_executor.models.llama import LlamaForCausalLM
...@@ -27,9 +28,11 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request): ...@@ -27,9 +28,11 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
lora_path = get_adapter_absolute_path(lora_name) lora_path = get_adapter_absolute_path(lora_name)
# LoRA loading should work for either an absolute path or a Hugging Face id. # LoRA loading should work for either an absolute path or a Hugging Face id.
peft_helper = PEFTHelper.from_local_dir(lora_path, 4096)
lora_model = LoRAModel.from_local_checkpoint( lora_model = LoRAModel.from_local_checkpoint(
lora_path, lora_path,
expected_lora_modules, expected_lora_modules,
peft_helper=peft_helper,
lora_model_id=1, lora_model_id=1,
device="cpu", device="cpu",
embedding_modules=embedding_modules, embedding_modules=embedding_modules,
......
import json
import os import os
from typing import Dict, List from typing import Dict, List
...@@ -19,6 +18,7 @@ from vllm.lora.request import LoRARequest ...@@ -19,6 +18,7 @@ from vllm.lora.request import LoRARequest
from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager, from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
WorkerLoRAManager) WorkerLoRAManager)
from vllm.model_executor.layers.linear import RowParallelLinear from vllm.model_executor.layers.linear import RowParallelLinear
from vllm.platforms import current_platform
EMBEDDING_MODULES = { EMBEDDING_MODULES = {
"embed_tokens": "input_embeddings", "embed_tokens": "input_embeddings",
...@@ -27,68 +27,20 @@ EMBEDDING_MODULES = { ...@@ -27,68 +27,20 @@ EMBEDDING_MODULES = {
EMBEDDING_PADDING_MODULES = ["lm_head"] EMBEDDING_PADDING_MODULES = ["lm_head"]
CUDA_DEVICES = [ DEVICES = ([
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
] ] if current_platform.is_cuda_alike() else ["cpu"])
def test_peft_helper(sql_lora_files): @pytest.mark.parametrize("device", DEVICES)
lora_config_path = os.path.join(sql_lora_files, "adapter_config.json")
with open(lora_config_path) as f:
config = json.load(f)
peft_helper = PEFTHelper.from_dict(config)
assert peft_helper.r == 8
assert peft_helper.lora_alpha == 16
assert peft_helper.target_modules == [
"q_proj",
"v_proj",
"k_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
"embed_tokens",
"lm_head",
]
expected_error = "vLLM only supports modules_to_save being None."
with pytest.raises(ValueError, match=expected_error):
config = dict(
r=8,
lora_alpha=16,
target_modules=["gate_proj"],
modules_to_save=["lm_head"],
)
PEFTHelper.from_dict(config)
expected_error = "vLLM does not yet support RSLoRA."
with pytest.raises(ValueError, match=expected_error):
config = dict(r=8,
lora_alpha=16,
target_modules=["gate_proj"],
use_rslora=True)
PEFTHelper.from_dict(config)
expected_error = "vLLM does not yet support DoRA."
with pytest.raises(ValueError, match=expected_error):
config = dict(r=8,
lora_alpha=16,
target_modules=["gate_proj"],
use_dora=True)
PEFTHelper.from_dict(config)
@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_from_lora_tensors(sql_lora_files, device): def test_from_lora_tensors(sql_lora_files, device):
tensors = load_file( tensors = load_file(
os.path.join(sql_lora_files, "adapter_model.safetensors")) os.path.join(sql_lora_files, "adapter_model.safetensors"))
new_embeddings = load_file( new_embeddings = load_file(
os.path.join(sql_lora_files, "new_embeddings.safetensors")) os.path.join(sql_lora_files, "new_embeddings.safetensors"))
lora_config_path = os.path.join(sql_lora_files, "adapter_config.json") peft_helper = PEFTHelper.from_local_dir(sql_lora_files,
with open(lora_config_path) as f: max_position_embeddings=4096)
config = json.load(f)
peft_helper = PEFTHelper.from_dict(config)
lora_model = LoRAModel.from_lora_tensors( lora_model = LoRAModel.from_lora_tensors(
1, 1,
tensors, tensors,
...@@ -165,7 +117,7 @@ def test_replace_submodules(dist_init, dummy_model): ...@@ -165,7 +117,7 @@ def test_replace_submodules(dist_init, dummy_model):
manager = LoRAModelManager( manager = LoRAModelManager(
model, 1, 1, 1, model, 1, 1, 1,
LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8), LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8),
torch.device("cuda")) torch.device(DEVICES[0]))
model = manager.model model = manager.model
assert isinstance(model.get_submodule("dense1"), assert isinstance(model.get_submodule("dense1"),
...@@ -177,7 +129,7 @@ def test_replace_submodules(dist_init, dummy_model): ...@@ -177,7 +129,7 @@ def test_replace_submodules(dist_init, dummy_model):
RowParallelLinearWithLoRA) RowParallelLinearWithLoRA)
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_lora_model_manager(dist_init, dummy_model, device): def test_lora_model_manager(dist_init, dummy_model, device):
model = dummy_model model = dummy_model
model.supported_lora_modules = ["dense1", "dense2", "lm_head"] model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
...@@ -238,7 +190,7 @@ def test_lora_model_manager(dist_init, dummy_model, device): ...@@ -238,7 +190,7 @@ def test_lora_model_manager(dist_init, dummy_model, device):
assert manager.punica_wrapper.device == device assert manager.punica_wrapper.device == device
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
model = dummy_model model = dummy_model
model.supported_lora_modules = ["dense1", "dense2", "lm_head"] model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
...@@ -330,7 +282,7 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): ...@@ -330,7 +282,7 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
assert manager.device == device assert manager.device == device
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_lru_lora_model_manager(dist_init, dummy_model, device): def test_lru_lora_model_manager(dist_init, dummy_model, device):
# This tests just the LRU cache functionality, everything else is # This tests just the LRU cache functionality, everything else is
# tested in test_lora_model_manager # tested in test_lora_model_manager
...@@ -460,7 +412,7 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device): ...@@ -460,7 +412,7 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
assert manager.device == device assert manager.device == device
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
sql_lora_files, device): sql_lora_files, device):
lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4)
...@@ -539,7 +491,7 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, ...@@ -539,7 +491,7 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
device) device)
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
sql_lora_files, device): sql_lora_files, device):
# Should remove every LoRA not specified in the request. # Should remove every LoRA not specified in the request.
...@@ -615,7 +567,7 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, ...@@ -615,7 +567,7 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
device) device)
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_packed_loras(dist_init, dummy_model_gate_up, device): def test_packed_loras(dist_init, dummy_model_gate_up, device):
model = dummy_model_gate_up model = dummy_model_gate_up
model.supported_lora_modules = ["gate_up_proj"] model.supported_lora_modules = ["gate_up_proj"]
......
...@@ -4,10 +4,11 @@ import os ...@@ -4,10 +4,11 @@ import os
import pytest import pytest
import vllm import vllm
from tests.utils import fork_new_process_for_each_test
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform
from ..utils import multi_gpu_test, models_path_prefix from ..utils import models_path_prefix
MODEL_PATH = os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5") MODEL_PATH = os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5")
...@@ -18,13 +19,11 @@ PROMPT_TEMPLATE = ( ...@@ -18,13 +19,11 @@ PROMPT_TEMPLATE = (
IMAGE_ASSETS = [ IMAGE_ASSETS = [
ImageAsset("stop_sign"), ImageAsset("stop_sign"),
ImageAsset("cherry_blossom"),
] ]
# After fine-tuning with LoRA, all generated content should begin with `A`. # After fine-tuning with LoRA, all generated content should begin with `A`.
EXPECTED_OUTPUT = [ EXPECTED_OUTPUT = [
"A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.", # noqa: E501 "A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.", # noqa: E501
"A pink cherry blossom tree with a blue sky in the background.",
] ]
...@@ -51,48 +50,75 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: ...@@ -51,48 +50,75 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
# Print the outputs. # Print the outputs.
generated_texts: List[str] = [] generated_texts: List[str] = []
for output in outputs: for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text.strip() generated_text = output.outputs[0].text.strip()
generated_texts.append(generated_text) generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Generated text: {generated_text!r}")
return generated_texts return generated_texts
@multi_gpu_test(num_gpus=2) @pytest.mark.xfail(
@pytest.mark.parametrize("fully_sharded", [True, False]) current_platform.is_rocm(),
def test_minicpmv_tp2(minicpmv_lora_files, fully_sharded): reason="MiniCPM-V dependency xformers incompatible with ROCm")
@fork_new_process_for_each_test
def test_minicpmv_lora(minicpmv_lora_files):
llm = vllm.LLM(
MODEL_PATH,
max_num_seqs=2,
enable_lora=True,
max_loras=2,
max_lora_rank=8,
enforce_eager=True,
trust_remote_code=True,
enable_chunked_prefill=True,
)
output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
for i in range(len(EXPECTED_OUTPUT)):
assert EXPECTED_OUTPUT[i].startswith(output1[i])
output2 = do_sample(llm, minicpmv_lora_files, lora_id=2)
for i in range(len(EXPECTED_OUTPUT)):
assert EXPECTED_OUTPUT[i].startswith(output2[i])
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="MiniCPM-V dependency xformers incompatible with ROCm")
@fork_new_process_for_each_test
def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
llm = vllm.LLM( llm = vllm.LLM(
MODEL_PATH, MODEL_PATH,
enable_lora=True, enable_lora=True,
max_num_seqs=2, max_num_seqs=2,
max_loras=4, max_loras=4,
max_lora_rank=64, max_lora_rank=64,
tensor_parallel_size=2, tensor_parallel_size=4,
trust_remote_code=True, trust_remote_code=True,
fully_sharded_loras=fully_sharded, enforce_eager=True,
enable_chunked_prefill=True, enable_chunked_prefill=True,
) )
output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
for i in range(len(EXPECTED_OUTPUT)): for i in range(len(EXPECTED_OUTPUT)):
assert EXPECTED_OUTPUT[i].startswith(output_tp[i]) assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
@multi_gpu_test(num_gpus=4) @pytest.mark.xfail(
@pytest.mark.parametrize("fully_sharded", [True, False]) current_platform.is_rocm(),
def test_minicpmv_tp4(minicpmv_lora_files, fully_sharded): reason="MiniCPM-V dependency xformers incompatible with ROCm")
@fork_new_process_for_each_test
def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
llm = vllm.LLM( llm = vllm.LLM(
MODEL_PATH, MODEL_PATH,
enable_lora=True, enable_lora=True,
max_num_seqs=2, max_num_seqs=2,
max_loras=4, max_loras=2,
max_lora_rank=64, max_lora_rank=8,
tensor_parallel_size=4, tensor_parallel_size=4,
trust_remote_code=True, trust_remote_code=True,
fully_sharded_loras=fully_sharded, fully_sharded_loras=True,
enable_chunked_prefill=True, enable_chunked_prefill=True,
) )
output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1) output_tp = do_sample(llm, minicpmv_lora_files, lora_id=1)
for i in range(len(EXPECTED_OUTPUT)): for i in range(len(EXPECTED_OUTPUT)):
assert EXPECTED_OUTPUT[i].startswith(output_tp[i]) assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
output_tp = do_sample(llm, minicpmv_lora_files, lora_id=2)
for i in range(len(EXPECTED_OUTPUT)):
assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
...@@ -6,6 +6,7 @@ import os ...@@ -6,6 +6,7 @@ import os
import vllm import vllm
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform
from ..utils import models_path_prefix from ..utils import models_path_prefix
MODEL_PATH = os.path.join(models_path_prefix, "mistralai/Mixtral-8x7B-Instruct-v0.1") MODEL_PATH = os.path.join(models_path_prefix, "mistralai/Mixtral-8x7B-Instruct-v0.1")
...@@ -33,7 +34,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, ...@@ -33,7 +34,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
@pytest.mark.parametrize("tp_size", [4]) @pytest.mark.parametrize("tp_size", [4])
def test_mixtral_lora(mixtral_lora_files, tp_size): def test_mixtral_lora(mixtral_lora_files, tp_size):
"""Original test, the LoRA model has the common target modules, not all""" """Original test, the LoRA model has the common target modules, not all"""
if torch.cuda.device_count() < tp_size: if torch.cuda.device_count(
) < tp_size and tp_size > 1 and current_platform.is_cuda_alike():
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
prompts = [ prompts = [
......
import json
import math
import shutil
import pytest
from vllm.config import LoRAConfig
from vllm.lora.peft_helper import PEFTHelper
# Invalid-adapter scenarios for test_peft_helper_error. Each entry is
# (name of the scratch sub-directory, overrides merged into
# adapter_config.json, pattern the raised ValueError must match).
ERROR_CASES = [
    ("test_rank", {"r": 1024}, "is greater than max_lora_rank"),
    ("test_bias", {"bias": "all"},
     "Adapter bias cannot be used without bias_enabled"),
    ("test_dora", {"use_dora": True}, "does not yet support DoRA"),
    ("test_modules_to_save", {"modules_to_save": ["lm_head"]},
     "only supports modules_to_save being None"),
]
def test_peft_helper_pass(long_context_lora_files_16k_1, tmp_path):
    """Check PEFTHelper on a valid adapter, then on an RSLoRA copy of it.

    The adapter at ``long_context_lora_files_16k_1`` is loaded and validated
    against a LoRAConfig, and its parsed rank, alpha, target modules and
    long-context fields are compared with known values. The adapter is then
    copied into ``tmp_path`` with ``use_rslora`` switched on in
    ``adapter_config.json`` and the resulting scaling factor is checked.
    """
    helper = PEFTHelper.from_local_dir(
        long_context_lora_files_16k_1,
        max_position_embeddings=4096,
    )
    lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)
    helper.validate_legal(lora_config)

    assert helper.r == 8
    assert helper.lora_alpha == 16
    expected_modules = [
        "q_proj",
        "v_proj",
        "k_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "embed_tokens",
        "lm_head",
    ]
    assert helper.target_modules == expected_modules
    assert helper.context_length == 16384
    assert helper.vllm_max_position_embeddings == 4096
    # Expected factor: context length over max position embeddings,
    # rounded up and compared as a float.
    length_ratio = (helper.context_length /
                    helper.vllm_max_position_embeddings)
    assert helper.vllm_long_context_scaling_factor == float(
        math.ceil(length_ratio))

    # RSLoRA: work on a private copy so the shared fixture stays untouched.
    rslora_dir = tmp_path / "test_rslora"
    shutil.copytree(long_context_lora_files_16k_1, rslora_dir)

    # Read the adapter config, enable RSLoRA, and write it back.
    config_file = rslora_dir / "adapter_config.json"
    adapter_config = json.loads(config_file.read_text())
    adapter_config["use_rslora"] = True
    config_file.write_text(json.dumps(adapter_config))

    helper = PEFTHelper.from_local_dir(rslora_dir,
                                       max_position_embeddings=4096)
    helper.validate_legal(lora_config)

    # With RSLoRA enabled the scaling is expected to be alpha / sqrt(r).
    expected_scaling = helper.lora_alpha / math.sqrt(helper.r)
    assert abs(helper.vllm_lora_scaling_factor - expected_scaling) < 1e-3
@pytest.mark.parametrize("test_name,config_change,expected_error", ERROR_CASES)
def test_peft_helper_error(
    sql_lora_files,
    tmp_path,
    test_name: str,
    config_change: dict,
    expected_error: str,
):
    """Check that an invalid adapter config is rejected with a ValueError.

    The adapter at ``sql_lora_files`` is copied to ``tmp_path / test_name``,
    ``config_change`` is merged into its ``adapter_config.json``, and loading
    plus validating the result must raise a ValueError whose message matches
    ``expected_error``.
    """
    scratch_dir = tmp_path / test_name
    shutil.copytree(sql_lora_files, scratch_dir)

    # Rewrite the copied config with the overrides for this case applied.
    config_file = scratch_dir / "adapter_config.json"
    patched_config = {**json.loads(config_file.read_text()), **config_change}
    config_file.write_text(json.dumps(patched_config))

    lora_config = LoRAConfig(max_lora_rank=16, max_cpu_loras=3, max_loras=2)

    # Loading and validation both sit inside the raises block, so an error
    # from either step satisfies the test.
    with pytest.raises(ValueError, match=expected_error):
        helper = PEFTHelper.from_local_dir(scratch_dir,
                                           max_position_embeddings=4096)
        helper.validate_legal(lora_config)
...@@ -4,19 +4,21 @@ hidden_sizes included in the LoRA models currently supported by vLLM. It tests ...@@ -4,19 +4,21 @@ hidden_sizes included in the LoRA models currently supported by vLLM. It tests
whether the corresponding Triton kernel can run normally when tensor parallelism whether the corresponding Triton kernel can run normally when tensor parallelism
is set to [1, 2, 4, 8, 16, 32, 64]. is set to [1, 2, 4, 8, 16, 32, 64].
""" """
from threading import Lock
import pytest import pytest
import torch import torch
from vllm.lora.ops.bgmv_expand import bgmv_expand import vllm.lora.ops.triton_ops # noqa: F401
from vllm.lora.ops.bgmv_expand_slice import bgmv_expand_slice from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
from vllm.lora.ops.bgmv_shrink import bgmv_shrink bgmv_shrink, sgmv_expand,
from vllm.lora.ops.sgmv_expand import sgmv_expand sgmv_expand_slice, sgmv_shrink)
from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.lora.ops.sgmv_shrink import sgmv_shrink
from vllm.platforms import current_platform from vllm.platforms import current_platform
from .utils import (generate_data, generate_data_for_expand_nslices, from .utils import (assert_close, generate_data,
ref_torch_groupgemm) generate_data_for_expand_nslices,
generate_data_for_nslices)
HIDDEN_SIZES = [ HIDDEN_SIZES = [
128, 128,
...@@ -110,16 +112,9 @@ DTYPES = [torch.float16, torch.bfloat16] ...@@ -110,16 +112,9 @@ DTYPES = [torch.float16, torch.bfloat16]
MAX_RANKS = [32] MAX_RANKS = [32]
SCALES = [0.5] SCALES = [0.5]
SEED = [0] SEED = [0]
CUDA_DEVICES = [f"cuda:{0}"] DEVICES = [f"cuda:{0}"]
def assert_close(a, b): _dict_lock = Lock()
rtol, atol = {
torch.float16: (6e-2, 6e-2),
torch.bfloat16: (6e-2, 6e-2),
torch.float32: (1e-2, 1e-2),
}[a.dtype]
torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
@pytest.mark.parametrize("batches", BATCHES) @pytest.mark.parametrize("batches", BATCHES)
...@@ -127,16 +122,18 @@ def assert_close(a, b): ...@@ -127,16 +122,18 @@ def assert_close(a, b):
@pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("rank", MAX_RANKS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("scaling", SCALES) @pytest.mark.parametrize("scaling", SCALES)
@pytest.mark.parametrize("nslices", [1, 2, 3])
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("op_type", ["shrink", "expand"])
@pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_punica_sgmv( def test_punica_sgmv(
batches: int, batches: int,
num_loras: int, num_loras: int,
rank: int, rank: int,
hidden_size: int, hidden_size: int,
scaling: float, scaling: float,
nslices: int,
dtype: torch.dtype, dtype: torch.dtype,
op_type: str, op_type: str,
seed: int, seed: int,
...@@ -148,19 +145,20 @@ def test_punica_sgmv( ...@@ -148,19 +145,20 @@ def test_punica_sgmv(
seq_length = 128 seq_length = 128
( (
inputs_tensor, inputs_tensor,
lora_weights, lora_weights_lst,
our_out_tensor, our_out_tensor,
ref_out_tensor, ref_out_tensor,
b_seq_start_loc, b_seq_start_loc,
lora_indices_tensor, lora_indices_tensor,
seq_len_tensor, seq_len_tensor,
indices, indices,
) = generate_data( ) = generate_data_for_nslices(
batches, batches,
hidden_size, hidden_size,
num_loras, num_loras,
rank, rank,
seq_length, seq_length,
nslices,
dtype, dtype,
op_type, op_type,
device, device,
...@@ -172,43 +170,85 @@ def test_punica_sgmv( ...@@ -172,43 +170,85 @@ def test_punica_sgmv(
else: else:
max_seq_length = max_seq_length.item() max_seq_length = max_seq_length.item()
if op_type == "shrink": if op_type == "shrink":
sgmv_shrink( # Preventing cache error pointer.
inputs_tensor, with _dict_lock:
lora_weights, _LORA_A_PTR_DICT.clear()
our_out_tensor, torch.ops.vllm.sgmv_shrink(
b_seq_start_loc, inputs_tensor,
seq_len_tensor, lora_weights_lst,
lora_indices_tensor, our_out_tensor,
batches, b_seq_start_loc,
max_seq_length, seq_len_tensor,
token_nums, lora_indices_tensor,
scaling, batches,
) max_seq_length,
token_nums,
scaling,
)
for index in range(nslices):
sgmv_shrink(
inputs_tensor,
lora_weights_lst[index],
ref_out_tensor[index],
b_seq_start_loc,
seq_len_tensor,
lora_indices_tensor,
batches,
max_seq_length,
token_nums,
scaling,
)
else: else:
sgmv_expand( with _dict_lock:
inputs_tensor, _LORA_B_PTR_DICT.clear()
lora_weights, torch.ops.vllm.sgmv_expand(
our_out_tensor, inputs_tensor,
b_seq_start_loc, lora_weights_lst,
seq_len_tensor, our_out_tensor,
lora_indices_tensor, b_seq_start_loc,
batches, seq_len_tensor,
max_seq_length, lora_indices_tensor,
token_nums, batches,
add_inputs=True, max_seq_length,
) token_nums,
ref_torch_groupgemm( offset_start=0,
ref_out_tensor, add_inputs=True,
inputs_tensor, )
lora_weights, if nslices == 1:
lora_indices_tensor, # Verify the torch's sgmv_expand op
seq_len_tensor, sgmv_expand(
batches, inputs_tensor[0],
scaling if op_type == "shrink" else 1.0, lora_weights_lst[0],
op_type, ref_out_tensor,
) b_seq_start_loc,
if op_type == "shrink": seq_len_tensor,
ref_out_tensor = ref_out_tensor.to(torch.float32) lora_indices_tensor,
batches,
max_seq_length,
token_nums,
add_inputs=True,
)
else:
slice_offset = 0
for index in range(nslices):
lora_weights = lora_weights_lst[index]
sgmv_expand_slice(
inputs_tensor[index],
lora_weights,
ref_out_tensor,
b_seq_start_loc,
seq_len_tensor,
lora_indices_tensor,
batches,
max_seq_length,
token_nums,
slice_offset,
hidden_size,
add_inputs=True,
)
slice_offset += hidden_size
assert_close(our_out_tensor, ref_out_tensor) assert_close(our_out_tensor, ref_out_tensor)
...@@ -220,7 +260,7 @@ def test_punica_sgmv( ...@@ -220,7 +260,7 @@ def test_punica_sgmv(
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("op_type", ["shrink", "expand"])
@pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_punica_bgmv( def test_punica_bgmv(
batches: int, batches: int,
num_loras: int, num_loras: int,
...@@ -256,31 +296,38 @@ def test_punica_bgmv( ...@@ -256,31 +296,38 @@ def test_punica_bgmv(
device, device,
) )
if op_type == "shrink": if op_type == "shrink":
bgmv_shrink( torch.ops.vllm.bgmv_shrink(
inputs_tensor, inputs_tensor,
lora_weights, lora_weights,
our_out_tensor, our_out_tensor,
indices, indices,
scaling, scaling,
) )
bgmv_shrink(
inputs_tensor,
lora_weights,
ref_out_tensor,
indices,
scaling,
)
else: else:
bgmv_expand( torch.ops.vllm.bgmv_expand(
inputs_tensor, inputs_tensor,
lora_weights, lora_weights,
our_out_tensor, our_out_tensor,
indices, indices,
add_inputs=True, add_inputs=True,
) )
ref_torch_groupgemm( bgmv_expand(
ref_out_tensor, inputs_tensor,
inputs_tensor, lora_weights,
lora_weights, ref_out_tensor,
lora_indices_tensor, indices,
seq_len_tensor, add_inputs=True,
batches, )
scaling if op_type == "shrink" else 1.0,
op_type,
)
if op_type == "shrink": if op_type == "shrink":
ref_out_tensor = ref_out_tensor.to(torch.float32) ref_out_tensor = ref_out_tensor.to(torch.float32)
assert_close(our_out_tensor, ref_out_tensor) assert_close(our_out_tensor, ref_out_tensor)
...@@ -292,25 +339,22 @@ def test_punica_bgmv( ...@@ -292,25 +339,22 @@ def test_punica_bgmv(
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("nslices", [2, 3]) @pytest.mark.parametrize("nslices", [2, 3])
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"])
@pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_punica_expand_nslices( def test_punica_bgmv_expand_nslices(
batches: int, batches: int,
num_loras: int, num_loras: int,
rank: int, rank: int,
hidden_size: int, hidden_size: int,
nslices: int, nslices: int,
dtype: torch.dtype, dtype: torch.dtype,
op_type: str,
seed: int, seed: int,
device: str, device: str,
): ):
torch.set_default_device(device) torch.set_default_device(device)
current_platform.seed_everything(seed) current_platform.seed_everything(seed)
seq_length = 128 if op_type == "sgmv" else 1 seq_length = 1
( (
inputs_tensor, inputs_tensor,
lora_weights_lst, lora_weights_lst,
...@@ -330,50 +374,26 @@ def test_punica_expand_nslices( ...@@ -330,50 +374,26 @@ def test_punica_expand_nslices(
nslices, nslices,
device, device,
) )
max_seq_length = seq_len_tensor.max()
token_nums = seq_len_tensor.sum().item()
if isinstance(max_seq_length, tuple):
max_seq_length = max_seq_length[0].item()
else:
max_seq_length = max_seq_length.item()
slice_offset = 0 slice_offset = 0
for index in range(nslices): for index in range(nslices):
lora_weights = lora_weights_lst[index] lora_weights = lora_weights_lst[index]
if op_type == "sgmv": torch.ops.vllm.bgmv_expand_slice(
sgmv_expand_slice(
inputs_tensor,
lora_weights,
our_outputs,
b_seq_start_loc,
seq_len_tensor,
lora_indices_tensor,
batches,
max_seq_length,
token_nums,
slice_offset,
hidden_size,
add_inputs=True,
)
else:
bgmv_expand_slice(
inputs_tensor,
lora_weights,
our_outputs,
indices,
slice_offset,
slice_size=hidden_size,
add_inputs=True,
)
ref_torch_groupgemm(
ref_outputs[:, slice_offset:slice_offset + hidden_size],
inputs_tensor, inputs_tensor,
lora_weights, lora_weights,
lora_indices_tensor, our_outputs,
seq_len_tensor, indices,
batches, slice_offset,
1.0, slice_size=hidden_size,
op_type="expand", add_inputs=True,
)
bgmv_expand_slice(
inputs_tensor,
lora_weights,
ref_outputs,
indices,
slice_offset,
slice_size=hidden_size,
add_inputs=True,
) )
slice_offset += hidden_size slice_offset += hidden_size
......
...@@ -3,22 +3,24 @@ This script is mainly used to test whether Triton kernels can run normally ...@@ -3,22 +3,24 @@ This script is mainly used to test whether Triton kernels can run normally
under different conditions, including various batches, numbers of LoRA, and under different conditions, including various batches, numbers of LoRA, and
maximum ranks. maximum ranks.
""" """
from threading import Lock
import pytest import pytest
import torch import torch
# Enable custom op register # Enable custom op register
import vllm.lora.ops.bgmv_expand import vllm.lora.ops.triton_ops # noqa: F401
import vllm.lora.ops.bgmv_expand_slice from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
import vllm.lora.ops.bgmv_shrink bgmv_shrink, sgmv_expand,
import vllm.lora.ops.sgmv_expand sgmv_expand_slice, sgmv_shrink)
import vllm.lora.ops.sgmv_expand_slice from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
import vllm.lora.ops.sgmv_shrink # noqa: F401
from vllm.platforms import current_platform from vllm.platforms import current_platform
from .utils import (generate_data, generate_data_for_expand_nslices, from .utils import (assert_close, generate_data,
ref_torch_groupgemm) generate_data_for_expand_nslices,
generate_data_for_nslices)
HIDDEN_SIZES = [1024] HIDDEN_SIZES = [1024] # [2049]
BATCHES = [1, 4, 16, 32] BATCHES = [1, 4, 16, 32]
NUM_LORA = [1, 8, 32, 128] NUM_LORA = [1, 8, 32, 128]
...@@ -26,26 +28,9 @@ DTYPES = [torch.float16, torch.bfloat16] ...@@ -26,26 +28,9 @@ DTYPES = [torch.float16, torch.bfloat16]
MAX_RANKS = [1, 4, 8, 16, 32, 64, 128, 256] MAX_RANKS = [1, 4, 8, 16, 32, 64, 128, 256]
SCALES = [0.5] SCALES = [0.5]
SEED = [0] SEED = [0]
CUDA_DEVICES = [f"cuda:{0}"] DEVICES = [f"cuda:{0}"]
def assert_close(a, b):
rtol, atol = {
torch.float16: (6e-2, 6e-2),
torch.bfloat16: (6e-2, 6e-2),
torch.float32: (1e-2, 1e-2),
}[a.dtype]
torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
# Unlike test_punica_sizes.py, we directly utilize custom op for _dict_lock = Lock()
# testing, which verifies the correct registration of these ops.
bgmv_expand = torch.ops.vllm.bgmv_expand
bgmv_expand_slice = torch.ops.vllm.bgmv_expand_slice
bgmv_shrink = torch.ops.vllm.bgmv_shrink
sgmv_expand = torch.ops.vllm.sgmv_expand
sgmv_expand_slice = torch.ops.vllm.sgmv_expand_slice
sgmv_shrink = torch.ops.vllm.sgmv_shrink
@pytest.mark.parametrize("batches", BATCHES) @pytest.mark.parametrize("batches", BATCHES)
...@@ -53,16 +38,18 @@ sgmv_shrink = torch.ops.vllm.sgmv_shrink ...@@ -53,16 +38,18 @@ sgmv_shrink = torch.ops.vllm.sgmv_shrink
@pytest.mark.parametrize("rank", MAX_RANKS) @pytest.mark.parametrize("rank", MAX_RANKS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("scaling", SCALES) @pytest.mark.parametrize("scaling", SCALES)
@pytest.mark.parametrize("nslices", [1, 2, 3])
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("op_type", ["shrink", "expand"])
@pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_punica_sgmv( def test_punica_sgmv(
batches: int, batches: int,
num_loras: int, num_loras: int,
rank: int, rank: int,
hidden_size: int, hidden_size: int,
scaling: float, scaling: float,
nslices: int,
dtype: torch.dtype, dtype: torch.dtype,
op_type: str, op_type: str,
seed: int, seed: int,
...@@ -74,19 +61,20 @@ def test_punica_sgmv( ...@@ -74,19 +61,20 @@ def test_punica_sgmv(
seq_length = 128 seq_length = 128
( (
inputs_tensor, inputs_tensor,
lora_weights, lora_weights_lst,
our_out_tensor, our_out_tensor,
ref_out_tensor, ref_out_tensor,
b_seq_start_loc, b_seq_start_loc,
lora_indices_tensor, lora_indices_tensor,
seq_len_tensor, seq_len_tensor,
indices, indices,
) = generate_data( ) = generate_data_for_nslices(
batches, batches,
hidden_size, hidden_size,
num_loras, num_loras,
rank, rank,
seq_length, seq_length,
nslices,
dtype, dtype,
op_type, op_type,
device, device,
...@@ -98,43 +86,85 @@ def test_punica_sgmv( ...@@ -98,43 +86,85 @@ def test_punica_sgmv(
else: else:
max_seq_length = max_seq_length.item() max_seq_length = max_seq_length.item()
if op_type == "shrink": if op_type == "shrink":
sgmv_shrink( # Preventing cache error pointer.
inputs_tensor, with _dict_lock:
lora_weights, _LORA_A_PTR_DICT.clear()
our_out_tensor, torch.ops.vllm.sgmv_shrink(
b_seq_start_loc, inputs_tensor,
seq_len_tensor, lora_weights_lst,
lora_indices_tensor, our_out_tensor,
batches, b_seq_start_loc,
max_seq_length, seq_len_tensor,
token_nums, lora_indices_tensor,
scaling, batches,
) max_seq_length,
token_nums,
scaling,
)
for index in range(nslices):
sgmv_shrink(
inputs_tensor,
lora_weights_lst[index],
ref_out_tensor[index],
b_seq_start_loc,
seq_len_tensor,
lora_indices_tensor,
batches,
max_seq_length,
token_nums,
scaling,
)
else: else:
sgmv_expand( with _dict_lock:
inputs_tensor, _LORA_B_PTR_DICT.clear()
lora_weights, torch.ops.vllm.sgmv_expand(
our_out_tensor, inputs_tensor,
b_seq_start_loc, lora_weights_lst,
seq_len_tensor, our_out_tensor,
lora_indices_tensor, b_seq_start_loc,
batches, seq_len_tensor,
max_seq_length, lora_indices_tensor,
token_nums, batches,
add_inputs=True, max_seq_length,
) token_nums,
ref_torch_groupgemm( offset_start=0,
ref_out_tensor, add_inputs=True,
inputs_tensor, )
lora_weights, slice_offset = 0
lora_indices_tensor, if nslices == 1:
seq_len_tensor, # Verify the torch's sgmv_expand op
batches, sgmv_expand(
scaling if op_type == "shrink" else 1.0, inputs_tensor[0],
op_type, lora_weights_lst[0],
) ref_out_tensor,
if op_type == "shrink": b_seq_start_loc,
ref_out_tensor = ref_out_tensor.to(torch.float32) seq_len_tensor,
lora_indices_tensor,
batches,
max_seq_length,
token_nums,
add_inputs=True,
)
else:
for index in range(nslices):
lora_weights = lora_weights_lst[index]
sgmv_expand_slice(
inputs_tensor[index],
lora_weights,
ref_out_tensor,
b_seq_start_loc,
seq_len_tensor,
lora_indices_tensor,
batches,
max_seq_length,
token_nums,
slice_offset,
hidden_size,
add_inputs=True,
)
slice_offset += hidden_size
assert_close(our_out_tensor, ref_out_tensor) assert_close(our_out_tensor, ref_out_tensor)
...@@ -146,7 +176,7 @@ def test_punica_sgmv( ...@@ -146,7 +176,7 @@ def test_punica_sgmv(
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["shrink", "expand"]) @pytest.mark.parametrize("op_type", ["shrink", "expand"])
@pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_punica_bgmv( def test_punica_bgmv(
batches: int, batches: int,
num_loras: int, num_loras: int,
...@@ -158,7 +188,6 @@ def test_punica_bgmv( ...@@ -158,7 +188,6 @@ def test_punica_bgmv(
seed: int, seed: int,
device: str, device: str,
): ):
torch.set_default_device(device) torch.set_default_device(device)
current_platform.seed_everything(seed) current_platform.seed_everything(seed)
...@@ -183,32 +212,38 @@ def test_punica_bgmv( ...@@ -183,32 +212,38 @@ def test_punica_bgmv(
device, device,
) )
if op_type == "shrink": if op_type == "shrink":
bgmv_shrink( torch.ops.vllm.bgmv_shrink(
inputs_tensor, inputs_tensor,
lora_weights, lora_weights,
our_out_tensor, our_out_tensor,
indices, indices,
scaling, scaling,
) )
else:
bgmv_expand( bgmv_shrink(
inputs_tensor,
lora_weights,
ref_out_tensor,
indices,
scaling,
)
else:
torch.ops.vllm.bgmv_expand(
inputs_tensor, inputs_tensor,
lora_weights, lora_weights,
our_out_tensor, our_out_tensor,
indices, indices,
add_inputs=True, add_inputs=True,
) )
ref_torch_groupgemm( bgmv_expand(
ref_out_tensor, inputs_tensor,
inputs_tensor, lora_weights,
lora_weights, ref_out_tensor,
lora_indices_tensor, indices,
seq_len_tensor, add_inputs=True,
batches, )
scaling if op_type == "shrink" else 1.0,
op_type,
)
if op_type == "shrink": if op_type == "shrink":
ref_out_tensor = ref_out_tensor.to(torch.float32) ref_out_tensor = ref_out_tensor.to(torch.float32)
assert_close(our_out_tensor, ref_out_tensor) assert_close(our_out_tensor, ref_out_tensor)
...@@ -220,24 +255,22 @@ def test_punica_bgmv( ...@@ -220,24 +255,22 @@ def test_punica_bgmv(
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("nslices", [2, 3]) @pytest.mark.parametrize("nslices", [2, 3])
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["sgmv", "bgmv"])
@pytest.mark.parametrize("seed", SEED) @pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", DEVICES)
def test_punica_expand_nslices( def test_punica_bgmv_expand_nslices(
batches: int, batches: int,
num_loras: int, num_loras: int,
rank: int, rank: int,
hidden_size: int, hidden_size: int,
nslices: int, nslices: int,
dtype: torch.dtype, dtype: torch.dtype,
op_type: str,
seed: int, seed: int,
device: str, device: str,
): ):
torch.set_default_device(device) torch.set_default_device(device)
current_platform.seed_everything(seed) current_platform.seed_everything(seed)
seq_length = 128 if op_type == "sgmv" else 1 seq_length = 1
( (
inputs_tensor, inputs_tensor,
lora_weights_lst, lora_weights_lst,
...@@ -257,49 +290,26 @@ def test_punica_expand_nslices( ...@@ -257,49 +290,26 @@ def test_punica_expand_nslices(
nslices, nslices,
device, device,
) )
max_seq_length = seq_len_tensor.max()
token_nums = seq_len_tensor.sum().item()
if isinstance(max_seq_length, tuple):
max_seq_length = max_seq_length[0].item()
else:
max_seq_length = max_seq_length.item()
slice_offset = 0 slice_offset = 0
for index in range(nslices): for index in range(nslices):
lora_weights = lora_weights_lst[index] lora_weights = lora_weights_lst[index]
if op_type == "sgmv": torch.ops.vllm.bgmv_expand_slice(
sgmv_expand_slice(
inputs_tensor,
lora_weights,
our_outputs,
b_seq_start_loc,
seq_len_tensor,
lora_indices_tensor,
batches,
max_seq_length,
token_nums,
slice_offset,
hidden_size,
add_inputs=True,
)
else:
bgmv_expand_slice(
inputs_tensor,
lora_weights,
our_outputs,
indices,
slice_offset,
slice_size=hidden_size,
add_inputs=True,
)
ref_torch_groupgemm(
ref_outputs[:, slice_offset:slice_offset + hidden_size],
inputs_tensor, inputs_tensor,
lora_weights, lora_weights,
lora_indices_tensor, our_outputs,
seq_len_tensor, indices,
batches, slice_offset,
1.0, slice_size=hidden_size,
op_type="expand", add_inputs=True,
)
bgmv_expand_slice(
inputs_tensor,
lora_weights,
ref_outputs,
indices,
slice_offset,
slice_size=hidden_size,
add_inputs=True,
) )
slice_offset += hidden_size slice_offset += hidden_size
......
...@@ -75,7 +75,8 @@ def do_sample(llm: vllm.LLM, ...@@ -75,7 +75,8 @@ def do_sample(llm: vllm.LLM,
@pytest.mark.parametrize("tp_size", [1]) @pytest.mark.parametrize("tp_size", [1])
def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model, def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
tp_size): tp_size):
if num_gpus_available < tp_size: if num_gpus_available < tp_size and \
tp_size > 1 and current_platform.is_cuda_alike():
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
llm = vllm.LLM( llm = vllm.LLM(
......
...@@ -7,7 +7,7 @@ from vllm.assets.image import ImageAsset ...@@ -7,7 +7,7 @@ from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform from vllm.platforms import current_platform
MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct" MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
PROMPT_TEMPLATE = ( PROMPT_TEMPLATE = (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>" "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
...@@ -49,16 +49,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: ...@@ -49,16 +49,15 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
# Print the outputs. # Print the outputs.
generated_texts: List[str] = [] generated_texts: List[str] = []
for output in outputs: for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text.strip() generated_text = output.outputs[0].text.strip()
generated_texts.append(generated_text) generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Generated text: {generated_text!r}")
return generated_texts return generated_texts
@pytest.mark.xfail(current_platform.is_rocm(), @pytest.mark.xfail(
reason="Qwen2-VL dependency xformers incompatible with ROCm" current_platform.is_rocm(),
) reason="Qwen2-VL dependency xformers incompatible with ROCm")
def test_qwen2vl_lora(qwen2vl_lora_files): def test_qwen2vl_lora(qwen2vl_lora_files):
llm = vllm.LLM( llm = vllm.LLM(
MODEL_PATH, MODEL_PATH,
......
...@@ -18,11 +18,13 @@ class DummyLoRAManager: ...@@ -18,11 +18,13 @@ class DummyLoRAManager:
def get_module_lora(self, module_name: str) -> LoRALayerWeights: def get_module_lora(self, module_name: str) -> LoRALayerWeights:
return self._loras[module_name] return self._loras[module_name]
def init_random_lora(self, def init_random_lora(
module_name: str, self,
weight: torch.Tensor, module_name: str,
rank: int = 8, weight: torch.Tensor,
generate_embeddings_tensor: int = 0): rank: int = 8,
generate_embeddings_tensor: int = 0,
):
lora = LoRALayerWeights( lora = LoRALayerWeights(
module_name, module_name,
rank=rank, rank=rank,
...@@ -35,21 +37,25 @@ class DummyLoRAManager: ...@@ -35,21 +37,25 @@ class DummyLoRAManager:
device=self._device), device=self._device),
) )
if generate_embeddings_tensor: if generate_embeddings_tensor:
lora.embeddings_tensor = torch.rand(5, lora.embeddings_tensor = torch.rand(
generate_embeddings_tensor, 5,
dtype=weight.dtype, generate_embeddings_tensor,
device=self._device) dtype=weight.dtype,
device=self._device,
)
self.set_module_lora(module_name, lora) self.set_module_lora(module_name, lora)
return lora return lora
def init_lora(self, def init_lora(
module_name: str, self,
input_dim: int, module_name: str,
output_dim: int, input_dim: int,
rank=8, output_dim: int,
noop=False, rank=8,
embeddings_tensor=None): noop=False,
embeddings_tensor=None,
):
lora = LoRALayerWeights( lora = LoRALayerWeights(
module_name, module_name,
rank=rank, rank=rank,
...@@ -98,35 +104,16 @@ def assert_close(a, b): ...@@ -98,35 +104,16 @@ def assert_close(a, b):
torch.testing.assert_close(a, b, rtol=rtol, atol=atol) torch.testing.assert_close(a, b, rtol=rtol, atol=atol)
def ref_torch_groupgemm( def generate_data(
out_tensor,
inputs,
lora_weights,
lora_indices_tensor,
seq_len_tensor,
batches, batches,
scaling, hidden_size,
lora_nums,
max_rank,
seq_length,
dtype,
op_type, op_type,
) -> torch.Tensor: device,
out_list = [] ):
current_offset = 0
for lora_index, b_length in zip(range(batches), seq_len_tensor):
input_weight = inputs[current_offset:b_length + current_offset, :]
current_offset += b_length
lora_weight = lora_weights[lora_indices_tensor[lora_index]]
result = torch.nn.functional.linear(input_weight, lora_weight)
result *= scaling
out_list.append(result)
cat_result = torch.cat(out_list, dim=0)
if op_type == "expand":
out_tensor += cat_result
else:
out_tensor.copy_(cat_result)
return
def generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, dtype,
op_type, device):
seq_len_tensor = torch.randint(seq_length, seq_length + 1, seq_len_tensor = torch.randint(seq_length, seq_length + 1,
(batches, )).to(device) (batches, )).to(device)
b_seq_start_loc = torch.cumsum( b_seq_start_loc = torch.cumsum(
...@@ -187,8 +174,16 @@ def generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, dtype, ...@@ -187,8 +174,16 @@ def generate_data(batches, hidden_size, lora_nums, max_rank, seq_length, dtype,
) )
def generate_data_for_expand_nslices(batches, hidden_size, lora_nums, max_rank, def generate_data_for_expand_nslices(
seq_length, dtype, nslices, device): batches,
hidden_size,
lora_nums,
max_rank,
seq_length,
dtype,
nslices,
device,
):
seq_len_tensor = torch.randint(seq_length, seq_length + 1, seq_len_tensor = torch.randint(seq_length, seq_length + 1,
(batches, )).to(device) (batches, )).to(device)
b_seq_start_loc = torch.cumsum( b_seq_start_loc = torch.cumsum(
...@@ -221,7 +216,87 @@ def generate_data_for_expand_nslices(batches, hidden_size, lora_nums, max_rank, ...@@ -221,7 +216,87 @@ def generate_data_for_expand_nslices(batches, hidden_size, lora_nums, max_rank,
for b_id in range(batches): for b_id in range(batches):
lora_index = lora_indices_tensor[b_id] lora_index = lora_indices_tensor[b_id]
indices[current_offset:current_offset + indices[current_offset:current_offset +
seq_len_tensor[b_id]] = lora_index.item() seq_len_tensor[b_id]] = (lora_index.item())
current_offset += seq_len_tensor[b_id].item()
lora_indices_tensor = lora_indices_tensor.to(device)
return (
inputs_tensor,
lora_weights_lst,
our_out_tensor,
ref_out_tensor,
b_seq_start_loc,
lora_indices_tensor,
seq_len_tensor,
indices,
)
def generate_data_for_nslices(
batches,
hidden_size,
lora_nums,
max_rank,
seq_length,
nslices,
dtype,
op_type,
device,
):
seq_len_tensor = torch.randint(seq_length, seq_length + 1,
(batches, )).to(device)
b_seq_start_loc = torch.cumsum(
torch.tensor([0] + seq_len_tensor[:-1].tolist(), dtype=torch.long),
dim=0,
).to(device)
total_tokens = seq_len_tensor.sum()
lora_weights_lst = []
if op_type == "shrink":
inputs_tensor = torch.rand((total_tokens, hidden_size),
dtype=dtype).to(device)
for _ in range(nslices):
if op_type == "shrink":
lora_weights_lst.append(
torch.rand(
(lora_nums, max_rank, hidden_size), # col-major
dtype=dtype,
).to(device))
# NOTE: the shrink kernel uses torch.float32 as its output type.
# The shrink op needs atomic_add, so the output is initialized to 0.
our_out_tensor = torch.zeros(
(nslices, total_tokens, max_rank),
dtype=torch.float32,
).to(device)
else:
inputs_tensor = torch.rand(
(nslices, total_tokens, max_rank),
dtype=dtype,
).to(device)
for _ in range(nslices):
lora_weights_lst.append(
torch.rand(
(lora_nums, hidden_size, max_rank), # col-major
dtype=dtype,
).to(device))
# The expand op needs to compute y += a @ lora_b, so the output is
# initialized randomly.
our_out_tensor = torch.rand((total_tokens, hidden_size * nslices),
dtype=dtype).to(device)
# Ensure the same input.
ref_out_tensor = our_out_tensor.clone()
lora_indices_tensor = torch.randint(0,
lora_nums - 1 if lora_nums > 1 else 1,
(batches, ))
indices = torch.zeros((total_tokens), dtype=torch.long).to(device)
current_offset = 0
for b_id in range(batches):
lora_index = lora_indices_tensor[b_id]
indices[current_offset:current_offset +
seq_len_tensor[b_id]] = (lora_index.item())
current_offset += seq_len_tensor[b_id].item() current_offset += seq_len_tensor[b_id].item()
lora_indices_tensor = lora_indices_tensor.to(device) lora_indices_tensor = lora_indices_tensor.to(device)
......
...@@ -2,7 +2,7 @@ import os ...@@ -2,7 +2,7 @@ import os
import pytest import pytest
from vllm.model_executor.layers.pooler import PoolingType from vllm.model_executor.layers.pooler import CLSPool, PoolingType
from vllm.model_executor.models.bert import BertEmbeddingModel from vllm.model_executor.models.bert import BertEmbeddingModel
from vllm.model_executor.models.roberta import RobertaEmbeddingModel from vllm.model_executor.models.roberta import RobertaEmbeddingModel
from vllm.platforms import current_platform from vllm.platforms import current_platform
...@@ -26,13 +26,12 @@ def test_model_loading_with_params(vllm_runner): ...@@ -26,13 +26,12 @@ def test_model_loading_with_params(vllm_runner):
with vllm_runner(model_name=MODEL_NAME, with vllm_runner(model_name=MODEL_NAME,
revision=REVISION, revision=REVISION,
dtype="float16", dtype="float16",
max_model_len=MAX_MODEL_LEN) as model: max_model_len=MAX_MODEL_LEN) as vllm_model:
output = model.encode("Write a short story about a robot that" output = vllm_model.encode("Write a short story about a robot that"
" dreams for the first time.\n") " dreams for the first time.\n")
model_config = model.model.llm_engine.model_config model_config = vllm_model.model.llm_engine.model_config
model_tokenizer = vllm_model.model.llm_engine.tokenizer
model_tokenizer = model.model.llm_engine.tokenizer
# asserts on the bert model config file # asserts on the bert model config file
assert model_config.encoder_config["max_seq_length"] == 512 assert model_config.encoder_config["max_seq_length"] == 512
...@@ -47,11 +46,13 @@ def test_model_loading_with_params(vllm_runner): ...@@ -47,11 +46,13 @@ def test_model_loading_with_params(vllm_runner):
assert model_tokenizer.tokenizer_config["do_lower_case"] assert model_tokenizer.tokenizer_config["do_lower_case"]
assert model_tokenizer.tokenizer.model_max_length == 512 assert model_tokenizer.tokenizer.model_max_length == 512
model = model.model.llm_engine.model_executor\ def check_model(model):
.driver_worker.model_runner.model assert isinstance(model, BertEmbeddingModel)
assert isinstance(model, BertEmbeddingModel) assert model._pooler.pooling_type == PoolingType.CLS
assert model._pooler.pooling_type == PoolingType.CLS assert model._pooler.normalize
assert model._pooler.normalize
vllm_model.apply_model(check_model)
# assert output # assert output
assert output assert output
...@@ -65,13 +66,12 @@ def test_roberta_model_loading_with_params(vllm_runner): ...@@ -65,13 +66,12 @@ def test_roberta_model_loading_with_params(vllm_runner):
with vllm_runner(model_name=MODEL_NAME_ROBERTA, with vllm_runner(model_name=MODEL_NAME_ROBERTA,
revision=REVISION_ROBERTA, revision=REVISION_ROBERTA,
dtype="float16", dtype="float16",
max_model_len=MAX_MODEL_LEN) as model: max_model_len=MAX_MODEL_LEN) as vllm_model:
output = model.encode("Write a short story about a robot that" output = vllm_model.encode("Write a short story about a robot that"
" dreams for the first time.\n") " dreams for the first time.\n")
model_config = model.model.llm_engine.model_config model_config = vllm_model.model.llm_engine.model_config
model_tokenizer = vllm_model.model.llm_engine.tokenizer
model_tokenizer = model.model.llm_engine.tokenizer
# asserts on the bert model config file # asserts on the bert model config file
assert model_config.encoder_config["max_seq_length"] == 512 assert model_config.encoder_config["max_seq_length"] == 512
...@@ -85,11 +85,38 @@ def test_roberta_model_loading_with_params(vllm_runner): ...@@ -85,11 +85,38 @@ def test_roberta_model_loading_with_params(vllm_runner):
assert model_tokenizer.tokenizer_id == os.path.join(models_path_prefix, "intfloat/multilingual-e5-large") assert model_tokenizer.tokenizer_id == os.path.join(models_path_prefix, "intfloat/multilingual-e5-large")
assert not model_tokenizer.tokenizer_config["do_lower_case"] assert not model_tokenizer.tokenizer_config["do_lower_case"]
model = model.model.llm_engine.model_executor\ def check_model(model):
.driver_worker.model_runner.model assert isinstance(model, RobertaEmbeddingModel)
assert isinstance(model, RobertaEmbeddingModel) assert model._pooler.pooling_type == PoolingType.MEAN
assert model._pooler.pooling_type == PoolingType.MEAN assert model._pooler.normalize
assert model._pooler.normalize
vllm_model.apply_model(check_model)
# assert output # assert output
assert output assert output
@pytest.mark.skipif(current_platform.is_rocm(),
                    reason="Xformers backend is not supported on ROCm.")
def test_facebook_roberta_model_loading_with_params(vllm_runner):
    """
    Check that the roberta-base checkpoint, which has no lm_head, loads.

    The test encodes one prompt, verifies the tokenizer id reported by the
    engine, and then inspects the loaded model through ``apply_model``.
    """
    # NOTE(review): the sibling roberta test compares the tokenizer id against
    # os.path.join(models_path_prefix, ...); confirm whether this hub id
    # should follow the same convention.
    model_name = "FacebookAI/roberta-base"

    def check_loaded(loaded):
        # The model must be the embedding variant, carry no lm_head
        # attribute, and pool with CLSPool.
        assert isinstance(loaded, RobertaEmbeddingModel)
        assert not hasattr(loaded, "lm_head")
        assert isinstance(loaded._pooler, CLSPool)

    runner_kwargs = dict(
        model_name=model_name,
        dtype="float16",
        max_model_len=MAX_MODEL_LEN,
    )
    with vllm_runner(**runner_kwargs) as vllm_model:
        output = vllm_model.encode("Write a short story about a robot that"
                                   " dreams for the first time.\n")

        engine_tokenizer = vllm_model.model.llm_engine.tokenizer
        assert engine_tokenizer.tokenizer_id == model_name

        vllm_model.apply_model(check_loaded)

        # encode() must have produced a non-empty result.
        assert output
...@@ -240,8 +240,8 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str, ...@@ -240,8 +240,8 @@ def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str,
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_online_inference(client, audio_assets): async def test_online_serving(client, audio_assets):
"""Exercises online inference with/without chunked prefill enabled.""" """Exercises online serving with/without chunked prefill enabled."""
messages = [{ messages = [{
"role": "role":
......
...@@ -20,18 +20,17 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true" ...@@ -20,18 +20,17 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
@pytest.mark.skipif(not is_quant_method_supported("fp8"), @pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.") reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize( @pytest.mark.parametrize(
"kv_cache_dtype,base_model,test_model,scale_path", "kv_cache_dtype,base_model,test_model",
[ [
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors. # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
("fp8_e4m3", os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), ("fp8_e4m3", os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
os.path.join(models_path_prefix, "nm-testing/Llama-3.2-1B-Instruct-FP8-KV"), None), os.path.join(models_path_prefix, "nm-testing/Llama-3.2-1B-Instruct-FP8-KV")),
# Test FP16 checkpoint w. fp8_e5m2 kv-cache. # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
("fp8_e5m2", os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), ("fp8_e5m2", os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), None), os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")),
# Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json. # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
("fp8_e4m3", os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"), ("fp8_e4m3", os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"), os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"))
"./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
]) ])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens # Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4]) @pytest.mark.parametrize("max_tokens", [4])
...@@ -49,7 +48,6 @@ def test_models( ...@@ -49,7 +48,6 @@ def test_models(
kv_cache_dtype: str, kv_cache_dtype: str,
base_model: str, base_model: str,
test_model: str, test_model: str,
scale_path: Optional[str],
max_tokens: int, max_tokens: int,
enforce_eager: bool, enforce_eager: bool,
backend: str, backend: str,
...@@ -77,10 +75,6 @@ def test_models( ...@@ -77,10 +75,6 @@ def test_models(
baseline_outputs = vllm_model.generate_greedy_logprobs( baseline_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS) example_prompts, max_tokens, NUM_LOG_PROBS)
extra_kwargs = {}
if scale_path is not None:
extra_kwargs["quantization_param_path"] = scale_path
with vllm_runner( with vllm_runner(
test_model, test_model,
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
...@@ -88,7 +82,6 @@ def test_models( ...@@ -88,7 +82,6 @@ def test_models(
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype, kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc, disable_async_output_proc=disable_async_output_proc,
**extra_kwargs,
) as vllm_model: ) as vllm_model:
test_outputs = vllm_model.generate_greedy_logprobs( test_outputs = vllm_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS) example_prompts, max_tokens, NUM_LOG_PROBS)
......
...@@ -4,6 +4,7 @@ Note: To pass the test, quantization higher than Q4 should be used ...@@ -4,6 +4,7 @@ Note: To pass the test, quantization higher than Q4 should be used
""" """
import os import os
from typing import List, NamedTuple, Type
import pytest import pytest
from huggingface_hub import hf_hub_download from huggingface_hub import hf_hub_download
...@@ -11,6 +12,7 @@ from transformers import AutoTokenizer ...@@ -11,6 +12,7 @@ from transformers import AutoTokenizer
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from ....conftest import VllmRunner
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
from ....utils import models_path_prefix from ....utils import models_path_prefix
...@@ -19,31 +21,78 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true" ...@@ -19,31 +21,78 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
MAX_MODEL_LEN = 1024 MAX_MODEL_LEN = 1024
class GGUFTestConfig(NamedTuple):
    """Describes one GGUF test case: a reference model and its GGUF file."""

    # Identifier of the unquantized reference model.
    original_model: str
    # Repository that hosts the GGUF checkpoint.
    gguf_repo: str
    # Name of the GGUF file inside ``gguf_repo``.
    gguf_filename: str

    @property
    def gguf_model(self):
        """Return whatever ``hf_hub_download`` yields for this GGUF file.

        The download call is made on every access; nothing is cached here.
        """
        repo = self.gguf_repo
        wanted_file = self.gguf_filename
        return hf_hub_download(repo, filename=wanted_file)
def _prefixed(name):
    """Join *name* onto ``models_path_prefix``."""
    return os.path.join(models_path_prefix, name)


# One GGUFTestConfig per model family. Every field is routed through
# models_path_prefix.
# NOTE(review): gguf_filename is prefixed as well before being passed to
# hf_hub_download(filename=...); confirm that this is intended.
LLAMA_CONFIG = GGUFTestConfig(
    original_model=_prefixed("meta-llama/Llama-3.2-1B-Instruct"),
    gguf_repo=_prefixed("bartowski/Llama-3.2-1B-Instruct-GGUF"),
    gguf_filename=_prefixed("Llama-3.2-1B-Instruct-IQ4_XS.gguf"),
)

QWEN2_CONFIG = GGUFTestConfig(
    original_model=_prefixed("Qwen/Qwen2.5-1.5B-Instruct"),
    gguf_repo=_prefixed("Qwen/Qwen2.5-1.5B-Instruct-GGUF"),
    gguf_filename=_prefixed("qwen2.5-1.5b-instruct-q6_k.gguf"),
)

PHI3_CONFIG = GGUFTestConfig(
    original_model=_prefixed("microsoft/Phi-3.5-mini-instruct"),
    gguf_repo=_prefixed("bartowski/Phi-3.5-mini-instruct-GGUF"),
    gguf_filename=_prefixed("Phi-3.5-mini-instruct-IQ4_XS.gguf"),
)

GPT2_CONFIG = GGUFTestConfig(
    original_model=_prefixed("openai-community/gpt2-large"),
    gguf_repo=_prefixed("QuantFactory/gpt2-large-GGUF"),
    gguf_filename=_prefixed("gpt2-large.Q4_K_M.gguf"),
)

STABLELM_CONFIG = GGUFTestConfig(
    original_model=_prefixed("stabilityai/stablelm-3b-4e1t"),
    gguf_repo=_prefixed("afrideva/stablelm-3b-4e1t-GGUF"),
    gguf_filename=_prefixed("stablelm-3b-4e1t.q4_k_m.gguf"),
)

# Defined but not listed in MODELS below (marked broken there).
STARCODER_CONFIG = GGUFTestConfig(
    original_model=_prefixed("bigcode/starcoder2-3b"),
    gguf_repo=_prefixed("QuantFactory/starcoder2-3b-GGUF"),
    gguf_filename=_prefixed("starcoder2-3b.Q6_K.gguf"),
)

DOLPHIN_CONFIG = GGUFTestConfig(
    # Test VocabParallelEmbedding sharding issue.
    original_model=_prefixed("cognitivecomputations/TinyDolphin-2.8-1.1b"),
    gguf_repo=_prefixed("tsunemoto/TinyDolphin-2.8-1.1b-GGUF"),
    gguf_filename=_prefixed("tinydolphin-2.8-1.1b.Q6_K.gguf"),
)

# Configurations fed to the parametrized test.
MODELS = [
    LLAMA_CONFIG,
    QWEN2_CONFIG,
    PHI3_CONFIG,
    GPT2_CONFIG,
    STABLELM_CONFIG,
    DOLPHIN_CONFIG,
    # STARCODER_CONFIG,  # broken
]
@pytest.mark.skipif(not is_quant_method_supported("gguf"), @pytest.mark.skipif(not is_quant_method_supported("gguf"),
reason="gguf is not supported on this GPU type.") reason="gguf is not supported on this GPU type.")
@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [ @pytest.mark.parametrize("model", MODELS)
(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
os.path.join(models_path_prefix, "bartowski/Llama-3.2-1B-Instruct-GGUF"),
os.path.join(models_path_prefix, "Llama-3.2-1B-Instruct-Q4_K_M.gguf")),
(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
os.path.join(models_path_prefix, "bartowski/Llama-3.2-1B-Instruct-GGUF"),
os.path.join(models_path_prefix, "Llama-3.2-1B-Instruct-IQ4_XS.gguf")),
(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"), os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-GGUF"),
os.path.join(models_path_prefix, "qwen2-1_5b-instruct-q4_k_m.gguf")),
(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"), os.path.join(models_path_prefix, "legraphista/Qwen2-1.5B-Instruct-IMat-GGUF"),
os.path.join(models_path_prefix, "Qwen2-1.5B-Instruct.IQ4_XS.gguf")),
])
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("tp_size", [1, 2]) @pytest.mark.parametrize("tp_size", [1, 2])
def test_models( def test_models(
num_gpus_available, num_gpus_available: int,
vllm_runner, vllm_runner: Type[VllmRunner],
example_prompts, example_prompts: List[str],
original_model, model: GGUFTestConfig,
gguf_id,
gguf_path,
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
num_logprobs: int, num_logprobs: int,
...@@ -52,28 +101,29 @@ def test_models( ...@@ -52,28 +101,29 @@ def test_models(
if num_gpus_available < tp_size: if num_gpus_available < tp_size:
pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
gguf_model = hf_hub_download(gguf_id, filename=gguf_path) tokenizer = AutoTokenizer.from_pretrained(model.original_model)
if tokenizer.chat_template is not None:
tokenizer = AutoTokenizer.from_pretrained(original_model) messages = [[{
messages = [[{ 'role': 'user',
'role': 'user', 'content': prompt
'content': prompt }] for prompt in example_prompts]
}] for prompt in example_prompts] example_prompts = tokenizer.apply_chat_template(
example_prompts = tokenizer.apply_chat_template(messages, messages, tokenize=False, add_generation_prompt=True)
tokenize=False,
add_generation_prompt=True)
# Run unquantized model. # Run unquantized model.
with vllm_runner(model_name=original_model, with vllm_runner(
dtype=dtype, model_name=model.original_model,
max_model_len=MAX_MODEL_LEN, enforce_eager=True, # faster tests
tensor_parallel_size=tp_size) as original_model: dtype=dtype,
max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tp_size) as original_model:
original_outputs = original_model.generate_greedy_logprobs( original_outputs = original_model.generate_greedy_logprobs(
example_prompts[:-1], max_tokens, num_logprobs) example_prompts[:-1], max_tokens, num_logprobs)
# Run gguf model. # Run gguf model.
with vllm_runner(model_name=gguf_model, with vllm_runner(model_name=model.gguf_model,
enforce_eager=True,
tokenizer_name=model.original_model,
dtype=dtype, dtype=dtype,
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
tensor_parallel_size=tp_size) as gguf_model: tensor_parallel_size=tp_size) as gguf_model:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment