Commit 469e903b authored by zhuwenwen
Browse files

Merge tag 'v0.8.2' into v0.8.2-dev

parents 389ebcf7 25f560a6
# SPDX-License-Identifier: Apache-2.0
import ast
from typing import List, Optional, Tuple
from typing import Optional
import numpy as np
import pytest
......@@ -9,7 +9,7 @@ import os
import vllm
from vllm import SamplingParams
from vllm.lora.layers import LinearScalingRotaryEmbeddingWithLora
from vllm.lora.layers import LinearScalingRotaryEmbeddingWithLoRA
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.rotary_embedding import (
LinearScalingRotaryEmbedding)
......@@ -88,7 +88,7 @@ def evaluate_json_response(model_response, golden_response):
def generate(
llm: vllm.LLM,
inputs: Tuple[str, SamplingParams, Optional[LoRARequest]],
inputs: tuple[str, SamplingParams, Optional[LoRARequest]],
):
prompts, sampling_param, lora_request = inputs
outputs = llm.generate(prompts, sampling_param, lora_request=lora_request)
......@@ -97,7 +97,7 @@ def generate(
def batched_generate(
llm: vllm.LLM,
inputs: List[Tuple[str, SamplingParams, Optional[LoRARequest]]],
inputs: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
):
for input in inputs:
prompt, sampling_param, lora_req = input
......@@ -153,7 +153,7 @@ def test_rotary_emb_replaced(dist_init):
if "rotary_emb" in module_name:
if "base_layer" not in module_name:
rotary_emb_count += 1
assert isinstance(module, LinearScalingRotaryEmbeddingWithLora)
assert isinstance(module, LinearScalingRotaryEmbeddingWithLoRA)
else:
assert isinstance(module, LinearScalingRotaryEmbedding)
# Llama 2 has 32 layers.
......@@ -166,7 +166,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos):
non-batched generation.
"""
# Create non batched results first to compare against batched results
non_batched_results: List[str] = []
non_batched_results: list[str] = []
for lora_id, info in long_context_infos.items():
context_len = info["context_length"]
......@@ -179,7 +179,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos):
# Create batched results
# Each element of the batch must be
# (prompt, prompt_sampling_params, prompt_lora_request)
batched_prompts: List[Tuple[str, SamplingParams,
batched_prompts: list[tuple[str, SamplingParams,
Optional[LoRARequest]]] = []
for lora_id, info in long_context_infos.items():
context_len = info["context_length"]
......@@ -204,7 +204,7 @@ def test_self_consistency(lora_llm, long_context_infos):
num_loras = len(long_context_infos)
# Create results in order of long_context_infos
batched_prompts: List[Tuple[str, SamplingParams,
batched_prompts: list[tuple[str, SamplingParams,
Optional[LoRARequest]]] = []
for lora_id, info in long_context_infos.items():
context_len = info["context_length"]
......@@ -253,7 +253,7 @@ def test_quality(lora_llm, long_context_infos):
The test is expected to run for about 1 minute on a p4de.24xlarge
instance.
"""
scores: List[float] = []
scores: list[float] = []
for lora_id, info in long_context_infos.items():
context_len = info["context_length"]
for prompt_and_response in prompts_and_responses[context_len]:
......@@ -286,7 +286,7 @@ def test_max_len(lora_llm, long_context_infos):
generate(lora_llm, (bad_prompt, sampling_params, lora_request))
# Also test batched
batched_prompts: List[Tuple[str, SamplingParams,
batched_prompts: list[tuple[str, SamplingParams,
Optional[LoRARequest]]] = []
for lora_id_with_bad_inputs in long_context_infos:
for lora_id, info in long_context_infos.items():
......
# SPDX-License-Identifier: Apache-2.0
from typing import List
import pytest
from vllm.lora.models import LoRAModel
......@@ -12,6 +10,12 @@ from vllm.model_executor.models.utils import WeightsMapper
lora_lst = [
"baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"
]
BAICHUAN_LORA_MODULES = [
"W_pack",
"o_proj",
"gate_up_proj",
"down_proj",
]
@pytest.mark.parametrize("lora_name", lora_lst)
......@@ -22,12 +26,11 @@ def test_load_checkpoints(
baichuan_regex_lora_files,
chatglm3_lora_files,
):
supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules
packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
expected_lora_modules: List[str] = []
for module in supported_lora_modules:
expected_lora_modules: list[str] = []
for module in BAICHUAN_LORA_MODULES:
if module in packed_modules_mapping:
expected_lora_modules.extend(packed_modules_mapping[module])
else:
......@@ -90,12 +93,12 @@ def test_load_checkpoints(
def test_lora_weights_mapping(baichuan_lora_files):
supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules
packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
embed_padding_modules = BaiChuanBaseForCausalLM.embedding_padding_modules
expected_lora_modules: List[str] = []
for module in supported_lora_modules:
expected_lora_modules: list[str] = []
for module in BAICHUAN_LORA_MODULES:
if module in packed_modules_mapping:
expected_lora_modules.extend(packed_modules_mapping[module])
else:
......
# SPDX-License-Identifier: Apache-2.0
"""
Script to test add_lora, remove_lora, pin_lora, list_loras functions.
"""
import os
import pytest
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.lora.request import LoRARequest
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"
LORA_RANK = 8
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora``.

    Requesting that fixture here runs both engines for each test in this
    module. It can be promoted up to conftest.py to run for every test
    in a package.
    """
def make_lora_request(lora_id: int):
    """Build a ``LoRARequest`` that points at ``LORA_MODULE_PATH``.

    ``lora_id`` is used as the integer id and, formatted as a string,
    as the request name.
    """
    request_fields = {
        "lora_name": f"{lora_id}",
        "lora_int_id": lora_id,
        "lora_path": LORA_MODULE_PATH,
    }
    return LoRARequest(**request_fields)
def test_lora_functions_sync():
    """Drive add/pin/remove on a synchronous ``LLMEngine`` and verify
    ``list_loras()`` after every single step."""
    max_loras = 4

    # Build the engine in eager mode: with a high max_loras the CI can
    # OOM during cuda-graph capture.
    llm = LLMEngine.from_engine_args(
        EngineArgs(model=MODEL_PATH,
                   enable_lora=True,
                   max_loras=max_loras,
                   max_lora_rank=LORA_RANK,
                   max_model_len=128,
                   gpu_memory_utilization=0.8,
                   enforce_eager=True))

    # Each step is (operation, LoRA id, ids that list_loras() must report
    # once the operation has run).
    steps = [
        ("add", 1, [1]),
        ("add", 2, [1, 2]),
        # Pin LoRA 1 and test that it is never removed on subsequent adds.
        ("pin", 1, [1, 2]),
        ("add", 3, [1, 2, 3]),
        ("add", 4, [1, 2, 3, 4]),
        ("add", 5, [1, 5, 3, 4]),
        ("add", 6, [1, 5, 6, 4]),
        ("add", 7, [1, 5, 6, 7]),
        ("add", 8, [1, 8, 6, 7]),
        ("add", 9, [1, 8, 9, 7]),
        ("add", 10, [1, 8, 9, 10]),
        # Remove LoRA 1 and continue adding.
        ("remove", 1, [8, 9, 10]),
        ("add", 11, [8, 9, 10, 11]),
        ("add", 12, [12, 9, 10, 11]),
        ("add", 13, [12, 13, 10, 11]),
        # Remove all LoRAs.
        ("remove", 13, [12, 10, 11]),
        ("remove", 12, [10, 11]),
        ("remove", 11, [10]),
        ("remove", 10, []),
    ]

    for operation, lora_id, expected in steps:
        if operation == "add":
            llm.add_lora(make_lora_request(lora_id))
        elif operation == "pin":
            llm.pin_lora(lora_id)
        else:
            llm.remove_lora(lora_id)
        # Only membership is compared, not ordering.
        assert set(llm.list_loras()) == set(expected)
@pytest.mark.asyncio
async def test_lora_functions_async():
    """Drive add/pin/remove through the async engine client and verify
    ``list_loras()`` after every single step."""
    if os.getenv("VLLM_USE_V1") == "0":
        pytest.skip(
            reason=
            "V0 AsyncLLMEngine does not expose remove/list/pin LoRA functions")

    # The run_with_both_engines_lora fixture sets the `VLLM_USE_V1`
    # environment variable. Reload vllm.engine.async_llm_engine, because
    # vllm.engine.async_llm_engine.AsyncLLMEngine changes depending on
    # that variable.
    import importlib

    import vllm.engine.async_llm_engine
    importlib.reload(vllm.engine.async_llm_engine)
    from vllm.entrypoints.openai.api_server import (
        build_async_engine_client_from_engine_args)

    max_loras = 4
    engine_args = AsyncEngineArgs(model=MODEL_PATH,
                                  enable_lora=True,
                                  max_loras=max_loras,
                                  max_lora_rank=LORA_RANK,
                                  max_model_len=128,
                                  gpu_memory_utilization=0.8,
                                  enforce_eager=True)

    # Each step is (operation, LoRA id, ids that list_loras() must report
    # once the operation has run).
    steps = [
        ("add", 1, [1]),
        ("add", 2, [1, 2]),
        # Pin LoRA 1 and test that it is never removed on subsequent adds.
        ("pin", 1, [1, 2]),
        ("add", 3, [1, 2, 3]),
        ("add", 4, [1, 2, 3, 4]),
        ("add", 5, [1, 5, 3, 4]),
        ("add", 6, [1, 5, 6, 4]),
        ("add", 7, [1, 5, 6, 7]),
        ("add", 8, [1, 8, 6, 7]),
        ("add", 9, [1, 8, 9, 7]),
        ("add", 10, [1, 8, 9, 10]),
        # Remove LoRA 1 and continue adding.
        ("remove", 1, [8, 9, 10]),
        ("add", 11, [8, 9, 10, 11]),
        ("add", 12, [12, 9, 10, 11]),
        ("add", 13, [12, 13, 10, 11]),
        # Remove all LoRAs.
        ("remove", 13, [12, 10, 11]),
        ("remove", 12, [10, 11]),
        ("remove", 11, [10]),
        ("remove", 10, []),
    ]

    async with build_async_engine_client_from_engine_args(engine_args) as llm:
        for operation, lora_id, expected in steps:
            if operation == "add":
                await llm.add_lora(make_lora_request(lora_id))
            elif operation == "pin":
                await llm.pin_lora(lora_id)
            else:
                await llm.remove_lora(lora_id)
            # Only membership is compared, not ordering.
            assert set(await llm.list_loras()) == set(expected)
# SPDX-License-Identifier: Apache-2.0
from typing import List
import pytest
from vllm.lora.models import LoRAModel
......@@ -11,17 +9,20 @@ from vllm.model_executor.models.llama import LlamaForCausalLM
# Provide absolute path and huggingface lora ids
lora_fixture_name = ["sql_lora_files", "sql_lora_huggingface_id"]
LLAMA_LORA_MODULES = [
"qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens",
"lm_head"
]
@pytest.mark.parametrize("lora_fixture_name", lora_fixture_name)
def test_load_checkpoints_from_huggingface(lora_fixture_name, request):
lora_name = request.getfixturevalue(lora_fixture_name)
supported_lora_modules = LlamaForCausalLM.supported_lora_modules
packed_modules_mapping = LlamaForCausalLM.packed_modules_mapping
embedding_modules = LlamaForCausalLM.embedding_modules
embed_padding_modules = LlamaForCausalLM.embedding_padding_modules
expected_lora_modules: List[str] = []
for module in supported_lora_modules:
expected_lora_modules: list[str] = []
for module in LLAMA_LORA_MODULES:
if module in packed_modules_mapping:
expected_lora_modules.extend(packed_modules_mapping[module])
else:
......
# SPDX-License-Identifier: Apache-2.0
import os
from typing import Dict, List
import pytest
import torch
from safetensors.torch import load_file
from torch import nn
from vllm import envs
from vllm.config import LoRAConfig
from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
MergedColumnParallelLinearWithLoRA,
......@@ -19,7 +19,6 @@ from vllm.lora.peft_helper import PEFTHelper
from vllm.lora.request import LoRARequest
from vllm.lora.worker_manager import (LRUCacheWorkerLoRAManager,
WorkerLoRAManager)
from vllm.model_executor.layers.linear import RowParallelLinear
from vllm.platforms import current_platform
EMBEDDING_MODULES = {
......@@ -73,9 +72,9 @@ def test_from_lora_tensors(sql_lora_files, device):
assert lora.embeddings_tensor is None
def create_lora(lora_id: int, model: nn.Module, sub_modules: List[str],
def create_lora(lora_id: int, model: nn.Module, sub_modules: list[str],
device: torch.device) -> LoRAModel:
loras: Dict[str, LoRALayerWeights] = {}
loras: dict[str, LoRALayerWeights] = {}
for name in sub_modules:
w = model.get_submodule(name).weight
loras[name] = LoRALayerWeights(
......@@ -97,7 +96,7 @@ def create_packed_lora(
empty_replaced_module_name=None,
) -> LoRAModel:
w = model.get_submodule(module_name).weight
loras: Dict[str, LoRALayerWeights] = {}
loras: dict[str, LoRALayerWeights] = {}
for replaced_module_name in replaced_module_names:
if replaced_module_name == empty_replaced_module_name:
continue
......@@ -114,19 +113,16 @@ def create_packed_lora(
def test_replace_submodules(dist_init, dummy_model):
model = dummy_model
model.supported_lora_modules = ["dense1", "layer1.dense2"]
model.packed_modules_mapping = {}
manager = LoRAModelManager(
model, 1, 1, 1,
LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8),
torch.device(DEVICES[0]))
model = manager.model
assert isinstance(model.get_submodule("dense1"),
ColumnParallelLinearWithLoRA)
assert isinstance(model.get_submodule("layer1.dense1"),
ColumnParallelLinearWithLoRA)
assert isinstance(model.get_submodule("dense2"), RowParallelLinear)
assert isinstance(model.get_submodule("dense2"), RowParallelLinearWithLoRA)
assert isinstance(model.get_submodule("layer1.dense2"),
RowParallelLinearWithLoRA)
......@@ -134,8 +130,6 @@ def test_replace_submodules(dist_init, dummy_model):
@pytest.mark.parametrize("device", DEVICES)
def test_lora_model_manager(dist_init, dummy_model, device):
model = dummy_model
model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
model.packed_modules_mapping = {}
model_lora1 = create_lora(1,
model, ["layer1.dense1", "dense2", "lm_head"],
device=device)
......@@ -190,13 +184,18 @@ def test_lora_model_manager(dist_init, dummy_model, device):
assert manager.device == device
assert manager.punica_wrapper.device == device
assert hasattr(manager, "supported_lora_modules")
assert sorted(manager.supported_lora_modules) == [
"dense1",
"dense2",
"lm_head",
"output",
]
@pytest.mark.parametrize("device", DEVICES)
def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
model = dummy_model
model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
model.packed_modules_mapping = {}
model_lora1 = create_lora(1,
model, ["layer1.dense1", "dense2", "lm_head"],
device=device)
......@@ -289,8 +288,6 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
# This tests just the LRU cache functionality, everything else is
# tested in test_lora_model_manager
model = dummy_model
model.supported_lora_modules = ["dense1", "dense2", "lm_head"]
model.packed_modules_mapping = {}
model_lora1 = create_lora(1,
model, ["layer1.dense1", "dense2", "lm_head"],
device=device)
......@@ -414,6 +411,7 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
assert manager.device == device
@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
@pytest.mark.parametrize("device", DEVICES)
def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
sql_lora_files, device):
......@@ -493,6 +491,7 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
device)
@pytest.mark.skipif(envs.VLLM_USE_V1, reason="Test leverages V0 internals.")
@pytest.mark.parametrize("device", DEVICES)
def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
sql_lora_files, device):
......@@ -572,13 +571,6 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings,
@pytest.mark.parametrize("device", DEVICES)
def test_packed_loras(dist_init, dummy_model_gate_up, device):
model = dummy_model_gate_up
model.supported_lora_modules = ["gate_up_proj"]
model.packed_modules_mapping = {
"gate_up_proj": [
"gate_proj",
"up_proj",
],
}
model_lora = create_packed_lora(
1,
model,
......
# SPDX-License-Identifier: Apache-2.0
from typing import List
import os
import pytest
import vllm
from tests.utils import fork_new_process_for_each_test
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform
from ..utils import models_path_prefix
from ..utils import create_new_process_for_each_test
MODEL_PATH = os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5")
PROMPT_TEMPLATE = (
......@@ -29,7 +28,7 @@ EXPECTED_OUTPUT = [
]
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
sampling_params = vllm.SamplingParams(
temperature=0,
max_tokens=5,
......@@ -50,7 +49,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
if lora_id else None,
)
# Print the outputs.
generated_texts: List[str] = []
generated_texts: list[str] = []
for output in outputs:
generated_text = output.outputs[0].text.strip()
generated_texts.append(generated_text)
......@@ -61,7 +60,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="MiniCPM-V dependency xformers incompatible with ROCm")
@fork_new_process_for_each_test
@create_new_process_for_each_test()
def test_minicpmv_lora(minicpmv_lora_files):
llm = vllm.LLM(
MODEL_PATH,
......@@ -84,7 +83,7 @@ def test_minicpmv_lora(minicpmv_lora_files):
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="MiniCPM-V dependency xformers incompatible with ROCm")
@fork_new_process_for_each_test
@create_new_process_for_each_test()
def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
llm = vllm.LLM(
MODEL_PATH,
......@@ -105,7 +104,7 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="MiniCPM-V dependency xformers incompatible with ROCm")
@fork_new_process_for_each_test
@create_new_process_for_each_test()
def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
llm = vllm.LLM(
MODEL_PATH,
......
# SPDX-License-Identifier: Apache-2.0
from typing import List
import pytest
import torch
import os
......@@ -14,8 +12,16 @@ from ..utils import models_path_prefix
MODEL_PATH = os.path.join(models_path_prefix, "mistralai/Mixtral-8x7B-Instruct-v0.1")
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora``.

    Requesting that fixture here runs both engines for each test in this
    module. It can be promoted up to conftest.py to run for every test
    in a package.
    """
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
prompts: List[str]) -> List[str]:
prompts: list[str]) -> list[str]:
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
outputs = llm.generate(
......@@ -24,7 +30,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
# Print the outputs.
generated_texts: List[str] = []
generated_texts: list[str] = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text.strip()
......@@ -65,45 +71,3 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
prompts=prompts) == expected_lora_output
assert do_sample(llm, mixtral_lora_files, lora_id=2,
prompts=prompts) == expected_lora_output
@pytest.mark.parametrize("tp_size", [4])
@pytest.mark.parametrize("fully_shard", [True, False])
def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
                                         tp_size, fully_shard):
    """This LoRA model has all supported Mixtral target modules"""
    available_gpus = torch.cuda.device_count()
    if available_gpus < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

    prompts = [
        "Question: What happens if you touch the eyes of a blind man?\nChoices:\nB: The man's sight will be restored.\nA: Nothing happens if you touch the eyes of a blind man.\nC: The man will get his eyesight back.\nD: The man will no longer be blind.\nAnswer:",  # noqa: E501
        "Question: Which will increase the energy of motion of water molecules?\nChoices:\nA: add heat\nB: add ice\nC: remove some of the water\nD: place the water in the freezer\nAnswer:",  # noqa: E501
        "Since Craig threw aluminum cans in the trash and Benjamin recycled, _ was environmentally irresponsible.\nChoices:\n1: Craig\n2: Benjamin\nAnswer:",  # noqa: E501
    ]

    llm = vllm.LLM(
        MODEL_PATH,
        enable_lora=True,
        max_num_seqs=16,
        max_loras=4,
        distributed_executor_backend="ray",
        tensor_parallel_size=tp_size,
        fully_sharded_loras=fully_shard,
        max_lora_rank=32,
    )

    # One expected completion per prompt, in prompt order.
    expected_lora_output = [
        "A: Nothing happens if you touch the eyes of a blind man.",
        "A: add heat",
        "1: Craig",
    ]

    # The same adapter files are sampled under LoRA id 1 and then id 2;
    # both runs must give the expected completions.
    for lora_id in (1, 2):
        generated = do_sample(llm,
                              mixtral_lora_files_all_target_modules,
                              lora_id=lora_id,
                              prompts=prompts)
        assert generated == expected_lora_output
# SPDX-License-Identifier: Apache-2.0
from typing import List
import os
import pytest
import vllm
......@@ -14,7 +12,7 @@ MODEL_PATH = os.path.join(models_path_prefix, "microsoft/phi-2")
PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [
PROMPT_TEMPLATE.format(
sql_prompt=
......@@ -43,7 +41,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
if lora_id else None,
)
# Print the outputs.
generated_texts: List[str] = []
generated_texts: list[str] = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text.strip()
......
# SPDX-License-Identifier: Apache-2.0
from threading import Lock
from typing import List
import pytest
import torch
import vllm.lora.ops.triton_ops # noqa: F401
from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
bgmv_shrink, sgmv_expand,
sgmv_expand_slice, sgmv_shrink)
import vllm.lora.ops.torch_ops as torch_ops
import vllm.lora.ops.triton_ops as triton_ops
from vllm.lora.ops.triton_ops import LoRAKernelMeta
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.platforms import current_platform
from .utils import (PunicaTensors, assert_close, generate_data,
generate_data_for_expand_nslices,
generate_data_for_nslices)
from .utils import PunicaTensors, assert_close, generate_data_for_nslices
# Utility shrink and expand operations used as reference implementations.
def sgmv_shrink_for_nslices(
nslices: int, inputs_tensor: torch.Tensor,
lora_weights_lst: List[torch.Tensor], out_tensor: torch.Tensor,
lora_weights_lst: list[torch.Tensor], out_tensor: torch.Tensor,
b_seq_start_loc: torch.Tensor, seq_len_tensor: torch.Tensor,
prompt_lora_mapping: torch.Tensor, batches: int, max_seq_length: int,
num_tokens: int, scaling: float):
"""
Wrapper around sgmv_shrink that handles any nslices.
Wrapper around torch_ops.sgmv_shrink that handles any nslices.
"""
for index in range(nslices):
sgmv_shrink(
torch_ops.sgmv_shrink(
inputs_tensor,
lora_weights_lst[index],
out_tensor[index],
......@@ -44,7 +40,7 @@ def sgmv_shrink_for_nslices(
def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
inputs_tensor: torch.Tensor,
lora_weights_lst: List[torch.Tensor],
lora_weights_lst: list[torch.Tensor],
out_tensor: torch.Tensor,
b_seq_start_loc: torch.Tensor,
seq_len_tensor: torch.Tensor,
......@@ -52,11 +48,11 @@ def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
max_seq_length: int, num_tokens: int,
add_inputs: bool) -> None:
"""
Wrapper around sgmv_expand that handles any nslices.
Wrapper around torch_ops.sgmv_expand that handles any nslices.
"""
if nslices == 1:
# Verify the torch's sgmv_expand op
sgmv_expand(
torch_ops.sgmv_expand(
inputs_tensor[0],
lora_weights_lst[0],
out_tensor,
......@@ -72,7 +68,7 @@ def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
slice_offset = 0
for index in range(nslices):
lora_weights = lora_weights_lst[index]
sgmv_expand_slice(
torch_ops.sgmv_expand_slice(
inputs_tensor[index],
lora_weights,
out_tensor,
......@@ -92,12 +88,13 @@ def sgmv_expand_for_nslices(nslices: int, hidden_size: int,
_dict_lock = Lock()
def check_sgmv_shrink(batches: int, num_loras: int, rank: int,
hidden_size: int, nslices: int, dtype: torch.dtype,
device: str, seq_length: int, scaling: float):
def check_lora_shrink_kernel(batches: int, num_loras: int, rank: int,
hidden_size: int, nslices: int,
dtype: torch.dtype, device: str, seq_length: int,
scaling: float):
"""
Compare outputs of vllm.sgmv_shrink kernel against a reference
implementation.
Compare outputs of torch_ops.sgmv_shrink and triton_ops.lora_shrink
kernels.
"""
data: PunicaTensors = generate_data_for_nslices(
batches,
......@@ -112,44 +109,52 @@ def check_sgmv_shrink(batches: int, num_loras: int, rank: int,
)
max_seq_length, token_nums = data.meta()
# Setup metadata information for SGMV and reference kernels
sgmv_meta_args = (data.b_seq_start_loc, data.seq_len_tensor,
data.prompt_lora_mapping, batches, max_seq_length,
token_nums)
# Setup metadata information for the LoRA kernel.
lora_meta = LoRAKernelMeta.make(max_loras=num_loras,
max_num_tokens=token_nums,
device='cuda')
lora_meta.prepare_tensors(data.token_lora_mapping)
ref_out_tensor = data.ref_out_tensor
out_tensor = data.our_out_tensor.clone()
# Preventing cache error pointer.
with _dict_lock:
# lora_shrink kernel
_LORA_A_PTR_DICT.clear()
torch.ops.vllm.sgmv_shrink(
triton_ops.lora_shrink(
data.inputs_tensor,
data.lora_weights,
data.our_out_tensor,
data.b_seq_start_loc,
data.seq_len_tensor,
data.prompt_lora_mapping,
batches,
max_seq_length,
token_nums,
out_tensor,
*lora_meta.meta_args(token_nums=token_nums),
scaling,
)
sgmv_shrink_for_nslices(
nslices,
data.inputs_tensor,
data.lora_weights,
data.ref_out_tensor,
data.b_seq_start_loc,
data.seq_len_tensor,
data.prompt_lora_mapping,
batches,
max_seq_length,
token_nums,
scaling,
)
assert_close(data.our_out_tensor, data.ref_out_tensor)
# Reference
sgmv_shrink_for_nslices(
nslices,
data.inputs_tensor,
data.lora_weights,
ref_out_tensor,
*sgmv_meta_args,
scaling,
)
assert_close(out_tensor, ref_out_tensor)
def check_sgmv_expand(batches: int, num_loras: int, rank: int,
hidden_size: int, nslices: int, dtype: torch.dtype,
device: str, seq_length: int, add_inputs: bool):
def check_lora_expand_kernel(batches: int, num_loras: int, rank: int,
hidden_size: int, nslices: int,
dtype: torch.dtype, device: str, seq_length: int,
add_inputs: bool):
"""
Compare outputs of vllm.sgmv_expand kernel against a reference
implementation.
Compare outputs of torch_ops.sgmv_expand and triton_ops.lora_expand
kernels.
"""
data: PunicaTensors = generate_data_for_nslices(
batches,
......@@ -165,152 +170,41 @@ def check_sgmv_expand(batches: int, num_loras: int, rank: int,
max_seq_length, token_nums = data.meta()
# Setup metadata information for SGMV and reference kernels
sgmv_meta_args = (data.b_seq_start_loc, data.seq_len_tensor,
data.prompt_lora_mapping, batches, max_seq_length,
token_nums)
# Setup metadata information for the LoRA kernel.
lora_meta = LoRAKernelMeta.make(max_loras=num_loras,
max_num_tokens=token_nums,
device='cuda')
lora_meta.prepare_tensors(data.token_lora_mapping)
# Setup output tensors
ref_out_tensor = data.ref_out_tensor
out_tensor = data.our_out_tensor.clone()
with _dict_lock:
# lora_expand kernel
_LORA_B_PTR_DICT.clear()
torch.ops.vllm.sgmv_expand(
data.inputs_tensor,
data.lora_weights,
data.our_out_tensor,
data.b_seq_start_loc,
data.seq_len_tensor,
data.prompt_lora_mapping,
batches,
max_seq_length,
token_nums,
offset_start=0,
add_inputs=add_inputs,
)
triton_ops.lora_expand(data.inputs_tensor,
data.lora_weights,
out_tensor,
*lora_meta.meta_args(token_nums=token_nums),
offset_start=0,
add_inputs=add_inputs)
# Reference
sgmv_expand_for_nslices(nslices,
hidden_size,
data.inputs_tensor,
data.lora_weights,
data.ref_out_tensor,
data.b_seq_start_loc,
data.seq_len_tensor,
data.prompt_lora_mapping,
batches,
max_seq_length,
token_nums,
ref_out_tensor,
*sgmv_meta_args,
add_inputs=add_inputs)
assert_close(data.our_out_tensor, data.ref_out_tensor)
def check_bgmv_shrink(batches: int, num_loras: int, rank: int,
                      hidden_size: int, dtype: torch.dtype, device: str,
                      scaling: float):
    """
    Run ``torch.ops.vllm.bgmv_shrink`` and the module-level ``bgmv_shrink``
    reference on the same generated data and require close outputs.
    """
    # Data is generated with a sequence length of 1.
    data: PunicaTensors = generate_data(batches, hidden_size, num_loras, rank,
                                        1, dtype, "shrink", device)

    # Op under test first, then the reference; each writes into its own
    # output tensor.
    runs = (
        (torch.ops.vllm.bgmv_shrink, data.our_out_tensor),
        (bgmv_shrink, data.ref_out_tensor),
    )
    for shrink_op, destination in runs:
        shrink_op(
            data.inputs_tensor,
            data.lora_weights,
            destination,
            data.token_lora_mapping,
            scaling,
        )

    # The reference output is converted to float32 before comparing.
    data.ref_out_tensor = data.ref_out_tensor.to(torch.float32)
    assert_close(data.our_out_tensor, data.ref_out_tensor)
def check_bgmv_expand(batches: int, num_loras: int, rank: int,
                      hidden_size: int, dtype: torch.dtype, device: str,
                      add_inputs: bool):
    """
    Run ``torch.ops.vllm.bgmv_expand`` and the module-level ``bgmv_expand``
    reference on the same generated data and require close outputs.
    """
    # Data is generated with a sequence length of 1.
    data: PunicaTensors = generate_data(batches, hidden_size, num_loras, rank,
                                        1, dtype, "expand", device)

    # Op under test first, then the reference; each writes into its own
    # output tensor.
    runs = (
        (torch.ops.vllm.bgmv_expand, data.our_out_tensor),
        (bgmv_expand, data.ref_out_tensor),
    )
    for expand_op, destination in runs:
        expand_op(
            data.inputs_tensor,
            data.lora_weights,
            destination,
            data.token_lora_mapping,
            add_inputs=add_inputs,
        )

    assert_close(data.our_out_tensor, data.ref_out_tensor)
def check_bgmv_expand_slice(batches: int, num_loras: int, rank: int,
                            hidden_size: int, nslices: int, dtype: torch.dtype,
                            device: str, add_inputs: bool):
    """
    Run ``torch.ops.vllm.bgmv_expand_slice`` and the module-level
    ``bgmv_expand_slice`` reference slice by slice on the same generated
    data and require close outputs.
    """
    # Data is generated with a sequence length of 1.
    data: PunicaTensors = generate_data_for_expand_nslices(
        batches, hidden_size, num_loras, rank, 1, dtype, nslices, device)

    for index in range(nslices):
        # Every slice is hidden_size wide, so slice `index` starts at
        # index * hidden_size.
        slice_offset = index * hidden_size
        # Op under test first, then the reference; each writes into its
        # own output tensor.
        runs = (
            (torch.ops.vllm.bgmv_expand_slice, data.our_out_tensor),
            (bgmv_expand_slice, data.ref_out_tensor),
        )
        for expand_slice_op, destination in runs:
            expand_slice_op(
                data.inputs_tensor,
                data.lora_weights[index],
                destination,
                data.token_lora_mapping,
                slice_offset,
                slice_size=hidden_size,
                add_inputs=add_inputs,
            )

    assert_close(data.our_out_tensor, data.ref_out_tensor)
assert_close(out_tensor, ref_out_tensor)
# Tests
......@@ -440,7 +334,7 @@ SEED = [0]
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
def test_punica_sgmv(
def test_kernels(
batches: int,
num_loras: int,
rank: int,
......@@ -451,29 +345,32 @@ def test_punica_sgmv(
seed: int,
op_type: str,
):
"""
Tests LoRA kernels.
"""
torch.set_default_device(device)
current_platform.seed_everything(seed)
if op_type == "shrink":
check_sgmv_shrink(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
scaling=0.5)
check_lora_shrink_kernel(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
scaling=0.5)
else:
check_sgmv_expand(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
add_inputs=True)
check_lora_expand_kernel(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
add_inputs=True)
@pytest.mark.parametrize("batches", hs_test_params['batches'])
......@@ -485,7 +382,7 @@ def test_punica_sgmv(
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
def test_punica_sgmv_hidden_size(
def test_kernels_hidden_size(
batches: int,
num_loras: int,
rank: int,
......@@ -496,157 +393,29 @@ def test_punica_sgmv_hidden_size(
seed: int,
op_type: str,
):
"""
Tests SGMV and LoRA kernels.
"""
torch.set_default_device(device)
current_platform.seed_everything(seed)
if op_type == "shrink":
check_sgmv_shrink(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
scaling=0.5)
else:
check_sgmv_expand(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
add_inputs=True)
@pytest.mark.parametrize("batches", test_params['batches'])
@pytest.mark.parametrize("num_loras", test_params['num_loras'])
@pytest.mark.parametrize("rank", test_params['max_ranks'])
@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
def test_punica_bgmv(
batches: int,
num_loras: int,
rank: int,
hidden_size: int,
dtype: torch.dtype,
device: str,
seed: int,
op_type: str,
):
torch.set_default_device(device)
current_platform.seed_everything(seed)
if op_type == "shrink":
check_bgmv_shrink(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
dtype=dtype,
device=device,
scaling=0.5)
else:
check_bgmv_expand(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
dtype=dtype,
device=device,
add_inputs=True)
@pytest.mark.parametrize("batches", hs_test_params['batches'])
@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
def test_punica_bgmv_hidden_size(
batches: int,
num_loras: int,
rank: int,
hidden_size: int,
dtype: torch.dtype,
device: str,
seed: int,
op_type: str,
):
torch.set_default_device(device)
current_platform.seed_everything(seed)
if op_type == "shrink":
check_bgmv_shrink(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
dtype=dtype,
device=device,
scaling=0.5)
check_lora_shrink_kernel(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
scaling=0.5)
else:
check_bgmv_expand(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
dtype=dtype,
device=device,
add_inputs=True)
@pytest.mark.parametrize("batches", test_params['batches'])
@pytest.mark.parametrize("num_loras", test_params['num_loras'])
@pytest.mark.parametrize("rank", test_params['max_ranks'])
@pytest.mark.parametrize("hidden_size", test_params['hidden_sizes'])
@pytest.mark.parametrize("nslices", [2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
def test_punica_bgmv_expand_nslices(batches: int, num_loras: int, rank: int,
hidden_size: int, nslices: int,
dtype: torch.dtype, device: str,
seed: int):
torch.set_default_device(device)
current_platform.seed_everything(seed)
check_bgmv_expand_slice(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
add_inputs=True)
@pytest.mark.parametrize("batches", hs_test_params['batches'])
@pytest.mark.parametrize("num_loras", hs_test_params['num_loras'])
@pytest.mark.parametrize("rank", hs_test_params['max_ranks'])
@pytest.mark.parametrize("hidden_size", hs_test_params['hidden_sizes'])
@pytest.mark.parametrize("nslices", [2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEED)
def test_punica_bgmv_expand_nslices_hidden_size(batches: int, num_loras: int,
rank: int, hidden_size: int,
nslices: int,
dtype: torch.dtype,
device: str, seed: int):
torch.set_default_device(device)
current_platform.seed_everything(seed)
check_bgmv_expand_slice(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
add_inputs=True)
check_lora_expand_kernel(batches=batches,
num_loras=num_loras,
rank=rank,
hidden_size=hidden_size,
nslices=nslices,
dtype=dtype,
device=device,
seq_length=128,
add_inputs=True)
......@@ -3,7 +3,6 @@
# Adapted from
# https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py
from dataclasses import dataclass
from typing import List
import pytest
import os
......@@ -22,7 +21,7 @@ class ModelWithQuantization:
quantization: str
MODELS: List[ModelWithQuantization]
MODELS: list[ModelWithQuantization]
#AWQ quantization is currently not supported in ROCm.
if current_platform.is_rocm():
MODELS = [
......@@ -44,7 +43,7 @@ else:
def do_sample(llm: vllm.LLM,
lora_path: str,
lora_id: int,
max_tokens: int = 256) -> List[str]:
max_tokens: int = 256) -> list[str]:
raw_prompts = [
"Give me an orange-ish brown color",
"Give me a neon pink color",
......@@ -64,7 +63,7 @@ def do_sample(llm: vllm.LLM,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None)
# Print the outputs.
generated_texts: List[str] = []
generated_texts: list[str] = []
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
......@@ -182,7 +181,8 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
model):
if num_gpus_available < 2:
pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
if model.quantization == "GPTQ":
pytest.skip("GPTQ lora outputs are just incredibly unstable")
llm_tp1 = vllm.LLM(
model=model.model_path,
enable_lora=True,
......
# SPDX-License-Identifier: Apache-2.0
from dataclasses import dataclass
from typing import Dict, List, Optional
from typing import Optional
import pytest
from packaging.version import Version
......@@ -12,6 +12,14 @@ from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform
@pytest.fixture(autouse=not current_platform.is_cpu())
def v1(run_with_both_engines_lora):
    """Autouse wrapper that makes each test run with both engines.

    The fixture has no logic of its own: requesting
    ``run_with_both_engines_lora`` is the whole point. It is not applied
    automatically when the current platform is CPU. It could be promoted
    to ``conftest.py`` to cover every test in the package.
    """
    return None
@dataclass
class TestConfig:
model_path: str
......@@ -20,7 +28,7 @@ class TestConfig:
max_loras: int = 2
max_lora_rank: int = 16
max_model_len: int = 4096
mm_processor_kwargs: Optional[Dict[str, int]] = None
mm_processor_kwargs: Optional[dict[str, int]] = None
def __post_init__(self):
if self.mm_processor_kwargs is None:
......@@ -57,11 +65,11 @@ class Qwen2VLTester:
)
def run_test(self,
images: List[ImageAsset],
expected_outputs: List[str],
images: list[ImageAsset],
expected_outputs: list[str],
lora_id: Optional[int] = None,
temperature: float = 0,
max_tokens: int = 5) -> List[str]:
max_tokens: int = 5) -> list[str]:
sampling_params = vllm.SamplingParams(
temperature=temperature,
......
......@@ -25,12 +25,10 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
)
lora_request = LoRARequest("1", 1, sql_lora_files)
assert reference_tokenizer.encode("prompt") == tokenizer_group.encode(
request_id="request_id", prompt="prompt", lora_request=lora_request)
prompt="prompt", lora_request=lora_request)
assert reference_tokenizer.encode(
"prompt") == await tokenizer_group.encode_async(
request_id="request_id",
prompt="prompt",
lora_request=lora_request)
prompt="prompt", lora_request=lora_request)
assert isinstance(tokenizer_group.get_lora_tokenizer(None),
PreTrainedTokenizerBase)
assert tokenizer_group.get_lora_tokenizer(
......
# SPDX-License-Identifier: Apache-2.0
import pytest
import vllm
from vllm.lora.request import LoRARequest
from ..utils import create_new_process_for_each_test, multi_gpu_test
MODEL_PATH = "ArthurZ/ilama-3.2-1B"
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
EXPECTED_LORA_OUTPUT = [
"SELECT count(*) FROM singer",
"SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", # noqa: E501
"SELECT DISTINCT Country FROM singer WHERE Age > 20",
]
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    """Generate SQL for three fixed questions and return the stripped texts.

    Args:
        llm: Engine used for generation.
        lora_path: Location of the LoRA adapter files.
        lora_id: Adapter id; a falsy value (e.g. ``0``) sends no LoRA
            request at all.

    Returns:
        One whitespace-stripped completion per prompt, in prompt order.
    """
    queries = (
        "How many singers do we have?",
        "What is the average, minimum, and maximum age of all singers from France?",  # noqa: E501
        "What are all distinct countries where singers above age 20 are from?",  # noqa: E501
    )
    prompts = [PROMPT_TEMPLATE.format(query=query) for query in queries]

    # temperature=0 keeps the comparison against fixed expected strings
    # meaningful.
    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
    if lora_id:
        lora_request = LoRARequest(str(lora_id), lora_id, lora_path)
    else:
        lora_request = None
    outputs = llm.generate(prompts,
                           sampling_params,
                           lora_request=lora_request)

    # Echo every prompt/completion pair so failures are easy to diagnose.
    generated_texts: list[str] = []
    for request_output in outputs:
        text = request_output.outputs[0].text.strip()
        generated_texts.append(text)
        print(f"Prompt: {request_output.prompt!r}, Generated text: {text!r}")
    return generated_texts
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse wrapper that makes each test run with both engines.

    The fixture has no logic of its own: requesting
    ``run_with_both_engines_lora`` is the whole point. It could be
    promoted to ``conftest.py`` to cover every test in the package.
    """
    return None
@pytest.mark.skip_v1
@create_new_process_for_each_test()
def test_ilama_lora(ilama_lora_files):
    """Check LoRA outputs on a single device (tensor_parallel_size=1).

    The same adapter files are loaded under two different LoRA ids and
    each run must reproduce ``EXPECTED_LORA_OUTPUT`` entry by entry.
    """
    llm = vllm.LLM(
        MODEL_PATH,
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
        max_lora_rank=16,
        tensor_parallel_size=1,
        trust_remote_code=True,
        enable_chunked_prefill=True,
    )

    for lora_id in (1, 2):
        generated = do_sample(llm, ilama_lora_files, lora_id=lora_id)
        # Index into the generated list so a short result fails loudly
        # instead of being silently truncated.
        for idx, expected in enumerate(EXPECTED_LORA_OUTPUT):
            assert generated[idx] == expected
@pytest.mark.skip_v1
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_ilama_lora_tp4(ilama_lora_files):
    """Check LoRA outputs with tensor_parallel_size=4, unsharded LoRAs.

    The same adapter files are loaded under two different LoRA ids and
    each run must reproduce ``EXPECTED_LORA_OUTPUT`` entry by entry.
    """
    llm = vllm.LLM(
        MODEL_PATH,
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
        max_lora_rank=16,
        tensor_parallel_size=4,
        trust_remote_code=True,
        fully_sharded_loras=False,
        enable_chunked_prefill=True,
    )

    for lora_id in (1, 2):
        generated = do_sample(llm, ilama_lora_files, lora_id=lora_id)
        # Index into the generated list so a short result fails loudly
        # instead of being silently truncated.
        for idx, expected in enumerate(EXPECTED_LORA_OUTPUT):
            assert generated[idx] == expected
@pytest.mark.skip_v1
@multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test()
def test_ilama_lora_tp4_fully_sharded_loras(ilama_lora_files):
    """Check LoRA outputs with tensor_parallel_size=4, fully sharded LoRAs.

    The same adapter files are loaded under two different LoRA ids and
    each run must reproduce ``EXPECTED_LORA_OUTPUT`` entry by entry.
    """
    llm = vllm.LLM(
        MODEL_PATH,
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
        max_lora_rank=16,
        tensor_parallel_size=4,
        trust_remote_code=True,
        fully_sharded_loras=True,
        enable_chunked_prefill=True,
    )

    for lora_id in (1, 2):
        generated = do_sample(llm, ilama_lora_files, lora_id=lora_id)
        # Index into the generated list so a short result fails loudly
        # instead of being silently truncated.
        for idx, expected in enumerate(EXPECTED_LORA_OUTPUT):
            assert generated[idx] == expected
# SPDX-License-Identifier: Apache-2.0
import shutil
from os import path
from tempfile import TemporaryDirectory
from typing import List, Tuple
import torch
from huggingface_hub import snapshot_download
from safetensors.torch import load_file, save_file
from transformers import AutoTokenizer
from vllm.lora.request import LoRARequest
from ..models.utils import check_outputs_equal
ULTRAVOX_MODEL_NAME = "fixie-ai/ultravox-v0_3"
LLMA_MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
VLLM_PLACEHOLDER = "<|reserved_special_token_0|>"
PROMPT = "Tell me about a Fool's mate move in 20 words. Provide the moves!"
def llama3_1_8b_chess_lora_path():
    """Fetch the Llama 3.1 8B chess LoRA snapshot and return the result.

    Returns whatever ``snapshot_download`` returns for the repository
    (presumably the local snapshot directory; confirm against
    huggingface_hub).
    """
    chess_lora_repo = "mkopecki/chess-lora-adapter-llama-3.1-8b"
    return snapshot_download(repo_id=chess_lora_repo)
# A Llama LoRA adapter cannot be applied to Ultravox as-is: Ultravox nests
# the language model, so the adapter's module names must be rewritten first.
def transform_module_names_for_ultravox(state_dict):
    """Return a copy of ``state_dict`` with keys re-rooted for Ultravox.

    Every occurrence of ``"base_model.model"`` in a key is replaced with
    ``"base_model.model.language_model"``. Keys without that substring are
    kept unchanged. Values are carried over as-is (not copied) and the
    input mapping is not modified.

    Args:
        state_dict: Mapping from parameter name to tensor (or any value).

    Returns:
        A new ``dict`` with the rewritten keys, in the original order.
    """
    llama_prefix = "base_model.model"
    ultravox_prefix = "base_model.model.language_model"
    return {
        key.replace(llama_prefix, ultravox_prefix): value
        for key, value in state_dict.items()
    }
def mk_llama3_1_8b_ultravox_chess_lora(source_repo, target_path):
    """Write an Ultravox-compatible copy of a Llama LoRA adapter.

    The adapter weights are loaded from ``source_repo``, their module
    names are rewritten with ``transform_module_names_for_ultravox``, and
    the result is saved under the same file name in ``target_path``. The
    adapter config is copied over unchanged.

    Args:
        source_repo: Directory holding the original adapter files.
        target_path: Existing directory that receives the converted files.

    Returns:
        ``target_path``, for convenient chaining.
    """
    weights_name = "adapter_model.safetensors"
    config_name = "adapter_config.json"

    # Weights: load, rename keys, save under the same file name.
    llama_weights = load_file(path.join(source_repo, weights_name))
    ultravox_weights = transform_module_names_for_ultravox(llama_weights)
    save_file(ultravox_weights, path.join(target_path, weights_name))

    # Config: plain copy, no changes needed.
    config_src = path.join(source_repo, config_name)
    config_dst = path.join(target_path, config_name)
    shutil.copyfile(config_src, config_dst)
    return target_path
def _get_prompt(audio_count, question, placeholder, model_name) -> str:
    """Render a single-turn user prompt through the model's chat template.

    Args:
        audio_count: How many times ``placeholder`` (each followed by a
            newline) is prepended to the question.
        question: The user's question text.
        placeholder: Placeholder string repeated ``audio_count`` times.
        model_name: Passed to ``AutoTokenizer.from_pretrained`` to load
            the tokenizer whose chat template is applied.

    Returns:
        The templated prompt as text (``tokenize=False``) with the
        generation prompt appended.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    audio_placeholders = f"{placeholder}\n" * audio_count
    messages = [{
        'role': 'user',
        'content': f"{audio_placeholders}{question}",
    }]
    return tokenizer.apply_chat_template(messages,
                                         tokenize=False,
                                         add_generation_prompt=True)
def test_ultravox_lora(vllm_runner):
"""
Check that Ultravox with a converted Llama LoRA matches Llama with the
original LoRA.

The chess LoRA is downloaded, its module names are rewritten for Ultravox
into a temporary directory, and greedy outputs for the same text-only
prompt (zero audio placeholders) are compared between the two models.

TODO: Train an Ultravox LoRA instead of using a Llama LoRA.
"""
# Workaround to prevent device mismatch in Whisper.
# Can be removed when it is fixed upstream in transformer
# https://github.com/huggingface/transformers/pull/35866
torch.set_default_device("cpu")
# Original (Llama-named) adapter; also used later for the reference run.
llama3_1_8b_chess_lora = llama3_1_8b_chess_lora_path()
with TemporaryDirectory() as temp_ultravox_lora_dir:
# Converted adapter whose module names match Ultravox's nested model.
llama3_1_8b_ultravox_chess_lora = mk_llama3_1_8b_ultravox_chess_lora(
llama3_1_8b_chess_lora, temp_ultravox_lora_dir)
with vllm_runner(
ULTRAVOX_MODEL_NAME,
enforce_eager=True,
max_num_seqs=2,
enable_lora=True,
max_loras=1,
max_lora_rank=128,
dtype="bfloat16",
max_model_len=1024,
) as vllm_model:
# Greedy decode of up to 256 tokens with the converted adapter.
ultravox_outputs: List[Tuple[
List[int], str]] = vllm_model.generate_greedy(
[
_get_prompt(0, PROMPT, VLLM_PLACEHOLDER,
ULTRAVOX_MODEL_NAME)
],
256,
lora_request=LoRARequest(str(1), 1,
llama3_1_8b_ultravox_chess_lora),
)
# Run Llama with the original, unconverted LoRA to get the reference
# outputs that the Ultravox outputs above are compared against.
with vllm_runner(
LLMA_MODEL_NAME,
enforce_eager=True,
max_num_seqs=2,
enable_lora=True,
max_loras=1,
max_lora_rank=128,
dtype="bfloat16",
max_model_len=1024,
) as vllm_model:
llama_outputs: List[Tuple[List[int], str]] = (
vllm_model.generate_greedy(
[_get_prompt(0, PROMPT, VLLM_PLACEHOLDER, LLMA_MODEL_NAME)],
256,
lora_request=LoRARequest(str(1), 1, llama3_1_8b_chess_lora),
))
# Both runs used the same engine settings, so outputs must be equal.
check_outputs_equal(
outputs_0_lst=ultravox_outputs,
outputs_1_lst=llama_outputs,
name_0="ultravox",
name_1="llama",
)
......@@ -3,19 +3,46 @@
import os
import random
import tempfile
from typing import Union
from unittest.mock import patch
import pytest
import vllm.envs as envs
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, ParallelConfig, SchedulerConfig,
VllmConfig)
from vllm.lora.models import LoRAMapping
from vllm.lora.request import LoRARequest
from vllm.v1.worker.gpu_worker import Worker as V1Worker
from vllm.worker.worker import Worker
from ..utils import models_path_prefix
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse wrapper that makes each test run with both engines.

    The fixture has no logic of its own: requesting
    ``run_with_both_engines_lora`` is the whole point. It could be
    promoted to ``conftest.py`` to cover every test in the package.
    """
    return None
@patch.dict(os.environ, {"RANK": "0"})
def test_worker_apply_lora(sql_lora_files):
def set_active_loras(worker: Union[Worker, V1Worker],
lora_requests: list[LoRARequest]):
lora_mapping = LoRAMapping([], [])
if isinstance(worker, Worker):
# v0 case
worker.model_runner.set_active_loras(lora_requests, lora_mapping)
else:
# v1 case
worker.model_runner.lora_manager.set_active_adapters(
lora_requests, lora_mapping)
worker_cls = V1Worker if envs.VLLM_USE_V1 else Worker
vllm_config = VllmConfig(
model_config=ModelConfig(
os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
......@@ -26,6 +53,7 @@ def test_worker_apply_lora(sql_lora_files):
seed=0,
dtype="float16",
revision=None,
enforce_eager=True,
),
load_config=LoadConfig(
download_dir=None,
......@@ -41,16 +69,17 @@ def test_worker_apply_lora(sql_lora_files):
lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
max_loras=32),
)
worker = Worker(
worker = worker_cls(
vllm_config=vllm_config,
local_rank=0,
rank=0,
distributed_init_method=f"file://{tempfile.mkstemp()[1]}",
)
worker.init_device()
worker.load_model()
worker.model_runner.set_active_loras([], LoRAMapping([], []))
set_active_loras(worker, [])
assert worker.list_loras() == set()
n_loras = 32
......@@ -58,7 +87,7 @@ def test_worker_apply_lora(sql_lora_files):
LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras)
]
worker.model_runner.set_active_loras(lora_requests, LoRAMapping([], []))
set_active_loras(worker, lora_requests)
assert worker.list_loras() == {
lora_request.lora_int_id
for lora_request in lora_requests
......@@ -70,8 +99,7 @@ def test_worker_apply_lora(sql_lora_files):
k=random.randint(1, n_loras))
random.shuffle(iter_lora_requests)
iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)]
worker.model_runner.set_active_loras(iter_lora_requests,
LoRAMapping([], []))
set_active_loras(worker, lora_requests)
assert worker.list_loras().issuperset(
{lora_request.lora_int_id
for lora_request in iter_lora_requests})
# SPDX-License-Identifier: Apache-2.0
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple, Union
from typing import Optional, Union
import torch
......@@ -12,7 +12,7 @@ class DummyLoRAManager:
def __init__(self, device: torch.device = "cuda:0"):
super().__init__()
self._loras: Dict[str, LoRALayerWeights] = {}
self._loras: dict[str, LoRALayerWeights] = {}
self._device = device
def set_module_lora(self, module_name: str, lora: LoRALayerWeights):
......@@ -77,11 +77,11 @@ class DummyLoRAManager:
self,
module_name: str,
input_dim: int,
output_dims: List[int],
noop_lora_index: Optional[List[int]] = None,
output_dims: list[int],
noop_lora_index: Optional[list[int]] = None,
rank: int = 8,
):
base_loras: List[LoRALayerWeights] = []
base_loras: list[LoRALayerWeights] = []
noop_lora_index_set = set(noop_lora_index or [])
for i, out_dim in enumerate(output_dims):
......@@ -110,7 +110,7 @@ def assert_close(a, b):
@dataclass
class PunicaTensors:
inputs_tensor: torch.Tensor
lora_weights: Union[torch.Tensor, List[torch.Tensor]]
lora_weights: Union[torch.Tensor, list[torch.Tensor]]
our_out_tensor: torch.Tensor
ref_out_tensor: torch.Tensor
b_seq_start_loc: torch.Tensor
......@@ -118,7 +118,7 @@ class PunicaTensors:
seq_len_tensor: torch.Tensor
token_lora_mapping: torch.Tensor
def meta(self) -> Tuple[int, int]:
def meta(self) -> tuple[int, int]:
"""
Infer max_seq_length and token_nums from the tensors
and return them.
......
......@@ -2,14 +2,13 @@
import os
import time
from typing import List
import pytest
import ray
from prometheus_client import REGISTRY
import vllm.envs as envs
from vllm import EngineArgs, LLMEngine
from vllm.config import LoadFormat
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
......@@ -18,15 +17,21 @@ from vllm.sampling_params import SamplingParams
import vllm.envs as envs
from ..utils import models_path_prefix
from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch):
    """Pin every test in this module to the V0 engine.

    The module tests V0 internals, so ``VLLM_USE_V1`` is set to ``'0'``
    in the environment through pytest's ``monkeypatch`` fixture.
    """
    monkeypatch.setenv(name='VLLM_USE_V1', value='0')
from ..conftest import MODEL_WEIGHTS_S3_BUCKET
MODELS = [
os.path.join(models_path_prefix, "distilbert/distilgpt2"),
]
RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [("float" if envs.VLLM_USE_TRITON_FLASH_ATTN else "half")])
......@@ -140,7 +145,7 @@ def test_metric_counter_generation_tokens_multi_step(
"served_model_name",
[None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
served_model_name: List[str]) -> None:
served_model_name: list[str]) -> None:
with vllm_runner(model,
dtype=dtype,
disable_log_stats=False,
......@@ -149,10 +154,11 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
metrics_tag_content = stat_logger.labels["model_name"]
if envs.VLLM_CI_USE_S3:
model = f"{MODEL_WEIGHTS_S3_BUCKET}/{model}"
if served_model_name is None or served_model_name == []:
actual_model_name = f"{MODEL_WEIGHTS_S3_BUCKET}/{model.split('/')[-1]}"
assert metrics_tag_content == actual_model_name, (
f"Metrics tag model_name is wrong! expect: {actual_model_name!r}\n"
assert metrics_tag_content == model, (
f"Metrics tag model_name is wrong! expect: {model!r}\n"
f"actual: {metrics_tag_content!r}")
else:
assert metrics_tag_content == served_model_name[0], (
......@@ -178,10 +184,11 @@ async def test_async_engine_log_metrics_regression(
when disable_log_stats=False
(see: https://github.com/vllm-project/vllm/pull/4150#pullrequestreview-2008176678)
"""
engine_args = AsyncEngineArgs(model=model,
dtype=dtype,
disable_log_stats=disable_log_stats,
load_format=RUNAI_STREAMER_LOAD_FORMAT)
engine_args = AsyncEngineArgs(
model=model,
dtype=dtype,
disable_log_stats=disable_log_stats,
)
async_engine = AsyncLLMEngine.from_engine_args(engine_args)
for i, prompt in enumerate(example_prompts):
results = async_engine.generate(
......@@ -193,7 +200,7 @@ async def test_async_engine_log_metrics_regression(
async for _ in results:
pass
assert_metrics(async_engine.engine, disable_log_stats,
assert_metrics(model, async_engine.engine, disable_log_stats,
len(example_prompts))
......@@ -208,10 +215,11 @@ def test_engine_log_metrics_regression(
max_tokens: int,
disable_log_stats: bool,
) -> None:
engine_args = EngineArgs(model=model,
dtype=dtype,
disable_log_stats=disable_log_stats,
load_format=RUNAI_STREAMER_LOAD_FORMAT)
engine_args = EngineArgs(
model=model,
dtype=dtype,
disable_log_stats=disable_log_stats,
)
engine = LLMEngine.from_engine_args(engine_args)
for i, prompt in enumerate(example_prompts):
engine.add_request(
......@@ -222,7 +230,9 @@ def test_engine_log_metrics_regression(
while engine.has_unfinished_requests():
engine.step()
assert_metrics(engine, disable_log_stats, len(example_prompts))
if envs.VLLM_CI_USE_S3:
model = f"{MODEL_WEIGHTS_S3_BUCKET}/{model}"
assert_metrics(model, engine, disable_log_stats, len(example_prompts))
@pytest.mark.parametrize("model", MODELS)
......@@ -289,14 +299,15 @@ def test_metric_spec_decode_interval(
) -> None:
k = 5
engine_args = EngineArgs(model=model,
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4,
speculative_model=model,
num_speculative_tokens=k,
enforce_eager=True,
load_format=RUNAI_STREAMER_LOAD_FORMAT)
engine_args = EngineArgs(
model=model,
dtype=dtype,
disable_log_stats=False,
gpu_memory_utilization=0.4,
speculative_model=model,
num_speculative_tokens=k,
enforce_eager=True,
)
engine = LLMEngine.from_engine_args(engine_args)
......@@ -363,7 +374,7 @@ def test_metric_spec_decode_interval(
cleanup_dist_env_and_memory()
def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
def assert_metrics(model: str, engine: LLMEngine, disable_log_stats: bool,
num_requests: int) -> None:
if disable_log_stats:
with pytest.raises(AttributeError):
......@@ -374,7 +385,7 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
# Ensure the count bucket of request-level histogram metrics matches
# the number of requests as a simple sanity check to ensure metrics are
# generated
labels = {'model_name': engine.model_config.model}
labels = {'model_name': model}
request_histogram_metrics = [
"vllm:e2e_request_latency_seconds",
"vllm:request_prompt_tokens",
......
# SPDX-License-Identifier: Apache-2.0
from typing import Dict, List, Optional
from typing import Optional
from typing_extensions import TypedDict
class ServerConfig(TypedDict, total=False):
model: str
arguments: List[str]
arguments: list[str]
system_prompt: Optional[str]
supports_parallel: Optional[bool]
supports_rocm: Optional[bool]
ARGS: List[str] = ["--max-model-len", "1024"]
ARGS: list[str] = ["--max-model-len", "1024"]
CONFIGS: Dict[str, ServerConfig] = {
CONFIGS: dict[str, ServerConfig] = {
"mistral": {
"model":
"mistralai/Mistral-7B-Instruct-v0.3",
......
# SPDX-License-Identifier: Apache-2.0
from typing import List
import pytest
from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
......@@ -9,7 +7,10 @@ from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.layers.activation import (GeluAndMul,
ReLUSquaredActivation,
SiluAndMul)
from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.layernorm import (
RMSNorm, dispatch_cuda_rmsnorm_func, fused_add_rms_norm, rms_norm,
rocm_aiter_fused_add_rms_norm, rocm_aiter_rms_norm)
from vllm.platforms import current_platform
# Registered subclass for test
......@@ -51,7 +52,7 @@ class Relu3(ReLUSquaredActivation):
# All but RMSNorm
("all,-rms_norm", 4, [0, 1, 1, 1], True),
])
def test_enabled_ops(env: str, torch_level: int, ops_enabled: List[int],
def test_enabled_ops(env: str, torch_level: int, ops_enabled: list[int],
default_on: bool):
vllm_config = VllmConfig(compilation_config=CompilationConfig(
level=torch_level, custom_ops=env.split(",")))
......@@ -89,3 +90,27 @@ def test_enabled_ops_invalid(env: str):
custom_ops=env.split(",")))
with set_current_vllm_config(vllm_config):
RMSNorm(1024).enabled()
@pytest.mark.parametrize("add_residual", [True, False])
@pytest.mark.parametrize("use_rocm_aiter", ["0", "1"])
@pytest.mark.parametrize("use_rocm_aiter_norm", ["0", "1"])
@pytest.mark.skipif(not current_platform.is_rocm(),
                    reason="AITER is a feature exclusive for ROCm")
def test_rms_norm_dispatch(add_residual: bool, use_rocm_aiter: str,
                           use_rocm_aiter_norm: str, monkeypatch):
    """Check which RMSNorm implementation the dispatcher selects.

    The AITER variant is expected only when running on ROCm with both
    ``VLLM_ROCM_USE_AITER`` and ``VLLM_ROCM_USE_AITER_RMSNORM`` set to
    ``"1"``. Otherwise the default implementation is expected. The
    ``add_residual`` flag chooses between the fused-add and plain variants.
    """
    monkeypatch.setenv("VLLM_ROCM_USE_AITER", use_rocm_aiter)
    monkeypatch.setenv("VLLM_ROCM_USE_AITER_RMSNORM", use_rocm_aiter_norm)
    rms_norm_func = dispatch_cuda_rmsnorm_func(add_residual)

    aiter_expected = (current_platform.is_rocm() and int(use_rocm_aiter)
                      and int(use_rocm_aiter_norm))
    if add_residual:
        expected_func = (rocm_aiter_fused_add_rms_norm
                         if aiter_expected else fused_add_rms_norm)
    else:
        expected_func = (rocm_aiter_rms_norm
                         if aiter_expected else rms_norm)
    assert rms_norm_func == expected_func
# SPDX-License-Identifier: Apache-2.0
import json
import pickle
import pytest
......@@ -17,19 +18,38 @@ from vllm.model_executor.guided_decoding.outlines_logits_processors import (
from vllm.sampling_params import GuidedDecodingParams
from ..utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, 'HuggingFaceH4/zephyr-7b-beta')
GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
GUIDED_DECODING_BACKENDS = [
"outlines", "lm-format-enforcer", "xgrammar", "guidance"
]
GUIDED_DECODING_BACKENDS_WITH_REASONING_SUPPORT = ["outlines", "xgrammar"]
REASONING_MODEL_NAME = os.path.join(models_path_prefix, "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
@pytest.fixture(scope="module")
def zephyr_7B_tokenzer():
    """Tokenizer for ``MODEL_NAME``, created once per module.

    Module scope avoids reloading the tokenizer for every test.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    return tokenizer
def test_guided_logits_processors(sample_regex, sample_json_schema):
@pytest.fixture(scope="module")
def deepseek_r1_qwen_tokenizer():
    """Tokenizer for ``REASONING_MODEL_NAME``, created once per module.

    Module scope avoids reloading the tokenizer for every test.
    """
    tokenizer = AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
    return tokenizer
def test_guided_logits_processors(zephyr_7B_tokenzer, sample_regex,
sample_json_schema):
"""Basic unit test for RegexLogitsProcessor and JSONLogitsProcessor."""
tokenizer = AutoTokenizer.from_pretrained(os.path.join(models_path_prefix, 'HuggingFaceH4/zephyr-7b-beta'))
regex_LP = RegexLogitsProcessor(sample_regex, tokenizer)
regex_LP = RegexLogitsProcessor(sample_regex,
zephyr_7B_tokenzer,
reasoner=None)
json_LP = JSONLogitsProcessor(sample_json_schema,
tokenizer,
whitespace_pattern=None)
zephyr_7B_tokenzer,
whitespace_pattern=None,
reasoner=None)
token_ids = tokenizer.encode(
token_ids = zephyr_7B_tokenzer.encode(
f"Give an example IPv4 address with this regex: {sample_regex}")
tensor = torch.rand(32000)
original_tensor = torch.clone(tensor)
......@@ -37,7 +57,7 @@ def test_guided_logits_processors(sample_regex, sample_json_schema):
assert tensor.shape == original_tensor.shape
assert not torch.allclose(tensor, original_tensor)
token_ids = tokenizer.encode(
token_ids = zephyr_7B_tokenzer.encode(
f"Give an employee profile that fits this schema: {sample_json_schema}"
)
tensor = torch.rand(32000)
......@@ -52,7 +72,8 @@ def test_guided_logits_processors(sample_regex, sample_json_schema):
@pytest.mark.parametrize("is_local", [True, False])
async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
sample_regex,
sample_json_schema):
sample_json_schema,
zephyr_7B_tokenzer):
config = ModelConfig(
MODEL_NAME,
......@@ -63,15 +84,14 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
seed=0,
dtype="bfloat16",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
token_ids = tokenizer.encode(
token_ids = zephyr_7B_tokenzer.encode(
f"Give an example IPv4 address with this regex: {sample_regex}")
regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend)
regex_lp = get_local_guided_decoding_logits_processor(
regex_request, tokenizer, config) if is_local else \
regex_request, zephyr_7B_tokenzer, config) if is_local else \
await get_guided_decoding_logits_processor(
regex_request, tokenizer, config)
regex_request, zephyr_7B_tokenzer, config)
assert regex_lp is not None
tensor = torch.rand(32000)
original_tensor = torch.clone(tensor)
......@@ -79,13 +99,85 @@ async def test_guided_logits_processor_black_box(backend: str, is_local: bool,
assert tensor.shape == original_tensor.shape
assert not torch.allclose(tensor, original_tensor)
token_ids = tokenizer.encode(
token_ids = zephyr_7B_tokenzer.encode(
f"Give an employee profile that fits this schema: {sample_json_schema}"
)
json_request = GuidedDecodingParams(json=sample_json_schema,
backend=backend)
json_lp = await get_guided_decoding_logits_processor(
json_request, tokenizer, config)
json_request, zephyr_7B_tokenzer, config)
assert json_lp is not None
tensor = torch.rand(32000)
original_tensor = torch.clone(tensor)
tensor = json_lp(token_ids, tensor)
assert tensor.shape == original_tensor.shape
assert not torch.allclose(tensor, original_tensor)
# Exercises guided-decoding logits processors together with a reasoning
# backend. For a regex request and a JSON-schema request whose prompt ends
# inside an unterminated "<think>" section, the processor must return the
# logits unchanged. A final case builds a JSON processor for a prompt in which
# the section has been closed with "</think>".
@pytest.mark.asyncio
@pytest.mark.parametrize("backend",
GUIDED_DECODING_BACKENDS_WITH_REASONING_SUPPORT)
@pytest.mark.parametrize("is_local", [True, False])
@pytest.mark.parametrize("reasoning_backend", ["deepseek_r1"])
async def test_guided_logits_processor_with_reasoning(
backend: str, is_local: bool, reasoning_backend: str, sample_regex,
sample_json_schema, deepseek_r1_qwen_tokenizer):
config = ModelConfig(
REASONING_MODEL_NAME,
task="generate",
tokenizer=REASONING_MODEL_NAME,
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="bfloat16",
)
# Case 1: regex request; the prompt stops inside an open "<think>" section.
token_ids = deepseek_r1_qwen_tokenizer.encode(
f"Give an example IPv4 address with this regex: {sample_regex}."
"<think>here is the thinking process")
regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend)
# is_local picks the directly-called local factory; otherwise the awaited
# factory is used. Both receive the same arguments.
regex_lp = get_local_guided_decoding_logits_processor(regex_request,
deepseek_r1_qwen_tokenizer, config,
reasoning_backend) if is_local else \
await get_guided_decoding_logits_processor(
regex_request, deepseek_r1_qwen_tokenizer, config,
reasoning_backend)
assert regex_lp is not None
# NOTE(review): 32000 is a hard-coded logits width - confirm it matches the
# vocabulary size of the tokenizer used in this test.
tensor = torch.rand(32000)
original_tensor = torch.clone(tensor)
tensor = regex_lp(token_ids, tensor)
assert tensor.shape == original_tensor.shape
# Reasoning is still in progress, so the logits are expected to be untouched.
assert torch.allclose(tensor, original_tensor)
# Case 2: JSON-schema request; the prompt again stops inside "<think>".
token_ids = deepseek_r1_qwen_tokenizer.encode(
f"Give an employee profile that fits this schema: {sample_json_schema}."
"<think>here is the thinking process")
json_request = GuidedDecodingParams(json=sample_json_schema,
backend=backend)
json_lp = get_local_guided_decoding_logits_processor(
json_request, deepseek_r1_qwen_tokenizer, config,
reasoning_backend) if is_local else \
await get_guided_decoding_logits_processor(
json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend)
assert json_lp is not None
tensor = torch.rand(32000)
original_tensor = torch.clone(tensor)
tensor = json_lp(token_ids, tensor)
assert tensor.shape == original_tensor.shape
# Same expectation as case 1: no change while the section is still open.
assert torch.allclose(tensor, original_tensor)
# Thinking is over, so the tensor should change.
# Case 3: same JSON-schema request, but the prompt now contains "</think>".
token_ids = deepseek_r1_qwen_tokenizer.encode(
f"Give an employee profile that fits this schema: {sample_json_schema}."
"<think>here is the thinking process</think> Then")
json_request = GuidedDecodingParams(json=sample_json_schema,
backend=backend)
json_lp = get_local_guided_decoding_logits_processor(
json_request, deepseek_r1_qwen_tokenizer, config,
reasoning_backend) if is_local else \
await get_guided_decoding_logits_processor(
json_request, deepseek_r1_qwen_tokenizer, config, reasoning_backend)
assert json_lp is not None
tensor = torch.rand(32000)
original_tensor = torch.clone(tensor)
......@@ -112,9 +204,17 @@ def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex):
GuidedDecodingParams(json=sample_json_schema, grammar="test grammar")
def test_pickle_xgrammar_tokenizer_data():
def test_guided_decoding_backend_options():
    """Options appended to the backend name after ':' are exposed by the params.

    Covers both the parsed option list and the ``no-fallback`` flag.
    """
    with_options = GuidedDecodingParams(
        backend="xgrammar:option-1,option-2,option-3")
    expected_options = ["option-1", "option-2", "option-3"]
    assert with_options.backend_options() == expected_options

    # "no-fallback" given as one of the options must switch the flag on.
    strict = GuidedDecodingParams(backend="xgrammar:option-1,no-fallback")
    assert strict.no_fallback()
# TODO: move to another test file for xgrammar
def test_pickle_xgrammar_tokenizer_data():
try:
import xgrammar as xgr
except ImportError:
......@@ -122,7 +222,11 @@ def test_pickle_xgrammar_tokenizer_data():
from vllm.model_executor.guided_decoding.xgrammar_decoding import (
TokenizerData)
tokenizer_data = TokenizerData(vocab_type=xgr.VocabType.RAW)
tokenizer_data = TokenizerData(
metadata=
'{"vocab_type":2,"vocab_size":151665,"add_prefix_space":false,"stop_token_ids":[151645]}',
encoded_vocab=['!', '"', '#', '$', '%'],
)
pickled = pickle.dumps(tokenizer_data)
assert pickled is not None
......@@ -130,4 +234,5 @@ def test_pickle_xgrammar_tokenizer_data():
depickled: TokenizerData = pickle.loads(pickled)
assert depickled is not None
assert depickled.vocab_type == xgr.VocabType.RAW
assert json.loads(
depickled.metadata)['vocab_type'] == xgr.VocabType.BYTE_LEVEL.value
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment