"vllm/vscode:/vscode.git/clone" did not exist on "34f093b417d492d9cba2d9b54d126a2d87e7e012"
Commit cc7f22a8 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.1' into v0.9.1-ori

parents b9ea0c09 b6553be1
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os import os
import time import time
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import tempfile import tempfile
from collections import OrderedDict from collections import OrderedDict
...@@ -163,11 +164,6 @@ def mixtral_lora_files(): ...@@ -163,11 +164,6 @@ def mixtral_lora_files():
return snapshot_download(repo_id="SangBinCho/mixtral-lora") return snapshot_download(repo_id="SangBinCho/mixtral-lora")
@pytest.fixture(scope="session")
def gemma_lora_files():
return snapshot_download(repo_id="wskwon/gemma-7b-test-lora")
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def chatglm3_lora_files(): def chatglm3_lora_files():
return snapshot_download(repo_id="jeeejeee/chatglm3-text2sql-spider") return snapshot_download(repo_id="jeeejeee/chatglm3-text2sql-spider")
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio import asyncio
import time import time
...@@ -6,6 +7,8 @@ import pytest ...@@ -6,6 +7,8 @@ import pytest
import vllm.envs as env import vllm.envs as env
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
from vllm.inputs import TextPrompt from vllm.inputs import TextPrompt
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
...@@ -16,14 +19,6 @@ LORA_RANK = 64 ...@@ -16,14 +19,6 @@ LORA_RANK = 64
DEFAULT_MAX_LORAS = 4 * 3 DEFAULT_MAX_LORAS = 4 * 3
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def get_lora_requests(lora_path) -> list[LoRARequest]: def get_lora_requests(lora_path) -> list[LoRARequest]:
lora_requests: list[LoRARequest] = [ lora_requests: list[LoRARequest] = [
LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=lora_path) LoRARequest(lora_name=f"{i}", lora_int_id=i, lora_path=lora_path)
...@@ -88,17 +83,6 @@ async def test_add_lora(chatglm3_lora_files): ...@@ -88,17 +83,6 @@ async def test_add_lora(chatglm3_lora_files):
trust_remote_code=True, trust_remote_code=True,
enforce_eager=True) enforce_eager=True)
# The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
# environment variable. Reload vllm.engine.async_llm_engine as
# vllm.engine.async_llm_engine.AsyncLLMEngine changes depending on the
# env var.
import importlib
import vllm.engine.async_llm_engine
importlib.reload(vllm.engine.async_llm_engine)
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
# split lora_requests into 3 parts # split lora_requests into 3 parts
part_size = len(lora_requests) // 3 part_size = len(lora_requests) // 3
dummy_run_requests = lora_requests[:part_size] dummy_run_requests = lora_requests[:part_size]
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest import pytest
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import vllm import vllm
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
...@@ -18,14 +17,6 @@ EXPECTED_LORA_OUTPUT = [ ...@@ -18,14 +17,6 @@ EXPECTED_LORA_OUTPUT = [
] ]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [ prompts = [
PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format(query="How many singers do we have?"),
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import random import random
from copy import deepcopy from copy import deepcopy
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import subprocess import subprocess
import sys import sys
from typing import Union from typing import Union
import pytest
import ray
import vllm import vllm
from vllm import LLM from vllm import LLM
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
...@@ -33,14 +31,6 @@ EXPECTED_LORA_OUTPUT = [ ...@@ -33,14 +31,6 @@ EXPECTED_LORA_OUTPUT = [
] ]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def do_sample(llm: vllm.LLM, def do_sample(llm: vllm.LLM,
lora_path: str, lora_path: str,
lora_id: int, lora_id: int,
...@@ -128,37 +118,6 @@ def test_llama_lora(sql_lora_files): ...@@ -128,37 +118,6 @@ def test_llama_lora(sql_lora_files):
generate_and_test(llm, sql_lora_files) generate_and_test(llm, sql_lora_files)
# Skipping for v1 as v1 doesn't have a good way to expose the num_gpu_blocks
# used by the engine yet.
@pytest.mark.skip_v1
@create_new_process_for_each_test()
def test_llama_lora_warmup(sql_lora_files):
"""Test that the LLM initialization works with a warmup LORA path and
is more conservative"""
@ray.remote(num_gpus=1)
def get_num_gpu_blocks_lora():
llm = vllm.LLM(MODEL_PATH, enable_lora=True, max_num_seqs=16)
num_gpu_blocks_lora_warmup = llm.llm_engine.cache_config.num_gpu_blocks
return num_gpu_blocks_lora_warmup
@ray.remote(num_gpus=1)
def get_num_gpu_blocks_no_lora():
llm = vllm.LLM(MODEL_PATH, max_num_seqs=16)
num_gpu_blocks_no_lora_warmup = (
llm.llm_engine.cache_config.num_gpu_blocks)
return num_gpu_blocks_no_lora_warmup
num_gpu_blocks_lora_warmup = ray.get(get_num_gpu_blocks_lora.remote())
num_gpu_blocks_no_lora_warmup = ray.get(
get_num_gpu_blocks_no_lora.remote())
assert num_gpu_blocks_lora_warmup < num_gpu_blocks_no_lora_warmup, (
"The warmup with lora should be more "
"conservative than without lora, therefore the number of "
"memory blocks for the KV cache should be "
"less when using lora than when not using lora")
@multi_gpu_test(num_gpus=4) @multi_gpu_test(num_gpus=4)
@create_new_process_for_each_test() @create_new_process_for_each_test()
def test_llama_lora_tp4(sql_lora_files): def test_llama_lora_tp4(sql_lora_files):
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest import pytest
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest import pytest
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
""" """
Script to test add_lora, remove_lora, pin_lora, list_loras functions. Script to test add_lora, remove_lora, pin_lora, list_loras functions.
""" """
import os
import pytest import pytest
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.llm_engine import LLMEngine from vllm.engine.llm_engine import LLMEngine
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
MODEL_PATH = "meta-llama/Llama-2-7b-hf" MODEL_PATH = "meta-llama/Llama-2-7b-hf"
...@@ -16,14 +16,6 @@ LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test" ...@@ -16,14 +16,6 @@ LORA_MODULE_PATH = "yard1/llama-2-7b-sql-lora-test"
LORA_RANK = 8 LORA_RANK = 8
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def make_lora_request(lora_id: int): def make_lora_request(lora_id: int):
return LoRARequest(lora_name=f"{lora_id}", return LoRARequest(lora_name=f"{lora_id}",
lora_int_id=lora_id, lora_int_id=lora_id,
...@@ -79,22 +71,6 @@ def test_lora_functions_sync(): ...@@ -79,22 +71,6 @@ def test_lora_functions_sync():
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_lora_functions_async(): async def test_lora_functions_async():
if os.getenv("VLLM_USE_V1") == "0":
pytest.skip(
reason=
"V0 AsyncLLMEngine does not expose remove/list/pin LoRA functions")
# The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
# environment variable. Reload vllm.engine.async_llm_engine as
# vllm.engine.async_llm_engine.AsyncLLMEngine changes depending on the
# env var.
import importlib
import vllm.engine.async_llm_engine
importlib.reload(vllm.engine.async_llm_engine)
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
max_loras = 4 max_loras = 4
engine_args = AsyncEngineArgs(model=MODEL_PATH, engine_args = AsyncEngineArgs(model=MODEL_PATH,
enable_lora=True, enable_lora=True,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest import pytest
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os import os
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest import pytest
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest import pytest
import torch import torch
...@@ -10,14 +11,6 @@ from vllm.platforms import current_platform ...@@ -10,14 +11,6 @@ from vllm.platforms import current_platform
MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1" MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
prompts: list[str]) -> list[str]: prompts: list[str]) -> list[str]:
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import json import json
import math import math
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest import pytest
...@@ -10,14 +11,6 @@ MODEL_PATH = "microsoft/phi-2" ...@@ -10,14 +11,6 @@ MODEL_PATH = "microsoft/phi-2"
PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501 PROMPT_TEMPLATE = "### Instruct: {sql_prompt}\n\n### Context: {context}\n\n### Output:" # noqa: E501
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [ prompts = [
PROMPT_TEMPLATE.format( PROMPT_TEMPLATE.format(
...@@ -58,7 +51,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: ...@@ -58,7 +51,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
# Skipping for V1 for now as we are hitting, # Skipping for V1 for now as we are hitting,
# "Head size 80 is not supported by FlashAttention." error. # "Head size 80 is not supported by FlashAttention." error.
@pytest.mark.skip_v1 @pytest.mark.skip(reason="Head size 80 is not supported by FlashAttention")
def test_phi2_lora(phi2_lora_files): def test_phi2_lora(phi2_lora_files):
# We enable enforce_eager=True here to reduce VRAM usage for lora-test CI, # We enable enforce_eager=True here to reduce VRAM usage for lora-test CI,
# Otherwise, the lora-test will fail due to CUDA OOM. # Otherwise, the lora-test will fail due to CUDA OOM.
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from threading import Lock from threading import Lock
import pytest import pytest
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from # Adapted from
# https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py # https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/tests/lora/test_llama.py
...@@ -24,27 +25,19 @@ if current_platform.is_rocm(): ...@@ -24,27 +25,19 @@ if current_platform.is_rocm():
MODELS = [ MODELS = [
ModelWithQuantization( ModelWithQuantization(
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
quantization="GPTQ"), quantization="gptq"),
] ]
else: else:
MODELS = [ MODELS = [
ModelWithQuantization( ModelWithQuantization(
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
quantization="AWQ"), quantization="awq"),
ModelWithQuantization( ModelWithQuantization(
model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
quantization="GPTQ"), quantization="gptq"),
] ]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
def do_sample(llm: vllm.LLM, def do_sample(llm: vllm.LLM,
lora_path: str, lora_path: str,
lora_id: int, lora_id: int,
...@@ -100,7 +93,7 @@ def test_quant_model_lora(tinyllama_lora_files, model): ...@@ -100,7 +93,7 @@ def test_quant_model_lora(tinyllama_lora_files, model):
"#ff8050", "#ff8050",
"#ff8080", "#ff8080",
] ]
elif model.quantization == "AWQ": elif model.quantization == "awq":
expected_no_lora_output = [ expected_no_lora_output = [
"I'm sorry, I don't understand", "I'm sorry, I don't understand",
"I'm sorry, I don't understand", "I'm sorry, I don't understand",
...@@ -109,7 +102,7 @@ def test_quant_model_lora(tinyllama_lora_files, model): ...@@ -109,7 +102,7 @@ def test_quant_model_lora(tinyllama_lora_files, model):
"#f07700: A v", "#f07700: A v",
"#f00000: A v", "#f00000: A v",
] ]
elif model.quantization == "GPTQ": elif model.quantization == "gptq":
expected_no_lora_output = [ expected_no_lora_output = [
"I'm sorry, I don't have", "I'm sorry, I don't have",
"I'm sorry, I don't have", "I'm sorry, I don't have",
...@@ -122,7 +115,7 @@ def test_quant_model_lora(tinyllama_lora_files, model): ...@@ -122,7 +115,7 @@ def test_quant_model_lora(tinyllama_lora_files, model):
def expect_match(output, expected_output): def expect_match(output, expected_output):
# HACK: GPTQ lora outputs are just incredibly unstable. # HACK: GPTQ lora outputs are just incredibly unstable.
# Assert that the outputs changed. # Assert that the outputs changed.
if (model.quantization == "GPTQ" if (model.quantization == "gptq"
and expected_output is expected_lora_output): and expected_output is expected_lora_output):
assert output != expected_no_lora_output assert output != expected_no_lora_output
for i, o in enumerate(output): for i, o in enumerate(output):
...@@ -172,7 +165,7 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, ...@@ -172,7 +165,7 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
model): model):
if num_gpus_available < 2: if num_gpus_available < 2:
pytest.skip(f"Not enough GPUs for tensor parallelism {2}") pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
if model.quantization == "GPTQ": if model.quantization == "gptq":
pytest.skip("GPTQ lora outputs are just incredibly unstable") pytest.skip("GPTQ lora outputs are just incredibly unstable")
llm_tp1 = vllm.LLM( llm_tp1 = vllm.LLM(
model=model.model_path, model=model.model_path,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional from typing import Optional
...@@ -10,14 +11,7 @@ import vllm ...@@ -10,14 +11,7 @@ import vllm
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.sampling_params import BeamSearchParams
@pytest.fixture(autouse=not current_platform.is_cpu())
def v1(run_with_both_engines_lora):
# Simple autouse wrapper to run both engines for each test
# This can be promoted up to conftest.py to run for every
# test in a package
pass
@dataclass @dataclass
...@@ -69,7 +63,7 @@ class Qwen2VLTester: ...@@ -69,7 +63,7 @@ class Qwen2VLTester:
expected_outputs: list[str], expected_outputs: list[str],
lora_id: Optional[int] = None, lora_id: Optional[int] = None,
temperature: float = 0, temperature: float = 0,
max_tokens: int = 5) -> list[str]: max_tokens: int = 5):
sampling_params = vllm.SamplingParams( sampling_params = vllm.SamplingParams(
temperature=temperature, temperature=temperature,
...@@ -97,7 +91,35 @@ class Qwen2VLTester: ...@@ -97,7 +91,35 @@ class Qwen2VLTester:
generated), f"Generated text {generated} doesn't " generated), f"Generated text {generated} doesn't "
f"match expected pattern {expected}" f"match expected pattern {expected}"
return generated_texts def run_beam_search_test(self,
images: list[ImageAsset],
expected_outputs: list[list[str]],
lora_id: Optional[int] = None,
temperature: float = 0,
beam_width: int = 2,
max_tokens: int = 5):
beam_search_params = BeamSearchParams(beam_width=beam_width,
max_tokens=max_tokens,
temperature=temperature)
inputs = [{
"prompt": self.PROMPT_TEMPLATE,
"multi_modal_data": {
"image": asset.pil_image
},
} for asset in images]
lora_request = LoRARequest(str(lora_id), lora_id,
self.config.lora_path)
outputs = self.llm.beam_search(inputs,
beam_search_params,
lora_request=lora_request)
for output_obj, expected_outs in zip(outputs, expected_outputs):
output_texts = [seq.text for seq in output_obj.sequences]
assert output_texts == expected_outs, \
f"Generated texts {output_texts} do not match expected {expected_outs}" # noqa: E501
TEST_IMAGES = [ TEST_IMAGES = [
...@@ -110,6 +132,14 @@ EXPECTED_OUTPUTS = [ ...@@ -110,6 +132,14 @@ EXPECTED_OUTPUTS = [
"A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.", # noqa: E501 "A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.", # noqa: E501
] ]
# NOTE - beam search .text contains the whole text
EXPECTED_BEAM_SEARCH_OUTPUTS = [
[
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is in the image?<|im_end|>\n<|im_start|>assistant\nA majestic skyscraper stands", # noqa: E501
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is in the image?<|im_end|>\n<|im_start|>assistant\nA majestic tower stands tall", # noqa: E501
],
]
QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct" QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct" QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
...@@ -130,6 +160,27 @@ def test_qwen2vl_lora(qwen2vl_lora_files): ...@@ -130,6 +160,27 @@ def test_qwen2vl_lora(qwen2vl_lora_files):
lora_id=lora_id) lora_id=lora_id)
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="Qwen2-VL dependency xformers incompatible with ROCm")
def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
"""Test Qwen 2.0 VL model with LoRA through beam search."""
config = TestConfig(model_path=QWEN2VL_MODEL_PATH,
lora_path=qwen2vl_lora_files)
tester = Qwen2VLTester(config)
# Test with different LoRA IDs
for lora_id in [1, 2]:
# NOTE: currently, we only test cherry blossom since the stop sign
# output is slightly different for v1; the root cause is likely
# independent of the intent of this test, which is to ensure beam
# search passes lora through correctly.
tester.run_beam_search_test(
[ImageAsset("cherry_blossom")],
expected_outputs=EXPECTED_BEAM_SEARCH_OUTPUTS,
lora_id=lora_id)
@pytest.mark.xfail( @pytest.mark.xfail(
current_platform.is_rocm(), current_platform.is_rocm(),
reason="Qwen2.5-VL dependency xformers incompatible with ROCm", reason="Qwen2.5-VL dependency xformers incompatible with ROCm",
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional from typing import Optional
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment