Commit 04629132 authored by zhuwenwen's avatar zhuwenwen
Browse files

[tests] fix tests

parent 07c69390
# SPDX-License-Identifier: Apache-2.0
import os
from ..utils import models_path_prefix
from ..entrypoints.openai.test_oot_registration import (
run_and_test_dummy_opt_api_server)
def test_distributed_oot(dummy_opt_path: str):
run_and_test_dummy_opt_api_server(dummy_opt_path, tp=2)
dummy_opt_path = os.path.join(models_path_prefix, "facebook/opt-125m")
run_and_test_dummy_opt_api_server(dummy_opt_path, tp=2)
\ No newline at end of file
......@@ -20,7 +20,8 @@ from ..utils import models_path_prefix
from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP
LIST_ENC_DEC_SUPPORTED_BACKENDS = [
_Backend.XFORMERS, _Backend.FLASH_ATTN, None
# _Backend.XFORMERS, _Backend.FLASH_ATTN, None
_Backend.FLASH_ATTN, _Backend.ROCM_FLASH,None
]
......@@ -57,7 +58,7 @@ def clear_cache():
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/bart-large-cnn")])
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS)
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
......@@ -131,4 +132,4 @@ def test_encoder_decoder_e2e(
name_0="hf",
name_1="vllm",
num_outputs_0_skip_tokens=hf_skip_tokens,
)
)
\ No newline at end of file
......@@ -84,4 +84,4 @@ def test_lm_eval_accuracy_v0_engine(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "0")
run_test("Qwen/Qwen2-1.5B-Instruct")
run_test(os.path.join(models_path_prefix,"Qwen/Qwen2-1.5B-Instruct"))
\ No newline at end of file
......@@ -65,7 +65,7 @@ def test_multi_chat():
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(image_urls: list[str]):
llm = LLM(
model="microsoft/Phi-3.5-vision-instruct",
model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
max_model_len=4096,
max_num_seqs=5,
enforce_eager=True,
......@@ -98,7 +98,7 @@ def test_llm_chat_tokenization_no_double_bos():
LLM.chat() should not add special tokens when using chat templates.
Check we get a single BOS token for llama chat.
"""
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct", enforce_eager=True)
llm = LLM(model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), enforce_eager=True)
messages = [
{
"role": "system",
......
......@@ -16,7 +16,7 @@ def v1(run_with_both_engines):
def test_empty_prompt():
llm = LLM(model=os.path.join(models_path_prefix, "openai-community/gpt2"),, enforce_eager=True)
llm = LLM(model=os.path.join(models_path_prefix, "openai-community/gpt2"), enforce_eager=True)
with pytest.raises(ValueError, match='decoder prompt cannot be empty'):
llm.generate([""])
......
# SPDX-License-Identifier: Apache-2.0
import os
import asyncio
from contextlib import suppress
from dataclasses import dataclass
......@@ -272,4 +273,4 @@ def test_serving_chat_could_load_correct_generation_config():
asyncio.run(serving_chat.create_chat_completion(req))
assert mock_engine.generate.call_args.args[1].temperature == 0.0
assert mock_engine.generate.call_args.args[1].repetition_penalty == 1.05
assert mock_engine.generate.call_args.args[1].repetition_penalty == 1.05
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
import os
import requests
from ...utils import RemoteOpenAIServer
from ...utils import RemoteOpenAIServer, models_path_prefix
MODEL_NAME = "meta-llama/Llama-3.2-1B"
MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
def test_sleep_mode():
......@@ -58,4 +59,4 @@ def test_sleep_mode():
response = requests.get(remote_server.url_for("is_sleeping"))
assert response.status_code == 200
assert response.json().get("is_sleeping") is False
assert response.json().get("is_sleeping") is False
\ No newline at end of file
......@@ -20,12 +20,12 @@ vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec.jinja"
assert vlm2vec_jinja_path.exists()
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
"https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
]
# TEST_IMAGE_URLS = [
# "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
# "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
# "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
# "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
# ]
TEST_IMAGE_URLS = [
f"http://localhost:{urls_port}/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
......
File mode changed from 100755 to 100644
......@@ -2,6 +2,7 @@
import asyncio
import time
import os
import pytest
import vllm.envs as env
......@@ -10,8 +11,9 @@ from vllm.inputs import TextPrompt
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams
from vllm.utils import merge_async_iterators
from ..utils import models_path_prefix
MODEL_PATH = "THUDM/chatglm3-6b"
MODEL_PATH = os.path.join(models_path_prefix, "THUDM/chatglm3-6b")
LORA_RANK = 64
DEFAULT_MAX_LORAS = 4 * 3
......@@ -134,4 +136,4 @@ async def test_add_lora(chatglm3_lora_files):
f"time_with_add_lora={time_with_add_lora}, "
f"time_cold_start={time_cold_start}"
"The engine request processing time with LoRA pre-loading "
"must be less than the version that does on-demand LoRA loading.")
"must be less than the version that does on-demand LoRA loading.")
\ No newline at end of file
......@@ -9,7 +9,7 @@ from vllm.lora.request import LoRARequest
from ..utils import models_path_prefix
MODEL_PATH = os.path.join(models_path_prefix, "baichuan-inc/Baichuan-7B")
MODEL_PATH = "baichuan-inc/Baichuan-7B"
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
......
# SPDX-License-Identifier: Apache-2.0
import pytest
import os
import vllm
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform
from ..utils import models_path_prefix
# Path of the base model: `models_path_prefix` joined with the model id.
MODEL_PATH = os.path.join(models_path_prefix, "google/gemma-7b")
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
    """Complete three fixed quote prompts and return the stripped texts.

    Sampling uses ``temperature=0`` and ``max_tokens=32``. A ``LoRARequest``
    named ``str(lora_id)`` pointing at ``lora_path`` is attached when
    ``lora_id`` is truthy; otherwise no LoRA request is passed.
    """
    prompts = [
        "Quote: Imagination is",
        "Quote: Be yourself;",
        "Quote: Painting is poetry that is seen rather than felt,",
    ]
    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)

    lora_request = None
    if lora_id:
        lora_request = LoRARequest(str(lora_id), lora_id, lora_path)
    request_outputs = llm.generate(prompts,
                                   sampling_params,
                                   lora_request=lora_request)

    # Echo every prompt/completion pair so failures are easy to inspect.
    generated_texts: list[str] = []
    for request_output in request_outputs:
        prompt = request_output.prompt
        text = request_output.outputs[0].text.strip()
        generated_texts.append(text)
        print(f"Prompt: {prompt!r}, Generated text: {text!r}")
    return generated_texts
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` for each test.

    It only exists to pull that fixture into every test of this module; it
    could be promoted to conftest.py to cover a whole package instead.
    """
    return None
# The V1 lora test for this model requires more than 24GB.
@pytest.mark.skip_v1
@pytest.mark.xfail(current_platform.is_rocm(),
                   reason="There can be output mismatch on ROCm")
def test_gemma_lora(gemma_lora_files):
    """Check that LoRA ids 1 and 2 both yield the expected quote completions.

    Each generated text must start with the corresponding expected string.
    """
    llm = vllm.LLM(
        MODEL_PATH,
        max_model_len=1024,
        enable_lora=True,
        max_loras=4,
        enable_chunked_prefill=True,
    )

    expected_lora_output = [
        "more important than knowledge.\nAuthor: Albert Einstein\n",
        "everyone else is already taken.\nAuthor: Oscar Wilde\n",
        "and poetry is painting that is felt rather than seen.\n"
        "Author: Leonardo da Vinci\n",
    ]

    # The same adapter files are registered under two ids; both must agree
    # with the expected prefixes.
    for lora_id in (1, 2):
        generated = do_sample(llm, gemma_lora_files, lora_id=lora_id)
        for i, expected in enumerate(expected_lora_output):
            assert generated[i].startswith(expected)
# SPDX-License-Identifier: Apache-2.0
import ast
from typing import Optional
import numpy as np
import pytest
import os
import vllm
from vllm import SamplingParams
from vllm.lora.layers import LinearScalingRotaryEmbeddingWithLoRA
from vllm.lora.request import LoRARequest
from vllm.model_executor.layers.rotary_embedding import (
LinearScalingRotaryEmbedding)
from .data.long_context_test_data import prompts_and_responses
from ..utils import models_path_prefix
# Maps a LoRA's context-length label to its scaling factor. The factor is
# multiplied by 4096 when a LoRARequest is built in this module.
context_len_to_scaling_factor = {
"16k": 4,
"32k": 8,
}
# We use the same sampling params for all requests
# (temperature=0, at most 100 generated tokens).
sampling_params = SamplingParams(
temperature=0,
max_tokens=100,
)
def _create_lora_request(lora_id, long_context_infos):
    """Build the ``LoRARequest`` for one entry of ``long_context_infos``.

    The entry's ``"context_length"`` label selects the scaling factor, and
    ``4096 * scaling_factor`` is passed as the request's fifth positional
    argument. The entry's ``"lora"`` value is passed as the third.
    """
    info = long_context_infos[lora_id]
    context_len = info["context_length"]
    scaling_factor = context_len_to_scaling_factor[context_len]
    # There are 2 LoRAs for 16K, so the id is appended to the label to keep
    # the request names distinct.
    lora_name = context_len + str(lora_id)
    return LoRARequest(
        lora_name,
        lora_id,
        info["lora"],
        None,
        4096 * scaling_factor,
    )
def evaluate_json_response(model_response, golden_response):
    """Evaluates the model response against the golden response.

    Returns a score between 0 and 1, where 1 is a perfect match and 0 is no
    match. The score quantifies how well the model is able to extract the
    golden JSON from the long context.

    Args:
        model_response: Text produced by the model; parsed with
            ``ast.literal_eval``.
        golden_response: Dict of expected attributes. Values may themselves
            be dicts (at most two levels are compared).

    Raises:
        ValueError: If ``model_response`` cannot be parsed as a literal.
        ZeroDivisionError: If ``golden_response`` contains no values to
            compare.
    """
    try:
        model_response = ast.literal_eval(model_response)
    except Exception as e:
        raise ValueError(
            f"Model response is not a valid JSON. Expected {golden_response}, "
            f"got {model_response}") from e

    # A response can parse fine and still not be a dict (e.g. "5" or a
    # list). Treat it as "nothing extracted" so it scores 0 instead of
    # raising TypeError on the membership / indexing below.
    if not isinstance(model_response, dict):
        model_response = {}

    # Normally, we would flatten the dictionary and compare the values, but in
    # this case, we know that the dictionary is only 2 levels deep
    positive_values = 0
    total_values = 0
    # We look at all the attributes of the person that we are extracting a
    # biography of and compare them to the golden response
    for person_attribute, person_attribute_value in golden_response.items():
        if person_attribute not in model_response:
            # We count a missing sub-dict as a single missed value.
            total_values += 1
            continue

        model_value = model_response[person_attribute]
        if isinstance(person_attribute_value, dict):
            # If the model produced a scalar/string where a sub-dict was
            # expected, every expected sub-attribute counts as a miss.
            model_sub_values = (model_value
                                if isinstance(model_value, dict) else {})
            for (sub_attribute,
                 sub_attribute_value) in person_attribute_value.items():
                total_values += 1
                if (sub_attribute in model_sub_values
                        and model_sub_values[sub_attribute]
                        == sub_attribute_value):
                    positive_values += 1
        else:
            total_values += 1
            if model_value == person_attribute_value:
                positive_values += 1

    # Return a score between 0 and 1
    return positive_values / total_values
def generate(
    llm: vllm.LLM,
    inputs: tuple[str, SamplingParams, Optional[LoRARequest]],
):
    """Run one ``(prompt, sampling_params, lora_request)`` triple.

    Returns the stripped text of the first completion of the first output.
    """
    prompts, sampling_param, lora_request = inputs
    request_outputs = llm.generate(prompts,
                                   sampling_param,
                                   lora_request=lora_request)
    first_completion = request_outputs[0].outputs[0]
    return first_completion.text.strip()
def batched_generate(
    llm: vllm.LLM,
    inputs: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
):
    """Queue every ``(prompt, sampling_params, lora_request)`` triple, then
    run the engine once.

    Returns the stripped text of the first completion of each output, in the
    order the engine returns them.
    """
    # Add requests to the engine and run the engine
    for prompt, sampling_param, lora_req in inputs:
        llm._validate_and_add_requests(prompt,
                                       sampling_param,
                                       lora_request=lora_req,
                                       prompt_adapter_request=None)

    request_outputs = llm._run_engine(use_tqdm=True)
    return [
        request_outputs[i].outputs[0].text.strip()
        for i in range(len(request_outputs))
    ]
@pytest.fixture(scope="module")
def lora_llm(long_context_infos):
    """Module-scoped LoRA-enabled LLM shared by the long-context tests.

    The scaling factors passed to the engine are derived from the
    ``"context_length"`` of every entry in ``long_context_infos``.
    """
    scaling_factors = tuple(
        context_len_to_scaling_factor[info["context_length"]]
        for info in long_context_infos.values())
    model_path = os.path.join(models_path_prefix,
                              "meta-llama/Llama-2-13b-chat-hf")

    llm = vllm.LLM(
        model_path,
        enable_lora=True,
        max_num_seqs=16,
        max_loras=2,
        long_lora_scaling_factors=scaling_factors,
        max_num_batched_tokens=4096 * 8,
        tensor_parallel_size=4,
        # FIXME enable async output processor
        disable_async_output_proc=True,
        distributed_executor_backend="mp",
        enable_chunked_prefill=True,
    )
    yield llm
    # Drop the local reference once the module's tests are done.
    del llm
def test_rotary_emb_replaced(dist_init):
    """Verify rotary emb in all the layers are replaced.

    Every module whose name contains ``rotary_emb`` must be a
    ``LinearScalingRotaryEmbeddingWithLoRA`` wrapper, and its ``base_layer``
    child must be a ``LinearScalingRotaryEmbedding``. The number of wrappers
    must match the number of hidden layers of the loaded model.
    """
    from vllm.engine.arg_utils import EngineArgs
    from vllm.worker.model_runner import ModelRunner

    engine_args = EngineArgs(os.path.join(models_path_prefix,
                                          "meta-llama/Llama-2-13b-chat-hf"),
                             long_lora_scaling_factors=(4.0, ),
                             enable_lora=True)
    engine_config = engine_args.create_engine_config()
    model_runner = ModelRunner(
        vllm_config=engine_config,
        is_driver_worker=True,
    )
    model_runner.load_model()

    rotary_emb_count = 0
    for module_name, module in model_runner.model.named_modules(
            remove_duplicate=False):
        if "rotary_emb" not in module_name:
            continue
        if "base_layer" not in module_name:
            rotary_emb_count += 1
            assert isinstance(module, LinearScalingRotaryEmbeddingWithLoRA)
        else:
            assert isinstance(module, LinearScalingRotaryEmbedding)

    # One rotary embedding per decoder layer. Read the layer count from the
    # loaded config rather than hard-coding it: the checkpoint used here is
    # the 13B model, so a fixed 32 (the 7B depth) would not match.
    expected_layers = engine_config.model_config.hf_config.num_hidden_layers
    assert rotary_emb_count == expected_layers
@pytest.mark.skip_global_cleanup
def test_batched_rope_kernel(lora_llm, long_context_infos):
    """We test the batched kernel by comparing the results of batched and
    non-batched generation.
    """
    # Create non batched results first to compare against batched results
    non_batched_results: list[str] = []
    for lora_id, info in long_context_infos.items():
        first_prompt = prompts_and_responses[
            info["context_length"]][0]["prompt"]
        request = _create_lora_request(lora_id, long_context_infos)
        non_batched_results.append(
            generate(lora_llm, (first_prompt, sampling_params, request)))

    # Create batched results
    # Each element of the batch must be
    # (prompt, prompt_sampling_params, prompt_lora_request)
    batched_prompts: list[tuple[str, SamplingParams,
                                Optional[LoRARequest]]] = [
                                    (prompts_and_responses[
                                        info["context_length"]][0]["prompt"],
                                     sampling_params,
                                     _create_lora_request(
                                         lora_id, long_context_infos))
                                    for lora_id, info in
                                    long_context_infos.items()
                                ]
    batched_results = batched_generate(lora_llm, batched_prompts)

    # Results should be the same
    for non_batched, batched in zip(non_batched_results, batched_results):
        assert non_batched == batched, (
            "Non batched and batched results should be the "
            f"same:\n{batched}\n{non_batched}")
@pytest.mark.skip_global_cleanup
def test_self_consistency(lora_llm, long_context_infos):
    """We test consistency of the batched kernel by permuting batched
    inputs and comparing the results to the non-permuted batched results.
    """
    lora_items = list(long_context_infos.items())
    num_loras = len(lora_items)

    def _build_batch(
        order
    ) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
        # One (prompt, sampling_params, lora_request) triple per index in
        # ``order``, using the first prompt of each LoRA's context length.
        batch: list[tuple[str, SamplingParams, Optional[LoRARequest]]] = []
        for index in order:
            lora_id, info = lora_items[index]
            context_len = info["context_length"]
            batch.append(
                (prompts_and_responses[context_len][0]["prompt"],
                 sampling_params,
                 _create_lora_request(lora_id, long_context_infos)))
        return batch

    # Create results in order of long_context_infos
    batched_results = batched_generate(lora_llm,
                                       _build_batch(range(num_loras)))

    # Create results in random order of permutation
    permutation = np.random.default_rng(seed=42).permutation(num_loras)
    permutated_batched_results = batched_generate(lora_llm,
                                                  _build_batch(permutation))

    # Results should be the same. Position ``j`` of the permuted batch was
    # built from original index ``permutation[j]``, so that is the pair to
    # compare. Indexing the permuted results with ``permutation[i]`` is only
    # correct when the permutation happens to be its own inverse.
    for position, original_index in enumerate(permutation):
        assert batched_results[original_index] == permutated_batched_results[
            position], (
                f"Results should be the same:\n{batched_results[original_index]}"
                f"\n{permutated_batched_results[position]}")
@pytest.mark.skip_global_cleanup
def test_quality(lora_llm, long_context_infos):
    """We test the quality of the answers given by the LoRA model by
    comparing the generated text to the merged model's outputs.

    This is effectively a mini-benchmark over four prompts.
    If this test fails, this indicates that the quality of the LoRA model
    is suboptimal compared to the merged model. For example, if the model
    does not output valid dictionaries, this test will fail.

    If needed for testing, the merged versions of the models are available
    as part of the `conftest`.

    The test is expected to run for about 1 minute on a p4de.24xlarge
    instance.
    """
    scores: list[float] = []
    for lora_id, info in long_context_infos.items():
        examples = prompts_and_responses[info["context_length"]]
        for example in examples:
            request = _create_lora_request(lora_id, long_context_infos)
            response = generate(
                lora_llm, (example["prompt"], sampling_params, request))
            golden_answer = example["golden_answer"]
            score = evaluate_json_response(response, golden_answer)
            scores.append(score)
            # Each individual answer must clear a minimum bar...
            assert score > 0.3, ("Quality of the answer is not good enough. "
                                 f"Expected {golden_answer}, got {response}")
    # ...and the average over all answers a higher one.
    assert np.mean(scores) > 0.5
@pytest.mark.skip_global_cleanup
def test_max_len(lora_llm, long_context_infos):
    """Test that we raise an ValueError when the input of a given LoRA
    model exceeds the maximum length."""
    # Since each LoRA model has a different maximum length, we need to
    # test each one separately
    for lora_id, info in long_context_infos.items():
        lora_request = _create_lora_request(lora_id, long_context_infos)
        good_prompt = prompts_and_responses[
            info["context_length"]][0]["prompt"]

        # Good prompt should be fine
        generate(lora_llm, (good_prompt, sampling_params, lora_request))

        # Bad prompt (the good one doubled) should raise an error
        with pytest.raises(ValueError):
            generate(lora_llm,
                     (good_prompt * 2, sampling_params, lora_request))

    # Also test batched: for every LoRA in turn, add a full round of prompts
    # in which only that LoRA's prompt is doubled.
    batched_prompts: list[tuple[str, SamplingParams,
                                Optional[LoRARequest]]] = []
    for lora_id_with_bad_inputs in long_context_infos:
        for lora_id, info in long_context_infos.items():
            repeat = 2 if lora_id == lora_id_with_bad_inputs else 1
            prompt = prompts_and_responses[
                info["context_length"]][0]["prompt"] * repeat
            batched_prompts.append(
                (prompt, sampling_params,
                 _create_lora_request(lora_id, long_context_infos)))

    # Turn good prompt into bad prompt inside of batched prompts
    with pytest.raises(ValueError):
        batched_generate(lora_llm, batched_prompts)
# SPDX-License-Identifier: Apache-2.0
from typing import List
import os
import pytest
import vllm
from vllm.lora.request import LoRARequest
from ..utils import models_path_prefix
# Path of the base model: `models_path_prefix` joined with the model id.
MODEL_PATH = os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-base")
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
    """Generate completions for two fixed text-to-SQL prompts.

    Sampling uses ``temperature=0``, ``max_tokens=256`` and stops at
    ``"[/assistant]"``. A ``LoRARequest`` is attached only when ``lora_id``
    is truthy. Returns the raw (unstripped) text of the first completion of
    each output.
    """
    prompts = [
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]",  # noqa: E501
        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]"  # noqa: E501
    ]
    sampling_params = vllm.SamplingParams(temperature=0,
                                          max_tokens=256,
                                          stop=["[/assistant]"])

    lora_request = None
    if lora_id:
        lora_request = LoRARequest(str(lora_id), lora_id, lora_path)
    request_outputs = llm.generate(prompts,
                                   sampling_params,
                                   lora_request=lora_request)

    return [
        request_output.outputs[0].text for request_output in request_outputs
    ]
@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    """Autouse shim that requests ``run_with_both_engines_lora`` for each test.

    It only exists to pull that fixture into every test of this module; it
    could be promoted to conftest.py to cover a whole package instead.
    """
    return None
# Skipping for V1 for now as we are hitting,
# "Head size 80 is not supported by FlashAttention." error.
@pytest.mark.skip_v1
@pytest.mark.parametrize("lora_bias", [True])
@pytest.mark.parametrize("fully_sharded", [True, False])
def test_lora_bias(lora_bias_files: str, lora_bias: bool, fully_sharded: bool):
    """Compare generation without a LoRA (id 0) and with one (id 1).

    With ``lora_bias`` enabled the two outputs must differ; otherwise they
    must be equal.
    """
    llm = vllm.LLM(
        MODEL_PATH,
        enable_lora=True,
        max_num_seqs=16,
        max_lora_rank=8,
        max_loras=1,
        enable_lora_bias=lora_bias,
        tensor_parallel_size=1,
        fully_sharded_loras=fully_sharded,
    )

    print("lora adapter created")
    # lora_id=0 is falsy, so do_sample sends no LoRA request for this run.
    base_output = do_sample(llm, lora_bias_files, lora_id=0)

    print("lora")
    lora_output = do_sample(llm, lora_bias_files, lora_id=1)

    if not lora_bias:
        assert base_output == lora_output
    else:
        assert base_output != lora_output
from typing import List
import os
import pytest
import vllm
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform
from ..utils import models_path_prefix
# Path of the base model: `models_path_prefix` joined with the model id.
MODEL_PATH = os.path.join(models_path_prefix, "openbmb/MiniCPM-Llama3-V-2_5")
# Single-turn chat prompt with one image placeholder; used for every image.
PROMPT_TEMPLATE = (
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
"(<image>./</image>)\nWhat is in the image?<|eot_id|>"
"<|start_header_id|>assistant<|end_header_id|>\n\n")
# Images fed to the model, in the same order as EXPECTED_OUTPUT.
IMAGE_ASSETS = [
ImageAsset("stop_sign"),
ImageAsset("cherry_blossom"),
]
# After fine-tuning with LoRA, all generated content should begin with `A`.
EXPECTED_OUTPUT = [
"A red and white stop sign with a Chinese archway in the background featuring red lanterns and gold accents.", # noqa: E501
"A pink cherry blossom tree with a blue sky in the background.",
]
def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
    """Describe every image in ``IMAGE_ASSETS`` and return the stripped texts.

    Sampling uses ``temperature=0`` and at most 5 new tokens. A
    ``LoRARequest`` is attached only when ``lora_id`` is truthy.
    """
    sampling_params = vllm.SamplingParams(
        temperature=0,
        max_tokens=5,
        stop_token_ids=[128001, 128009],  # eos_id, eot_id
    )

    # One multimodal request per image, all sharing the same text prompt.
    inputs = []
    for asset in IMAGE_ASSETS:
        inputs.append({
            "prompt": PROMPT_TEMPLATE,
            "multi_modal_data": {
                "image": asset.pil_image
            },
        })

    lora_request = None
    if lora_id:
        lora_request = LoRARequest(str(lora_id), lora_id, lora_path)
    request_outputs = llm.generate(
        inputs,
        sampling_params,
        lora_request=lora_request,
    )

    # Echo every prompt/completion pair so failures are easy to inspect.
    generated_texts: List[str] = []
    for request_output in request_outputs:
        prompt = request_output.prompt
        text = request_output.outputs[0].text.strip()
        generated_texts.append(text)
        print(f"Prompt: {prompt!r}, Generated text: {text!r}")
    return generated_texts
@pytest.mark.xfail(
    current_platform.is_rocm(),
    reason="MiniCPM-V dependency xformers incompatible with ROCm")
def test_minicpmv_lora(minicpmv_lora_files):
    """Check LoRA ids 1 and 2 against ``EXPECTED_OUTPUT``.

    Generation is capped at a few tokens in ``do_sample``, so the check is
    that each expected sentence starts with the generated text, not the
    other way round.
    """
    llm = vllm.LLM(
        MODEL_PATH,
        max_num_seqs=2,
        enable_lora=True,
        max_loras=4,
        max_lora_rank=64,
        trust_remote_code=True,
        enable_chunked_prefill=True,
    )

    for lora_id in (1, 2):
        generated = do_sample(llm, minicpmv_lora_files, lora_id=lora_id)
        for i, expected in enumerate(EXPECTED_OUTPUT):
            assert expected.startswith(generated[i])
......@@ -57,9 +57,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
return generated_texts
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="MiniCPM-V dependency xformers incompatible with ROCm")
# @pytest.mark.xfail(
# current_platform.is_rocm(),
# reason="MiniCPM-V dependency xformers incompatible with ROCm")
def test_minicpmv_lora(minicpmv_lora_files):
llm = vllm.LLM(
MODEL_PATH,
......@@ -85,9 +85,9 @@ def test_minicpmv_lora(minicpmv_lora_files):
@pytest.mark.skipif(current_platform.is_cuda_alike(),
reason="Skipping to avoid redundant model tests")
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="MiniCPM-V dependency xformers incompatible with ROCm")
# @pytest.mark.xfail(
# current_platform.is_rocm(),
# reason="MiniCPM-V dependency xformers incompatible with ROCm")
@create_new_process_for_each_test()
def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
llm = vllm.LLM(
......@@ -110,9 +110,9 @@ def test_minicpmv_tp4_wo_fully_sharded_loras(minicpmv_lora_files):
@pytest.mark.skipif(current_platform.is_cuda_alike(),
reason="Skipping to avoid redundant model tests")
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="MiniCPM-V dependency xformers incompatible with ROCm")
# @pytest.mark.xfail(
# current_platform.is_rocm(),
# reason="MiniCPM-V dependency xformers incompatible with ROCm")
@create_new_process_for_each_test()
def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
llm = vllm.LLM(
......@@ -134,4 +134,4 @@ def test_minicpmv_tp4_fully_sharded_loras(minicpmv_lora_files):
assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
output_tp = do_sample(llm, minicpmv_lora_files, lora_id=2)
for i in range(len(EXPECTED_OUTPUT)):
assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
assert EXPECTED_OUTPUT[i].startswith(output_tp[i])
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
"""
This script is mainly used to test whether triton kernels can run normally
under different conditions, including various batches, numbers of LoRA , and
maximum ranks.
"""
from threading import Lock
import pytest
import torch
# Enable custom op register
import vllm.lora.ops.triton_ops # noqa: F401
from vllm.lora.ops.torch_ops import (bgmv_expand, bgmv_expand_slice,
bgmv_shrink, sgmv_expand,
sgmv_expand_slice, sgmv_shrink)
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.platforms import current_platform
from .utils import (assert_close, generate_data,
generate_data_for_expand_nslices,
generate_data_for_nslices)
# Parameter grids consumed by the pytest.mark.parametrize decorators in this
# module.
HIDDEN_SIZES = [1024] # [2049]
BATCHES = [1, 4, 16, 32]
NUM_LORA = [1, 8, 32, 128]
DTYPES = [torch.float16, torch.bfloat16]
MAX_RANKS = [1, 4, 8, 16, 32, 64, 128, 256]
SCALES = [0.5]
SEED = [0]
# Evaluates to ["cuda:0"].
DEVICES = [f"cuda:{0}"]
# Held while the _LORA_A_PTR_DICT / _LORA_B_PTR_DICT caches are cleared in
# test_punica_sgmv.
_dict_lock = Lock()
@pytest.mark.parametrize("batches", BATCHES)
@pytest.mark.parametrize("num_loras", NUM_LORA)
@pytest.mark.parametrize("rank", MAX_RANKS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("scaling", SCALES)
@pytest.mark.parametrize("nslices", [1, 2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", DEVICES)
def test_punica_sgmv(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    scaling: float,
    nslices: int,
    dtype: torch.dtype,
    op_type: str,
    seed: int,
    device: str,
):
    """Compare the ``torch.ops.vllm`` sgmv shrink/expand ops against the
    reference ops from ``vllm.lora.ops.torch_ops`` on generated data.
    """
    torch.set_default_device(device)
    current_platform.seed_everything(seed)

    seq_length = 128
    generated = generate_data_for_nslices(
        batches,
        hidden_size,
        num_loras,
        rank,
        seq_length,
        nslices,
        dtype,
        op_type,
        device,
    )
    (inputs_tensor, lora_weights_lst, our_out_tensor, ref_out_tensor,
     b_seq_start_loc, lora_indices_tensor, seq_len_tensor,
     indices) = generated

    longest = seq_len_tensor.max()
    token_nums = seq_len_tensor.sum().item()
    if isinstance(longest, tuple):
        longest = longest[0]
    max_seq_length = longest.item()

    # Positional arguments shared, in this order, by every sgmv call below.
    seq_args = (
        b_seq_start_loc,
        seq_len_tensor,
        lora_indices_tensor,
        batches,
        max_seq_length,
        token_nums,
    )

    if op_type == "shrink":
        # Preventing cache error pointer.
        with _dict_lock:
            _LORA_A_PTR_DICT.clear()
            torch.ops.vllm.sgmv_shrink(
                inputs_tensor,
                lora_weights_lst,
                our_out_tensor,
                *seq_args,
                scaling,
            )
        # The reference op handles one slice at a time.
        for index in range(nslices):
            sgmv_shrink(
                inputs_tensor,
                lora_weights_lst[index],
                ref_out_tensor[index],
                *seq_args,
                scaling,
            )
    else:
        with _dict_lock:
            _LORA_B_PTR_DICT.clear()
            torch.ops.vllm.sgmv_expand(
                inputs_tensor,
                lora_weights_lst,
                our_out_tensor,
                *seq_args,
                offset_start=0,
                add_inputs=True,
            )
        if nslices == 1:
            # Verify the torch's sgmv_expand op
            sgmv_expand(
                inputs_tensor[0],
                lora_weights_lst[0],
                ref_out_tensor,
                *seq_args,
                add_inputs=True,
            )
        else:
            # Slice ``index`` is written at offset ``index * hidden_size``.
            for index in range(nslices):
                sgmv_expand_slice(
                    inputs_tensor[index],
                    lora_weights_lst[index],
                    ref_out_tensor,
                    *seq_args,
                    index * hidden_size,
                    hidden_size,
                    add_inputs=True,
                )

    assert_close(our_out_tensor, ref_out_tensor)
@pytest.mark.parametrize("batches", BATCHES)
@pytest.mark.parametrize("num_loras", NUM_LORA)
@pytest.mark.parametrize("rank", MAX_RANKS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("scaling", SCALES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("op_type", ["shrink", "expand"])
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", DEVICES)
def test_punica_bgmv(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    scaling: float,
    dtype: torch.dtype,
    op_type: str,
    seed: int,
    device: str,
):
    """Compare the ``torch.ops.vllm`` bgmv shrink/expand ops against the
    reference ops from ``vllm.lora.ops.torch_ops`` on generated data.
    """
    torch.set_default_device(device)
    current_platform.seed_everything(seed)

    seq_length = 1
    generated = generate_data(
        batches,
        hidden_size,
        num_loras,
        rank,
        seq_length,
        dtype,
        op_type,
        device,
    )
    (inputs_tensor, lora_weights, our_out_tensor, ref_out_tensor,
     b_seq_start_loc, lora_indices_tensor, seq_len_tensor,
     indices) = generated

    if op_type == "shrink":
        torch.ops.vllm.bgmv_shrink(inputs_tensor, lora_weights,
                                   our_out_tensor, indices, scaling)
        bgmv_shrink(inputs_tensor, lora_weights, ref_out_tensor, indices,
                    scaling)
        # The shrink reference output is cast to float32 before comparing.
        ref_out_tensor = ref_out_tensor.to(torch.float32)
    else:
        torch.ops.vllm.bgmv_expand(inputs_tensor,
                                   lora_weights,
                                   our_out_tensor,
                                   indices,
                                   add_inputs=True)
        bgmv_expand(inputs_tensor,
                    lora_weights,
                    ref_out_tensor,
                    indices,
                    add_inputs=True)

    assert_close(our_out_tensor, ref_out_tensor)
@pytest.mark.parametrize("batches", BATCHES)
@pytest.mark.parametrize("num_loras", NUM_LORA)
@pytest.mark.parametrize("rank", MAX_RANKS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
@pytest.mark.parametrize("nslices", [2, 3])
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEED)
@pytest.mark.parametrize("device", DEVICES)
def test_punica_bgmv_expand_nslices(
    batches: int,
    num_loras: int,
    rank: int,
    hidden_size: int,
    nslices: int,
    dtype: torch.dtype,
    seed: int,
    device: str,
):
    """Compare ``torch.ops.vllm.bgmv_expand_slice`` against the reference
    ``bgmv_expand_slice`` slice by slice on generated data.
    """
    torch.set_default_device(device)
    current_platform.seed_everything(seed)

    seq_length = 1
    generated = generate_data_for_expand_nslices(
        batches,
        hidden_size,
        num_loras,
        rank,
        seq_length,
        dtype,
        nslices,
        device,
    )
    (inputs_tensor, lora_weights_lst, our_outputs, ref_outputs,
     b_seq_start_loc, lora_indices_tensor, seq_len_tensor,
     indices) = generated

    for index in range(nslices):
        lora_weights = lora_weights_lst[index]
        # Slice ``index`` is written at offset ``index * hidden_size``.
        slice_offset = index * hidden_size

        # Run the op under test and the reference with identical arguments,
        # each writing into its own output buffer.
        for expand_slice, out_tensor in (
            (torch.ops.vllm.bgmv_expand_slice, our_outputs),
            (bgmv_expand_slice, ref_outputs),
        ):
            expand_slice(
                inputs_tensor,
                lora_weights,
                out_tensor,
                indices,
                slice_offset,
                slice_size=hidden_size,
                add_inputs=True,
            )

        assert_close(our_outputs, ref_outputs)
......@@ -114,9 +114,9 @@ QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="Qwen2-VL dependency xformers incompatible with ROCm")
# @pytest.mark.xfail(
# current_platform.is_rocm(),
# reason="Qwen2-VL dependency xformers incompatible with ROCm")
def test_qwen2vl_lora(qwen2vl_lora_files):
"""Test Qwen 2.0 VL model with LoRA"""
config = TestConfig(model_path=QWEN2VL_MODEL_PATH,
......@@ -130,10 +130,10 @@ def test_qwen2vl_lora(qwen2vl_lora_files):
lora_id=lora_id)
@pytest.mark.xfail(
current_platform.is_rocm(),
reason="Qwen2.5-VL dependency xformers incompatible with ROCm",
)
# @pytest.mark.xfail(
# current_platform.is_rocm(),
# reason="Qwen2.5-VL dependency xformers incompatible with ROCm",
# )
@pytest.mark.skipif(
Version(TRANSFORMERS_VERSION) < Version("4.49.0"),
reason="Qwen2.5-VL require transformers version no lower than 4.49.0",
......@@ -148,4 +148,4 @@ def test_qwen25vl_lora(qwen25vl_lora_files):
for lora_id in [1, 2]:
tester.run_test(TEST_IMAGES,
expected_outputs=EXPECTED_OUTPUTS,
lora_id=lora_id)
lora_id=lora_id)
\ No newline at end of file
......@@ -3,6 +3,7 @@
from collections.abc import Sequence
from typing import Optional
import os
import pytest
from transformers import AutoModelForSpeechSeq2Seq
......@@ -12,6 +13,7 @@ from vllm.sequence import SampleLogprobs
from ....conftest import HfRunner, PromptAudioInput, VllmRunner, _AudioAssets
from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close
from ....utils import models_path_prefix
HF_AUDIO_PROMPT = "<|start_of_role|>system<|end_of_role|>Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant<|end_of_text|>\n<|start_of_role|>user<|end_of_role|><|audio|>can you transcribe the speech into a written format?<|end_of_text|>\n<|start_of_role|>assistant<|end_of_role|>" # noqa: E501
......@@ -27,7 +29,7 @@ def vllm_to_hf_output(
return output_ids, hf_output_str, out_logprobs
MODEL_NAME = "ibm-granite/granite-speech-3.3-8b"
MODEL_NAME = os.path.join(models_path_prefix, "ibm-granite/granite-speech-3.3-8b")
# Audio lora co-exists directly in the model directory, but
# currently still needs to be passed directly to vLLM.
audio_lora_path = MODEL_NAME
......@@ -140,4 +142,4 @@ def test_models(hf_runner, vllm_runner, model: str, audio_assets: _AudioAssets,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
)
)
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment