Commit 2ff1c360 authored by zhuwenwen
Browse files

update tests

parent 201768d5
......@@ -37,7 +37,7 @@ from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
identity, is_cpu)
from utils import models_path_prefix
from .utils import models_path_prefix
logger = init_logger(__name__)
......
......@@ -2,6 +2,7 @@ from http import HTTPStatus
import openai
import pytest
import os
import pytest_asyncio
import requests
......
# imports for guided decoding tests
import re
import os
import openai
import pytest
......
......@@ -24,12 +24,15 @@ MAX_MODEL_LEN = 1024
MODELS = [
# act_order==False, group_size=channelwise
(os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq"), "main"),
# (os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq"), "main"),
(os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq")),
# act_order==False, group_size=128
("TheBloke/Llama-2-7B-GPTQ", "main"),
# ("TheBloke/Llama-2-7B-GPTQ", "main"),
("TheBloke/Llama-2-7B-GPTQ"),
# act_order==True, group_size=128
(os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "main"),
# (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "main"),
(os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ")),
# act_order==True, group_size=64
(os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-4bit-64g-actorder_True"),
# act_order==True, group_size=32
......@@ -43,7 +46,8 @@ MODELS = [
(os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-8bit-32g-actorder_True"),
# 4-bit, act_order==True, group_size=128
(os.path.join(models_path_prefix, "TechxGenus/gemma-1.1-2b-it-GPTQ"), "main")
# (os.path.join(models_path_prefix, "TechxGenus/gemma-1.1-2b-it-GPTQ"), "main")
(os.path.join(models_path_prefix, "TechxGenus/gemma-1.1-2b-it-GPTQ"))
]
......
......@@ -12,6 +12,7 @@ import torch
from tests.quantization.utils import is_quant_method_supported
from ..utils import fork_new_process_for_each_test, models_path_prefix
from vllm.utils import is_hip
models_4bit_to_test = [
......@@ -30,7 +31,7 @@ models_pre_quant_8bit_to_test = [
]
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes") or is_hip(),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@fork_new_process_for_each_test
......@@ -42,7 +43,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, hf_model_kwargs)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes") or is_hip(),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
models_pre_qaunt_4bit_to_test)
......@@ -54,7 +55,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes") or is_hip(),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
models_pre_quant_8bit_to_test)
......@@ -68,7 +69,7 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
@pytest.mark.skipif(torch.cuda.device_count() < 2,
reason='Test requires at least 2 GPUs.')
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes") or is_hip(),
reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@fork_new_process_for_each_test
......
......@@ -7,9 +7,10 @@ import os
from tests.quantization.utils import is_quant_method_supported
from ..utils import compare_two_settings, models_path_prefix
from vllm.utils import is_hip
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
@pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
reason="fp8 is not supported on this GPU type.")
def test_cpu_offload_fp8():
# Test quantization of an unquantized checkpoint
......@@ -23,7 +24,7 @@ def test_cpu_offload_fp8():
max_wait_seconds=480)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or is_hip(),
reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_gptq():
# Test GPTQ Marlin
......@@ -37,7 +38,7 @@ def test_cpu_offload_gptq():
max_wait_seconds=480)
@pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
@pytest.mark.skipif(not is_quant_method_supported("awq_marlin") or is_hip(),
reason="awq_marlin is not supported on this GPU type.")
def test_cpu_offload_awq():
# Test AWQ Marlin
......@@ -51,7 +52,7 @@ def test_cpu_offload_awq():
max_wait_seconds=480)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or is_hip(),
reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_compressed_tensors():
# Test wNa16
......
......@@ -7,11 +7,12 @@ import os
from tests.quantization.utils import is_quant_method_supported
from ..utils import models_path_prefix
from vllm.utils import is_hip
MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-random")]
@pytest.mark.skipif(not is_quant_method_supported("experts_int8"),
@pytest.mark.skipif(not is_quant_method_supported("experts_int8") or is_hip(),
reason="ExpertsInt8 is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
......
......@@ -12,6 +12,7 @@ from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod,
Fp8LinearMethod)
from vllm.platforms import current_platform
from ..utils import models_path_prefix
from vllm.utils import is_hip
MODELS = [
os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"),
......@@ -20,7 +21,7 @@ MODELS = [
]
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
@pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_id", MODELS)
@pytest.mark.parametrize("force_marlin", [False, True])
......@@ -45,7 +46,7 @@ KV_CACHE_MODELS = [
]
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
@pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_id", KV_CACHE_MODELS)
def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
......@@ -66,7 +67,7 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
print(outputs[0][1])
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
@pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
@pytest.mark.parametrize("force_marlin", [False, True])
......@@ -97,7 +98,7 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
assert fc1.weight.dtype == torch.int32
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
@pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_scaled_fp8_quant(dtype) -> None:
......
......@@ -20,8 +20,8 @@ PROMPT = "On the surface of Mars, we found"
MODELS_QUANT = [(
os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse"),
True), (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), False),
(os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), False)]
True), (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), False)]
# (os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), False)]
@pytest.mark.parametrize("model_lm_head_quant", MODELS_QUANT)
......
......@@ -7,6 +7,7 @@ import torch.nn.functional as F
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from vllm.model_executor.utils import set_random_seed
from vllm.utils import is_hip
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
......@@ -43,8 +44,7 @@ def mock_causal_accepted_tensor(
"which_tokens_accepted",
["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"])
@pytest.mark.parametrize("device", CUDA_DEVICES)
# @pytest.mark.parametrize("use_flashinfer", [True, False])
@pytest.mark.parametrize("use_flashinfer", [False])
@pytest.mark.parametrize("use_flashinfer", [True, False] if not is_hip() else [False])
@torch.inference_mode()
def test_correct_output_format(which_tokens_accepted: str, seed: int,
device: str, use_flashinfer: bool):
......@@ -128,8 +128,7 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
@pytest.mark.parametrize("vocab_size", [30_000, 50_000])
@pytest.mark.parametrize("batch_size", list(range(1, 32)))
@pytest.mark.parametrize("device", CUDA_DEVICES)
# @pytest.mark.parametrize("use_flashinfer", [True, False])
@pytest.mark.parametrize("use_flashinfer", [False])
@pytest.mark.parametrize("use_flashinfer", [True, False] if not is_hip() else [False])
@torch.inference_mode()
def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
device: str, use_flashinfer: bool):
......@@ -161,8 +160,7 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
@pytest.mark.parametrize("batch_size", [1, 8, 32, 128])
@pytest.mark.parametrize("n_rep", [100])
@pytest.mark.parametrize("device", CUDA_DEVICES)
# @pytest.mark.parametrize("use_flashinfer", [True, False])
@pytest.mark.parametrize("use_flashinfer", [False])
@pytest.mark.parametrize("use_flashinfer", [True, False] if not is_hip() else [False])
@torch.inference_mode()
def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
frac_seeded: float, n_rep: int, device: str,
......@@ -240,8 +238,7 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int,
for i in range(batch_size)
}
# for use_flashinfer in [True, False]:
for use_flashinfer in [False]:
for use_flashinfer in [True, False] if not is_hip() else [False]:
rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer)
rejection_sampler.init_gpu_tensors(device=device)
# We use seeded sequences to ensure the same tokens are accepted
......@@ -262,8 +259,7 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int,
@pytest.mark.parametrize("which_token_ids",
["bonus_token_ids", "draft_token_ids"])
@pytest.mark.parametrize("device", CUDA_DEVICES)
# @pytest.mark.parametrize("use_flashinfer", [True, False])
@pytest.mark.parametrize("use_flashinfer", [False])
@pytest.mark.parametrize("use_flashinfer", [True, False] if not is_hip() else [False])
@torch.inference_mode()
def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
which_token_ids: str, device: str,
......@@ -315,8 +311,7 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
@pytest.mark.parametrize("draft_and_target_probs_equal", [True, False])
@pytest.mark.parametrize("seed", list(range(5)))
# @pytest.mark.parametrize("use_flashinfer", [True, False])
@pytest.mark.parametrize("use_flashinfer", [False])
@pytest.mark.parametrize("use_flashinfer", [True, False] if not is_hip() else [False])
@torch.inference_mode()
def test_rejection_sampling_approximates_target_distribution(
seed: int, draft_and_target_probs_equal: bool, use_flashinfer: bool):
......
......@@ -89,7 +89,7 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [
[(os.path.join(models_path_prefix, "JackFram/llama-68m"), [
"--speculative-model",
os.path.join(models_path_prefix, "JackFram/llama-68m"),
"--num_speculative-tokens",
......@@ -97,9 +97,11 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
"--speculative-draft-tensor-parallel-size",
"1",
]),
(os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"), [
# (os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"), [
(os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct-2k"), [
"--speculative-model",
os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"),
# os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"),
os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct-2k"),
"--num_speculative-tokens",
"5",
"--speculative-draft-tensor-parallel-size",
......
......@@ -261,7 +261,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
},
{
"model_name": os.path.join(models_path_prefix, "JackFram/llama-160m")",
"model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
},
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......
......@@ -10,7 +10,7 @@ import pytest
import torch
from tensorizer import EncryptionParams
from vllm import SamplingParams
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.engine.arg_utils import EngineArgs
# yapf: disable
from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
......@@ -25,6 +25,9 @@ from ..conftest import VllmRunner
from ..utils import RemoteOpenAIServer, models_path_prefix
from .conftest import retry_until_skip
from typing import List, Optional, Tuple
from vllm.lora.request import LoRARequest
# yapf conflicts with isort for this docstring
......@@ -155,11 +158,92 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
assert outputs == deserialized_outputs
def create_test_prompts(
        lora_path: str
) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
    """Build the fixed list of (prompt, sampling params, LoRA request) cases.

    The returned list has six entries, in this order:

    * two plain prompts whose LoRA request is ``None`` (base model);
    * four SQL-generation prompts, each with a ``LoRARequest`` that points
      at ``lora_path``. Entries 3, 4 and 6 use adapter ``("sql-lora", 1)``
      and entry 5 uses ``("sql-lora2", 2)``; both adapter names load from
      the same ``lora_path``.

    NOTE(review): the example this was copied from expects the engine to be
    created with ``max_loras=1`` so that requests for the second adapter
    wait for the first adapter's requests -- confirm against the caller.

    Args:
        lora_path: Path handed to every ``LoRARequest`` as its third
            positional argument.

    Returns:
        A new list of ``(prompt, SamplingParams, LoRARequest | None)``
        tuples. Every ``SamplingParams`` and ``LoRARequest`` in it is a
        freshly constructed object; none is shared between entries.
    """
    icao_prompt = "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]"  # noqa: E501
    nationality_prompt = "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]"  # noqa: E501

    def greedy_with_logprobs() -> SamplingParams:
        # Fresh object (and fresh stop-token list) on every call.
        return SamplingParams(temperature=0.0,
                              logprobs=1,
                              prompt_logprobs=1,
                              max_tokens=128,
                              stop_token_ids=[32003])

    def beam_search_of_three() -> SamplingParams:
        # Fresh object (and fresh stop-token list) on every call.
        return SamplingParams(n=3,
                              best_of=3,
                              use_beam_search=True,
                              temperature=0,
                              max_tokens=128,
                              stop_token_ids=[32003])

    # Base-model requests: no adapter attached.
    cases = [
        ("A robot may not injure a human being",
         SamplingParams(temperature=0.0,
                        logprobs=1,
                        prompt_logprobs=1,
                        max_tokens=128), None),
        ("To be or not to be,",
         SamplingParams(temperature=0.8,
                        top_k=5,
                        presence_penalty=0.2,
                        max_tokens=128), None),
    ]

    # (prompt, params factory, adapter name, adapter id) for the LoRA requests.
    lora_plan = (
        (icao_prompt, greedy_with_logprobs, "sql-lora", 1),
        (nationality_prompt, beam_search_of_three, "sql-lora", 1),
        (icao_prompt, greedy_with_logprobs, "sql-lora2", 2),
        (nationality_prompt, beam_search_of_three, "sql-lora", 1),
    )
    for prompt, make_params, adapter_name, adapter_id in lora_plan:
        cases.append((prompt, make_params(),
                      LoRARequest(adapter_name, adapter_id, lora_path)))

    return cases
def process_requests(engine: LLMEngine,
                     test_prompts: List[Tuple[str, SamplingParams,
                                              Optional[LoRARequest]]]):
    """Feed prompts to the engine one per step and print finished outputs.

    Each loop iteration submits at most one pending prompt (taken from the
    front of ``test_prompts``), then calls ``engine.step()`` once and prints
    every returned output whose ``finished`` attribute is truthy. The loop
    ends when ``test_prompts`` is empty and the engine reports no unfinished
    requests.

    Args:
        engine: Object providing ``add_request``, ``step`` and
            ``has_unfinished_requests``.
        test_prompts: Pending ``(prompt, sampling_params, lora_request)``
            tuples. This list is consumed in place: it is empty when the
            function returns.
    """
    next_request_id = 0
    while True:
        # Only ask the engine about unfinished work once nothing is queued.
        if not test_prompts and not engine.has_unfinished_requests():
            break

        if test_prompts:
            prompt, sampling_params, lora_request = test_prompts.pop(0)
            engine.add_request(str(next_request_id),
                               prompt,
                               sampling_params,
                               lora_request=lora_request)
            next_request_id += 1

        step_outputs: List[RequestOutput] = engine.step()
        for output in step_outputs:
            if not output.finished:
                continue
            print(output)
def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
# from huggingface_hub import snapshot_download
from examples.multilora_inference import (create_test_prompts,
process_requests)
# from examples.multilora_inference import (create_test_prompts,
# process_requests)
model_ref = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
# lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
......
......@@ -14,7 +14,8 @@ TOKENIZER_NAMES = [
@pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES)
def test_tokenizer_revision(tokenizer_name: str):
# Assume that "main" branch always exists
tokenizer = get_tokenizer(tokenizer_name, revision="main")
# tokenizer = get_tokenizer(tokenizer_name, revision="main")
tokenizer = get_tokenizer(tokenizer_name)
assert isinstance(tokenizer, PreTrainedTokenizerBase)
# Assume that "never" branch always does not exist
......
......@@ -15,7 +15,7 @@ def test_weight_loading(vllm_runner):
Test parameter weight loading with tp>1.
"""
with vllm_runner(model_name=MODEL_NAME,
revision=REVISION,
# revision=REVISION,
dtype=torch.half if QUANTIZATION == "gptq" else "auto",
quantization=QUANTIZATION,
max_model_len=MAX_MODEL_LEN,
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment