Commit 2ff1c360 authored by zhuwenwen
Browse files

update tests

parent 201768d5
...@@ -37,7 +37,7 @@ from vllm.logger import init_logger ...@@ -37,7 +37,7 @@ from vllm.logger import init_logger
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless, from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
identity, is_cpu) identity, is_cpu)
from utils import models_path_prefix from .utils import models_path_prefix
logger = init_logger(__name__) logger = init_logger(__name__)
......
...@@ -2,6 +2,7 @@ from http import HTTPStatus ...@@ -2,6 +2,7 @@ from http import HTTPStatus
import openai import openai
import pytest import pytest
import os
import pytest_asyncio import pytest_asyncio
import requests import requests
......
# imports for guided decoding tests # imports for guided decoding tests
import re import re
import os
import openai import openai
import pytest import pytest
......
...@@ -24,12 +24,15 @@ MAX_MODEL_LEN = 1024 ...@@ -24,12 +24,15 @@ MAX_MODEL_LEN = 1024
MODELS = [ MODELS = [
# act_order==False, group_size=channelwise # act_order==False, group_size=channelwise
(os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq"), "main"), # (os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq"), "main"),
(os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq")),
# act_order==False, group_size=128 # act_order==False, group_size=128
("TheBloke/Llama-2-7B-GPTQ", "main"), # ("TheBloke/Llama-2-7B-GPTQ", "main"),
("TheBloke/Llama-2-7B-GPTQ"),
# act_order==True, group_size=128 # act_order==True, group_size=128
(os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "main"), # (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "main"),
(os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ")),
# act_order==True, group_size=64 # act_order==True, group_size=64
(os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-4bit-64g-actorder_True"), (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-4bit-64g-actorder_True"),
# act_order==True, group_size=32 # act_order==True, group_size=32
...@@ -43,7 +46,8 @@ MODELS = [ ...@@ -43,7 +46,8 @@ MODELS = [
(os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-8bit-32g-actorder_True"), (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-8bit-32g-actorder_True"),
# 4-bit, act_order==True, group_size=128 # 4-bit, act_order==True, group_size=128
(os.path.join(models_path_prefix, "TechxGenus/gemma-1.1-2b-it-GPTQ"), "main") # (os.path.join(models_path_prefix, "TechxGenus/gemma-1.1-2b-it-GPTQ"), "main")
(os.path.join(models_path_prefix, "TechxGenus/gemma-1.1-2b-it-GPTQ"))
] ]
......
...@@ -12,6 +12,7 @@ import torch ...@@ -12,6 +12,7 @@ import torch
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from ..utils import fork_new_process_for_each_test, models_path_prefix from ..utils import fork_new_process_for_each_test, models_path_prefix
from vllm.utils import is_hip
models_4bit_to_test = [ models_4bit_to_test = [
...@@ -30,7 +31,7 @@ models_pre_quant_8bit_to_test = [ ...@@ -30,7 +31,7 @@ models_pre_quant_8bit_to_test = [
] ]
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes") or is_hip(),
reason='bitsandbytes is not supported on this GPU type.') reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test) @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@fork_new_process_for_each_test @fork_new_process_for_each_test
...@@ -42,7 +43,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts, ...@@ -42,7 +43,7 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name, hf_model_kwargs) model_name, hf_model_kwargs)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes") or is_hip(),
reason='bitsandbytes is not supported on this GPU type.') reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", @pytest.mark.parametrize("model_name, description",
models_pre_qaunt_4bit_to_test) models_pre_qaunt_4bit_to_test)
...@@ -54,7 +55,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts, ...@@ -54,7 +55,7 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
model_name) model_name)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes") or is_hip(),
reason='bitsandbytes is not supported on this GPU type.') reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", @pytest.mark.parametrize("model_name, description",
models_pre_quant_8bit_to_test) models_pre_quant_8bit_to_test)
...@@ -68,7 +69,7 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts, ...@@ -68,7 +69,7 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
@pytest.mark.skipif(torch.cuda.device_count() < 2, @pytest.mark.skipif(torch.cuda.device_count() < 2,
reason='Test requires at least 2 GPUs.') reason='Test requires at least 2 GPUs.')
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes") or is_hip(),
reason='bitsandbytes is not supported on this GPU type.') reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description", models_4bit_to_test) @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
@fork_new_process_for_each_test @fork_new_process_for_each_test
......
...@@ -7,9 +7,10 @@ import os ...@@ -7,9 +7,10 @@ import os
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from ..utils import compare_two_settings, models_path_prefix from ..utils import compare_two_settings, models_path_prefix
from vllm.utils import is_hip
@pytest.mark.skipif(not is_quant_method_supported("fp8"), @pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
reason="fp8 is not supported on this GPU type.") reason="fp8 is not supported on this GPU type.")
def test_cpu_offload_fp8(): def test_cpu_offload_fp8():
# Test quantization of an unquantized checkpoint # Test quantization of an unquantized checkpoint
...@@ -23,7 +24,7 @@ def test_cpu_offload_fp8(): ...@@ -23,7 +24,7 @@ def test_cpu_offload_fp8():
max_wait_seconds=480) max_wait_seconds=480)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or is_hip(),
reason="gptq_marlin is not supported on this GPU type.") reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_gptq(): def test_cpu_offload_gptq():
# Test GPTQ Marlin # Test GPTQ Marlin
...@@ -37,7 +38,7 @@ def test_cpu_offload_gptq(): ...@@ -37,7 +38,7 @@ def test_cpu_offload_gptq():
max_wait_seconds=480) max_wait_seconds=480)
@pytest.mark.skipif(not is_quant_method_supported("awq_marlin"), @pytest.mark.skipif(not is_quant_method_supported("awq_marlin") or is_hip(),
reason="awq_marlin is not supported on this GPU type.") reason="awq_marlin is not supported on this GPU type.")
def test_cpu_offload_awq(): def test_cpu_offload_awq():
# Test AWQ Marlin # Test AWQ Marlin
...@@ -51,7 +52,7 @@ def test_cpu_offload_awq(): ...@@ -51,7 +52,7 @@ def test_cpu_offload_awq():
max_wait_seconds=480) max_wait_seconds=480)
@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or is_hip(),
reason="gptq_marlin is not supported on this GPU type.") reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_compressed_tensors(): def test_cpu_offload_compressed_tensors():
# Test wNa16 # Test wNa16
......
...@@ -7,11 +7,12 @@ import os ...@@ -7,11 +7,12 @@ import os
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from ..utils import models_path_prefix from ..utils import models_path_prefix
from vllm.utils import is_hip
MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-random")] MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-random")]
@pytest.mark.skipif(not is_quant_method_supported("experts_int8"), @pytest.mark.skipif(not is_quant_method_supported("experts_int8") or is_hip(),
reason="ExpertsInt8 is not supported on this GPU type.") reason="ExpertsInt8 is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("dtype", ["bfloat16"])
......
...@@ -12,6 +12,7 @@ from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod, ...@@ -12,6 +12,7 @@ from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod,
Fp8LinearMethod) Fp8LinearMethod)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ..utils import models_path_prefix from ..utils import models_path_prefix
from vllm.utils import is_hip
MODELS = [ MODELS = [
os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"), os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"),
...@@ -20,7 +21,7 @@ MODELS = [ ...@@ -20,7 +21,7 @@ MODELS = [
] ]
@pytest.mark.skipif(not is_quant_method_supported("fp8"), @pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
reason="FP8 is not supported on this GPU type.") reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_id", MODELS) @pytest.mark.parametrize("model_id", MODELS)
@pytest.mark.parametrize("force_marlin", [False, True]) @pytest.mark.parametrize("force_marlin", [False, True])
...@@ -45,7 +46,7 @@ KV_CACHE_MODELS = [ ...@@ -45,7 +46,7 @@ KV_CACHE_MODELS = [
] ]
@pytest.mark.skipif(not is_quant_method_supported("fp8"), @pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
reason="FP8 is not supported on this GPU type.") reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_id", KV_CACHE_MODELS) @pytest.mark.parametrize("model_id", KV_CACHE_MODELS)
def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
...@@ -66,7 +67,7 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str): ...@@ -66,7 +67,7 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str):
print(outputs[0][1]) print(outputs[0][1])
@pytest.mark.skipif(not is_quant_method_supported("fp8"), @pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
reason="FP8 is not supported on this GPU type.") reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
@pytest.mark.parametrize("force_marlin", [False, True]) @pytest.mark.parametrize("force_marlin", [False, True])
...@@ -97,7 +98,7 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, ...@@ -97,7 +98,7 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
assert fc1.weight.dtype == torch.int32 assert fc1.weight.dtype == torch.int32
@pytest.mark.skipif(not is_quant_method_supported("fp8"), @pytest.mark.skipif(not is_quant_method_supported("fp8") or is_hip(),
reason="FP8 is not supported on this GPU type.") reason="FP8 is not supported on this GPU type.")
@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
def test_scaled_fp8_quant(dtype) -> None: def test_scaled_fp8_quant(dtype) -> None:
......
...@@ -20,8 +20,8 @@ PROMPT = "On the surface of Mars, we found" ...@@ -20,8 +20,8 @@ PROMPT = "On the surface of Mars, we found"
MODELS_QUANT = [( MODELS_QUANT = [(
os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse"), os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse"),
True), (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), False), True), (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), False)]
(os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), False)] # (os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), False)]
@pytest.mark.parametrize("model_lm_head_quant", MODELS_QUANT) @pytest.mark.parametrize("model_lm_head_quant", MODELS_QUANT)
......
...@@ -7,6 +7,7 @@ import torch.nn.functional as F ...@@ -7,6 +7,7 @@ import torch.nn.functional as F
from vllm.model_executor.layers.rejection_sampler import RejectionSampler from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from vllm.model_executor.utils import set_random_seed from vllm.model_executor.utils import set_random_seed
from vllm.utils import is_hip
CUDA_DEVICES = [ CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
...@@ -43,8 +44,7 @@ def mock_causal_accepted_tensor( ...@@ -43,8 +44,7 @@ def mock_causal_accepted_tensor(
"which_tokens_accepted", "which_tokens_accepted",
["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"]) ["all_tokens_accepted", "no_tokens_accepted", "some_tokens_accepted"])
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
# @pytest.mark.parametrize("use_flashinfer", [True, False]) @pytest.mark.parametrize("use_flashinfer", [True, False] if not is_hip() else [False])
@pytest.mark.parametrize("use_flashinfer", [False])
@torch.inference_mode() @torch.inference_mode()
def test_correct_output_format(which_tokens_accepted: str, seed: int, def test_correct_output_format(which_tokens_accepted: str, seed: int,
device: str, use_flashinfer: bool): device: str, use_flashinfer: bool):
...@@ -128,8 +128,7 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int, ...@@ -128,8 +128,7 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
@pytest.mark.parametrize("vocab_size", [30_000, 50_000]) @pytest.mark.parametrize("vocab_size", [30_000, 50_000])
@pytest.mark.parametrize("batch_size", list(range(1, 32))) @pytest.mark.parametrize("batch_size", list(range(1, 32)))
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
# @pytest.mark.parametrize("use_flashinfer", [True, False]) @pytest.mark.parametrize("use_flashinfer", [True, False] if not is_hip() else [False])
@pytest.mark.parametrize("use_flashinfer", [False])
@torch.inference_mode() @torch.inference_mode()
def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
device: str, use_flashinfer: bool): device: str, use_flashinfer: bool):
...@@ -161,8 +160,7 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int, ...@@ -161,8 +160,7 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
@pytest.mark.parametrize("batch_size", [1, 8, 32, 128]) @pytest.mark.parametrize("batch_size", [1, 8, 32, 128])
@pytest.mark.parametrize("n_rep", [100]) @pytest.mark.parametrize("n_rep", [100])
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
# @pytest.mark.parametrize("use_flashinfer", [True, False]) @pytest.mark.parametrize("use_flashinfer", [True, False] if not is_hip() else [False])
@pytest.mark.parametrize("use_flashinfer", [False])
@torch.inference_mode() @torch.inference_mode()
def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int, def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
frac_seeded: float, n_rep: int, device: str, frac_seeded: float, n_rep: int, device: str,
...@@ -240,8 +238,7 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int, ...@@ -240,8 +238,7 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int,
for i in range(batch_size) for i in range(batch_size)
} }
# for use_flashinfer in [True, False]: for use_flashinfer in [True, False] if not is_hip() else [False]:
for use_flashinfer in [False]:
rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer)
rejection_sampler.init_gpu_tensors(device=device) rejection_sampler.init_gpu_tensors(device=device)
# We use seeded sequences to ensure the same tokens are accepted # We use seeded sequences to ensure the same tokens are accepted
...@@ -262,8 +259,7 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int, ...@@ -262,8 +259,7 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int,
@pytest.mark.parametrize("which_token_ids", @pytest.mark.parametrize("which_token_ids",
["bonus_token_ids", "draft_token_ids"]) ["bonus_token_ids", "draft_token_ids"])
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
# @pytest.mark.parametrize("use_flashinfer", [True, False]) @pytest.mark.parametrize("use_flashinfer", [True, False] if not is_hip() else [False])
@pytest.mark.parametrize("use_flashinfer", [False])
@torch.inference_mode() @torch.inference_mode()
def test_raises_when_vocab_oob(above_or_below_vocab_range: str, def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
which_token_ids: str, device: str, which_token_ids: str, device: str,
...@@ -315,8 +311,7 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str, ...@@ -315,8 +311,7 @@ def test_raises_when_vocab_oob(above_or_below_vocab_range: str,
@pytest.mark.parametrize("draft_and_target_probs_equal", [True, False]) @pytest.mark.parametrize("draft_and_target_probs_equal", [True, False])
@pytest.mark.parametrize("seed", list(range(5))) @pytest.mark.parametrize("seed", list(range(5)))
# @pytest.mark.parametrize("use_flashinfer", [True, False]) @pytest.mark.parametrize("use_flashinfer", [True, False] if not is_hip() else [False])
@pytest.mark.parametrize("use_flashinfer", [False])
@torch.inference_mode() @torch.inference_mode()
def test_rejection_sampling_approximates_target_distribution( def test_rejection_sampling_approximates_target_distribution(
seed: int, draft_and_target_probs_equal: bool, use_flashinfer: bool): seed: int, draft_and_target_probs_equal: bool, use_flashinfer: bool):
......
...@@ -89,7 +89,7 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs, ...@@ -89,7 +89,7 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
@pytest.mark.parametrize("per_test_common_llm_kwargs", [[]]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [[]])
@pytest.mark.parametrize("baseline_llm_kwargs", [[]]) @pytest.mark.parametrize("baseline_llm_kwargs", [[]])
@pytest.mark.parametrize("model, test_llm_kwargs", @pytest.mark.parametrize("model, test_llm_kwargs",
[("JackFram/llama-68m", [ [(os.path.join(models_path_prefix, "JackFram/llama-68m"), [
"--speculative-model", "--speculative-model",
os.path.join(models_path_prefix, "JackFram/llama-68m"), os.path.join(models_path_prefix, "JackFram/llama-68m"),
"--num_speculative-tokens", "--num_speculative-tokens",
...@@ -97,9 +97,11 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs, ...@@ -97,9 +97,11 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
"--speculative-draft-tensor-parallel-size", "--speculative-draft-tensor-parallel-size",
"1", "1",
]), ]),
(os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"), [ # (os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"), [
(os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct-2k"), [
"--speculative-model", "--speculative-model",
os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"), # os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"),
os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct-2k"),
"--num_speculative-tokens", "--num_speculative-tokens",
"5", "5",
"--speculative-draft-tensor-parallel-size", "--speculative-draft-tensor-parallel-size",
......
...@@ -261,7 +261,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( ...@@ -261,7 +261,7 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
"model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"), "model_name": os.path.join(models_path_prefix, "JackFram/llama-68m"),
}, },
{ {
"model_name": os.path.join(models_path_prefix, "JackFram/llama-160m")", "model_name": os.path.join(models_path_prefix, "JackFram/llama-160m"),
}, },
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}]) @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
......
...@@ -10,7 +10,7 @@ import pytest ...@@ -10,7 +10,7 @@ import pytest
import torch import torch
from tensorizer import EncryptionParams from tensorizer import EncryptionParams
from vllm import SamplingParams from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
# yapf: disable # yapf: disable
from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig, from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,
...@@ -25,6 +25,9 @@ from ..conftest import VllmRunner ...@@ -25,6 +25,9 @@ from ..conftest import VllmRunner
from ..utils import RemoteOpenAIServer, models_path_prefix from ..utils import RemoteOpenAIServer, models_path_prefix
from .conftest import retry_until_skip from .conftest import retry_until_skip
from typing import List, Optional, Tuple
from vllm.lora.request import LoRARequest
# yapf conflicts with isort for this docstring # yapf conflicts with isort for this docstring
...@@ -155,11 +158,92 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, ...@@ -155,11 +158,92 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
assert outputs == deserialized_outputs assert outputs == deserialized_outputs
def create_test_prompts(
    lora_path: str
) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
    """Return the fixed list of (prompt, sampling params, LoRA request) cases.

    The list holds six entries, in this order:

    * two plain requests for the base model (their LoRA slot is ``None``);
    * four SQL-generation requests that carry a ``LoRARequest``. Three use
      the adapter named ``"sql-lora"`` (id 1) and one uses ``"sql-lora2"``
      (id 2). Both adapter identities point at the same ``lora_path``.

    NOTE(review): the upstream example this was copied from pairs the list
    with an engine created with ``max_loras=1``, so that requests for the
    second adapter only run once the first adapter's requests are done.
    That engine setting is not visible in this function - confirm at the
    call site.

    Args:
        lora_path: Location of the LoRA adapter handed to every
            ``LoRARequest`` built here.
    """
    icao_prompt = "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]"  # noqa: E501
    nationality_prompt = "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]"  # noqa: E501

    # Factories rather than shared instances: every entry must own its
    # SamplingParams object (and its stop_token_ids list).
    def greedy_sql_params() -> SamplingParams:
        return SamplingParams(temperature=0.0,
                              logprobs=1,
                              prompt_logprobs=1,
                              max_tokens=128,
                              stop_token_ids=[32003])

    def beam_sql_params() -> SamplingParams:
        return SamplingParams(n=3,
                              best_of=3,
                              use_beam_search=True,
                              temperature=0,
                              max_tokens=128,
                              stop_token_ids=[32003])

    base_model_cases = [
        ("A robot may not injure a human being",
         SamplingParams(temperature=0.0,
                        logprobs=1,
                        prompt_logprobs=1,
                        max_tokens=128), None),
        ("To be or not to be,",
         SamplingParams(temperature=0.8,
                        top_k=5,
                        presence_penalty=0.2,
                        max_tokens=128), None),
    ]

    # (prompt, params factory, adapter name, adapter id) for each LoRA case.
    lora_plan = (
        (icao_prompt, greedy_sql_params, "sql-lora", 1),
        (nationality_prompt, beam_sql_params, "sql-lora", 1),
        (icao_prompt, greedy_sql_params, "sql-lora2", 2),
        (nationality_prompt, beam_sql_params, "sql-lora", 1),
    )
    lora_cases = [
        (text, make_params(),
         LoRARequest(adapter_name, adapter_id, lora_path))
        for text, make_params, adapter_name, adapter_id in lora_plan
    ]

    return base_model_cases + lora_cases
def process_requests(engine: LLMEngine,
                     test_prompts: List[Tuple[str, SamplingParams,
                                              Optional[LoRARequest]]]):
    """Feed prompts to the engine one per step and print finished outputs.

    On every iteration at most one prompt is removed from the front of
    ``test_prompts`` and submitted with a request id taken from a counter
    that starts at 0 (passed as a string). ``engine.step()`` is then called
    once and every returned output whose ``finished`` attribute is truthy
    is printed. The loop ends when ``test_prompts`` is empty and the engine
    reports no unfinished requests.

    Args:
        engine: Object providing ``add_request``, ``step`` and
            ``has_unfinished_requests``.
        test_prompts: Pending ``(prompt, sampling_params, lora_request)``
            tuples. The list is consumed in place and is empty on return.
    """
    next_id = 0
    while True:
        # Only ask the engine when there is nothing left to submit.
        if not test_prompts and not engine.has_unfinished_requests():
            break

        if test_prompts:
            text, params, adapter = test_prompts.pop(0)
            engine.add_request(str(next_id),
                               text,
                               params,
                               lora_request=adapter)
            next_id += 1

        for output in engine.step():
            if not output.finished:
                continue
            print(output)
def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
# from huggingface_hub import snapshot_download # from huggingface_hub import snapshot_download
from examples.multilora_inference import (create_test_prompts, # from examples.multilora_inference import (create_test_prompts,
process_requests) # process_requests)
model_ref = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf") model_ref = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
# lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") # lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
......
...@@ -14,7 +14,8 @@ TOKENIZER_NAMES = [ ...@@ -14,7 +14,8 @@ TOKENIZER_NAMES = [
@pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES) @pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES)
def test_tokenizer_revision(tokenizer_name: str): def test_tokenizer_revision(tokenizer_name: str):
# Assume that "main" branch always exists # Assume that "main" branch always exists
tokenizer = get_tokenizer(tokenizer_name, revision="main") # tokenizer = get_tokenizer(tokenizer_name, revision="main")
tokenizer = get_tokenizer(tokenizer_name)
assert isinstance(tokenizer, PreTrainedTokenizerBase) assert isinstance(tokenizer, PreTrainedTokenizerBase)
# Assume that "never" branch always does not exist # Assume that "never" branch always does not exist
......
...@@ -15,7 +15,7 @@ def test_weight_loading(vllm_runner): ...@@ -15,7 +15,7 @@ def test_weight_loading(vllm_runner):
Test parameter weight loading with tp>1. Test parameter weight loading with tp>1.
""" """
with vllm_runner(model_name=MODEL_NAME, with vllm_runner(model_name=MODEL_NAME,
revision=REVISION, # revision=REVISION,
dtype=torch.half if QUANTIZATION == "gptq" else "auto", dtype=torch.half if QUANTIZATION == "gptq" else "auto",
quantization=QUANTIZATION, quantization=QUANTIZATION,
max_model_len=MAX_MODEL_LEN, max_model_len=MAX_MODEL_LEN,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment