Commit 3c9817d2 authored by zhuwenwen
Browse files

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path...

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path instead of Hugging Face Hub
parent 49204f68
......@@ -2,11 +2,13 @@ from contextlib import nullcontext
import numpy as np
import pytest
import os
from transformers import CLIPImageProcessor, LlavaNextImageProcessor
from vllm.config import ModelConfig
from vllm.multimodal import MultiModalRegistry
from vllm.multimodal.utils import rescale_image_size
from ..utils import models_path_prefix
@pytest.fixture
......@@ -17,7 +19,7 @@ def mm_registry():
@pytest.mark.parametrize("dtype", ["half", "float"])
@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):
MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")
hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
assert isinstance(hf_processor, CLIPImageProcessor)
......@@ -60,7 +62,7 @@ def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):
@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
def test_llava_next_image_processor(image_assets, mm_registry, dtype,
size_factor):
MODEL_NAME = "llava-hf/llava-v1.6-vicuna-7b-hf"
MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-v1.6-vicuna-7b-hf")
hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)
assert isinstance(hf_processor, LlavaNextImageProcessor)
......@@ -105,7 +107,7 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype,
(2, 1, False), (2, 2, True)],
)
def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")
model_config = ModelConfig(
model=MODEL_NAME,
......@@ -135,7 +137,7 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
# NOTE: We don't test zero images since the HF processor doesn't support it
@pytest.mark.parametrize("num_images", [1, 2])
def test_image_mapper_multi(image_assets, mm_registry, num_images):
MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")
model_config = ModelConfig(
model=MODEL_NAME,
......
......@@ -4,6 +4,7 @@ from unittest.mock import patch
import pytest
import torch
import os
from vllm.inputs import InputContext, LLMInputs
from vllm.inputs.registry import InputRegistry
......@@ -11,11 +12,12 @@ from vllm.multimodal import MultiModalRegistry
from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData
from ..models.utils import build_model_context
from ..utils import models_path_prefix
# Used for fast tests where the model doesn't matter
DUMMY_MODEL_ID = "facebook/opt-125m"
DUMMY_MODEL_ID = os.path.join(models_path_prefix, "facebook/opt-125m")
# Used for tests that need a multimodal model
MULTIMODAL_MODEL_ID = "microsoft/Phi-3.5-vision-instruct"
MULTIMODAL_MODEL_ID = os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")
# For mm_processor_kwargs - we test overrides by defining mocks for each place
# it is used, and ensuring that we can pass processor kwargs an override value
......
......@@ -2,21 +2,23 @@
Run `pytest tests/prefix_caching/test_prefix_caching.py`.
"""
import os
import pytest
from tests.conftest import cleanup
from vllm import LLM
from ..utils import models_path_prefix
MODEL_LEN_LEN = [
# Example models with sliding window.
("bigcode/starcoder2-3b", 4096, 16384),
(os.path.join(models_path_prefix, "bigcode/starcoder2-3b"), 4096, 16384),
# ("mistralai/Mistral-7B-v0.1", 4096, 32768), << OOM in CI
# Confirm model with sliding window works.
# config has "use_sliding_window": false
("Qwen/Qwen1.5-0.5B-Chat", 32768, 32768),
(os.path.join(models_path_prefix, "Qwen/Qwen1.5-0.5B-Chat"), 32768, 32768),
# config has no sliding window attribute.
("TinyLlama/TinyLlama-1.1B-Chat-v1.0", 2048, 2048),
(os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0"), 2048, 2048),
]
......
......@@ -5,6 +5,7 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`.
from typing import List
import pytest
import os
from tests.kernels.utils import override_backend_env_variable
from vllm.block import PhysicalTokenBlock
......@@ -12,9 +13,10 @@ from vllm.core.block_manager_v1 import CachedBlockAllocator
from vllm.utils import Device
from ..models.utils import check_outputs_equal
from ..utils import models_path_prefix
MODELS = [
"facebook/opt-125m",
os.path.join(models_path_prefix, "facebook/opt-125m"),
]
......
import pytest
import os
import vllm
from vllm.prompt_adapter.request import PromptAdapterRequest
from ..utils import models_path_prefix
MODEL_PATH = "bigscience/bloomz-560m"
PA_PATH = 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM'
MODEL_PATH = os.path.join(models_path_prefix, "bigscience/bloomz-560m")
PA_PATH = os.path.join(models_path_prefix, 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM')
def do_sample(llm, pa_name: str, pa_id: int):
......
from vllm import EngineArgs, LLMEngine, SamplingParams
from vllm.prompt_adapter.request import PromptAdapterRequest
from ..utils import models_path_prefix
import os
MODEL_PATH = "bigscience/bloomz-560m"
pa_path = 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM'
pa_path2 = 'swapnilbp/angry_tweet_ptune'
MODEL_PATH = os.path.join(models_path_prefix, "bigscience/bloomz-560m")
pa_path = os.path.join(models_path_prefix, 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM')
pa_path2 = os.path.join(models_path_prefix, 'swapnilbp/angry_tweet_ptune')
def do_sample(engine):
......
......@@ -3,10 +3,14 @@ from huggingface_hub import snapshot_download
from vllm import EngineArgs, LLMEngine, SamplingParams
from vllm.lora.request import LoRARequest
from vllm.prompt_adapter.request import PromptAdapterRequest
from ..utils import models_path_prefix
import os
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
pa_path = snapshot_download(repo_id="swapnilbp/llama_tweet_ptune")
lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
MODEL_PATH = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
# pa_path = snapshot_download(repo_id="swapnilbp/llama_tweet_ptune")
# lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
pa_path = os.path.join(models_path_prefix, "swapnilbp/llama_tweet_ptune")
lora_path = os.path.join(models_path_prefix, "yard1/llama-2-7b-sql-lora-test")
def do_sample(engine):
......
......@@ -4,27 +4,29 @@ Run `pytest tests/quantization/test_bitsandbytes.py`.
'''
import gc
import os
import pytest
import torch
from tests.quantization.utils import is_quant_method_supported
from ..utils import fork_new_process_for_each_test
from ..utils import fork_new_process_for_each_test, models_path_prefix
models_4bit_to_test = [
('huggyllama/llama-7b', 'quantize model inflight'),
(os.path.join(models_path_prefix, 'huggyllama/llama-7b'), 'quantize model inflight'),
]
models_pre_qaunt_4bit_to_test = [
('lllyasviel/omost-llama-3-8b-4bits',
(os.path.join(models_path_prefix, 'lllyasviel/omost-llama-3-8b-4bits'),
'read pre-quantized 4-bit NF4 model'),
('PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed',
(os.path.join(models_path_prefix, 'PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed'),
'read pre-quantized 4-bit FP4 model'),
]
models_pre_quant_8bit_to_test = [
('meta-llama/Llama-Guard-3-8B-INT8', 'read pre-quantized 8-bit model'),
(os.path.join(models_path_prefix, 'meta-llama/Llama-Guard-3-8B-INT8'), 'read pre-quantized 8-bit model'),
]
......
......@@ -5,6 +5,7 @@ Run `pytest tests/quantization/test_compressed_tensors.py`.
import pytest
import torch
import os
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
......@@ -12,12 +13,13 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
QuantizationType)
from ..utils import models_path_prefix
@pytest.mark.parametrize("model_args", [
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", "tensor",
(os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"), "tensor",
QuantizationType.INT, 2560),
("nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", "channel",
(os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor"), "channel",
QuantizationType.INT, 2560),
])
def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
......@@ -61,15 +63,15 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
def test_compressed_tensors_no_enforce_eager(vllm_runner):
model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"
model_path = os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change")
with vllm_runner(model_path) as llm:
output = llm.generate_greedy("Hello my name is", max_tokens=20)
assert output
@pytest.mark.parametrize("model_args", [
("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"),
("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", "channel"),
(os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"), "tensor"),
(os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"), "channel"),
])
def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args):
model_path, strategy = model_args
......@@ -91,9 +93,9 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args):
@pytest.mark.parametrize(
"wNa16_args",
[("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8),
("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8),
("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4)])
[(os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w4a16-channel-v2"), "channel", None, 8),
(os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w4a16-group128-v2"), "group", 128, 8),
(os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w8a16-per-channel"), "channel", None, 4)])
def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
model, strategy, group, pack_factor = wNa16_args
with vllm_runner(model) as llm:
......@@ -116,7 +118,7 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
def test_compressed_tensors_w4a16_marlin24(vllm_runner):
model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
model_path = os.path.join(models_path_prefix,"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t")
with vllm_runner(model_path) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
layer = model.model.layers[0]
......@@ -132,7 +134,7 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner):
def test_compressed_tensors_fp8(vllm_runner):
model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
model_path = os.path.join(models_path_prefix,"nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test")
with vllm_runner(model_path) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
layer = model.model.layers[0]
......@@ -157,7 +159,7 @@ def test_compressed_tensors_fp8(vllm_runner):
def test_compressed_tensors_kv_cache(vllm_runner):
model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
model_path = os.path.join(models_path_prefix,"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")
with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
output = llm.generate_greedy("Hello world!", max_tokens=20)
assert output
\ No newline at end of file
......@@ -7,8 +7,10 @@ from dataclasses import dataclass
from typing import Tuple
import pytest
import os
from vllm.config import ModelConfig
from ..utils import models_path_prefix
@dataclass
......@@ -22,32 +24,32 @@ MODEL_ARG_EXPTYPES = [
# AUTOGPTQ
# compat: autogptq <=0.7.1 is_marlin_format: bool
# Model Serialized in Marlin Format should always use Marlin kernel.
("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", None, "marlin"),
("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin", "marlin"),
("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "gptq", "marlin"),
("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "awq", "ERROR"),
(os.path.join(models_path_prefix, "neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"), None, "marlin"),
(os.path.join(models_path_prefix, "neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"), "marlin", "marlin"),
(os.path.join(models_path_prefix, "neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"), "gptq", "marlin"),
(os.path.join(models_path_prefix, "neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"), "awq", "ERROR"),
# Model Serialized in Exllama Format.
("TheBloke/Llama-2-7B-Chat-GPTQ", None, "gptq_marlin"),
("TheBloke/Llama-2-7B-Chat-GPTQ", "marlin", "gptq_marlin"),
("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq", "gptq"),
("TheBloke/Llama-2-7B-Chat-GPTQ", "awq", "ERROR"),
(os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), None, "gptq_marlin"),
(os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "marlin", "gptq_marlin"),
(os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "gptq", "gptq"),
(os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "awq", "ERROR"),
# compat: autogptq >=0.8.0 use checkpoint_format: str
# Model Serialized in Marlin Format should always use Marlin kernel.
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", None, "marlin"),
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin", "marlin"),
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "gptq", "marlin"),
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "awq", "ERROR"),
(os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"), None, "marlin"),
(os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"), "marlin", "marlin"),
(os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"), "gptq", "marlin"),
(os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"), "awq", "ERROR"),
# Model Serialized in Exllama Format.
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", None, "gptq_marlin"),
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"),
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq", "gptq"),
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "awq", "ERROR"),
(os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), None, "gptq_marlin"),
(os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), "marlin", "gptq_marlin"),
(os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), "gptq", "gptq"),
(os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), "awq", "ERROR"),
# AUTOAWQ
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", None, "awq_marlin"),
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "awq", "awq"),
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "marlin", "awq_marlin"),
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "gptq", "ERROR"),
(os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), None, "awq_marlin"),
(os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), "awq", "awq"),
(os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), "marlin", "awq_marlin"),
(os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), "gptq", "ERROR"),
]
......
......@@ -2,22 +2,23 @@
# Base tests: tests/basic_correctness/test_cpu_offload.py
import pytest
import os
from tests.quantization.utils import is_quant_method_supported
from ..utils import compare_two_settings
from ..utils import compare_two_settings, models_path_prefix
@pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.")
def test_cpu_offload_fp8():
# Test quantization of an unquantized checkpoint
compare_two_settings("meta-llama/Meta-Llama-3-8B-Instruct",
compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
["--quantization", "fp8"],
["--quantization", "fp8", "--cpu-offload-gb", "2"],
max_wait_seconds=480)
# Test loading a quantized checkpoint
compare_two_settings("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", [],
compare_two_settings(os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), [],
["--cpu-offload-gb", "2"],
max_wait_seconds=480)
......@@ -26,11 +27,11 @@ def test_cpu_offload_fp8():
reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_gptq():
# Test GPTQ Marlin
compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"), [],
["--cpu-offload-gb", "1"],
max_wait_seconds=480)
# Test GPTQ
compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"),
["--quantization", "gptq"],
["--quantization", "gptq", "--cpu-offload-gb", "1"],
max_wait_seconds=480)
......@@ -40,11 +41,11 @@ def test_cpu_offload_gptq():
reason="awq_marlin is not supported on this GPU type.")
def test_cpu_offload_awq():
# Test AWQ Marlin
compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [],
compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-AWQ"), [],
["--cpu-offload-gb", "1"],
max_wait_seconds=480)
# Test AWQ
compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ",
compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-AWQ"),
["--quantization", "awq"],
["--quantization", "awq", "--cpu-offload-gb", "1"],
max_wait_seconds=480)
......@@ -54,15 +55,15 @@ def test_cpu_offload_awq():
reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_compressed_tensors():
# Test wNa16
compare_two_settings("nm-testing/tinyllama-oneshot-w4a16-channel-v2", [],
compare_two_settings(os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w4a16-channel-v2"), [],
["--cpu-offload-gb", "1"],
max_wait_seconds=480)
# Test w4a16_marlin24
compare_two_settings("nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
compare_two_settings(os.path.join(models_path_prefix, "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"),
[], ["--cpu-offload-gb", "1"],
max_wait_seconds=480)
# Test w8a8
compare_two_settings(
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", [],
os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"), [],
["--cpu-offload-gb", "1"],
max_wait_seconds=480)
......@@ -3,10 +3,12 @@
doesn't test correctness
"""
import pytest
import os
from tests.quantization.utils import is_quant_method_supported
from ..utils import models_path_prefix
MODELS = ["ai21labs/Jamba-tiny-random"]
MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-random")]
@pytest.mark.skipif(not is_quant_method_supported("experts_int8"),
......
......@@ -4,17 +4,19 @@ Run `pytest tests/quantization/test_fp8.py --forked`.
"""
import pytest
import torch
import os
from tests.quantization.utils import is_quant_method_supported
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod,
Fp8LinearMethod)
from vllm.platforms import current_platform
from ..utils import models_path_prefix
MODELS = [
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
"nm-testing/Phi-3-mini-128k-instruct-FP8",
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"),
os.path.join(models_path_prefix, "nm-testing/Phi-3-mini-128k-instruct-FP8"),
os.path.join(models_path_prefix, "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV"),
]
......@@ -37,9 +39,9 @@ def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool,
KV_CACHE_MODELS = [
# Deprecated AutoFP8 format using .kv_scale
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"),
# AutoFP8 format using separate .k_scale and .v_scale
"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
os.path.join(models_path_prefix, "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V"),
]
......@@ -73,7 +75,7 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
if force_marlin:
monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
with vllm_runner("facebook/opt-125m",
with vllm_runner(os.path.join(models_path_prefix, "facebook/opt-125m"),
quantization="fp8",
kv_cache_dtype=kv_cache_dtype) as llm:
......
......@@ -6,6 +6,7 @@ from typing import Tuple
import pytest
import torch
import os
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
from vllm.model_executor.layers.quantization.gptq_marlin import (
......@@ -13,13 +14,14 @@ from vllm.model_executor.layers.quantization.gptq_marlin import (
from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod
from vllm.model_executor.layers.vocab_parallel_embedding import (
UnquantizedEmbeddingMethod)
from ..utils import models_path_prefix
PROMPT = "On the surface of Mars, we found"
MODELS_QUANT = [(
"LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse",
True), ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False),
("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False)]
os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse"),
True), (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), False),
(os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), False)]
@pytest.mark.parametrize("model_lm_head_quant", MODELS_QUANT)
......
......@@ -4,6 +4,8 @@ Run `pytest tests/samplers/test_beam_search.py`.
"""
import pytest
import os
from ..utils import models_path_prefix
# FIXME(zhuohan): The test can not pass if we:
# 1. Increase max_tokens to 256.
......@@ -11,7 +13,7 @@ import pytest
# 3. Use the model "huggyllama/llama-7b".
MAX_TOKENS = [64]
BEAM_WIDTHS = [4]
MODELS = ["TinyLlama/TinyLlama-1.1B-Chat-v1.0"]
MODELS = [os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0")]
@pytest.mark.parametrize("model", MODELS)
......
......@@ -4,12 +4,14 @@ Run `pytest tests/samplers/test_ignore_eos.py`.
"""
import pytest
import os
from vllm import SamplingParams
from ..utils import models_path_prefix
# We also test with llama because it has generation_config to specify EOS
# (past regression).
MODELS = ["facebook/opt-125m", "meta-llama/Llama-2-7b-hf"]
MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m"), os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")]
@pytest.mark.parametrize("model", MODELS)
......
import pytest
import torch
import os
from vllm import SamplingParams
from ..utils import models_path_prefix
MODELS = ["facebook/opt-125m"]
MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m")]
@pytest.mark.parametrize("model", MODELS)
......
......@@ -2,12 +2,14 @@ from typing import List
import pytest
import torch
import os
from vllm import SamplingParams
from ..conftest import VllmRunner
from ..utils import models_path_prefix
MODELS = ["facebook/opt-125m"]
MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m")]
@pytest.mark.parametrize("model", MODELS)
......@@ -130,7 +132,7 @@ def test_get_prompt_logprobs(
def test_max_logprobs():
runner = VllmRunner("facebook/opt-125m", max_logprobs=1)
runner = VllmRunner(os.path.join(models_path_prefix, "facebook/opt-125m"), max_logprobs=1)
vllm_sampling_params = SamplingParams(logprobs=1)
# should pass
runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
......
import pytest
import os
from vllm import SamplingParams
from ..utils import models_path_prefix
MODELS = ["facebook/opt-125m"]
MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m")]
@pytest.mark.parametrize("model", MODELS)
......
......@@ -7,11 +7,13 @@ import random
from itertools import combinations
import pytest
import os
from vllm import SamplingParams
from vllm.model_executor.utils import set_random_seed
from ..utils import models_path_prefix
MODEL = "facebook/opt-125m"
MODEL = os.path.join(models_path_prefix, "facebook/opt-125m")
RANDOM_SEEDS = list(range(5))
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment