Commit 3c9817d2 authored by zhuwenwen
Browse files

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path...

Add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from a local path instead of the Hugging Face Hub
parent 49204f68
...@@ -2,11 +2,13 @@ from contextlib import nullcontext ...@@ -2,11 +2,13 @@ from contextlib import nullcontext
import numpy as np import numpy as np
import pytest import pytest
import os
from transformers import CLIPImageProcessor, LlavaNextImageProcessor from transformers import CLIPImageProcessor, LlavaNextImageProcessor
from vllm.config import ModelConfig from vllm.config import ModelConfig
from vllm.multimodal import MultiModalRegistry from vllm.multimodal import MultiModalRegistry
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
from ..utils import models_path_prefix
@pytest.fixture @pytest.fixture
...@@ -17,7 +19,7 @@ def mm_registry(): ...@@ -17,7 +19,7 @@ def mm_registry():
@pytest.mark.parametrize("dtype", ["half", "float"]) @pytest.mark.parametrize("dtype", ["half", "float"])
@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0]) @pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor): def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):
MODEL_NAME = "llava-hf/llava-1.5-7b-hf" MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")
hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME) hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
assert isinstance(hf_processor, CLIPImageProcessor) assert isinstance(hf_processor, CLIPImageProcessor)
...@@ -60,7 +62,7 @@ def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor): ...@@ -60,7 +62,7 @@ def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):
@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0]) @pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
def test_llava_next_image_processor(image_assets, mm_registry, dtype, def test_llava_next_image_processor(image_assets, mm_registry, dtype,
size_factor): size_factor):
MODEL_NAME = "llava-hf/llava-v1.6-vicuna-7b-hf" MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-v1.6-vicuna-7b-hf")
hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME) hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)
assert isinstance(hf_processor, LlavaNextImageProcessor) assert isinstance(hf_processor, LlavaNextImageProcessor)
...@@ -105,7 +107,7 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype, ...@@ -105,7 +107,7 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype,
(2, 1, False), (2, 2, True)], (2, 1, False), (2, 2, True)],
) )
def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid): def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
MODEL_NAME = "llava-hf/llava-1.5-7b-hf" MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")
model_config = ModelConfig( model_config = ModelConfig(
model=MODEL_NAME, model=MODEL_NAME,
...@@ -135,7 +137,7 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid): ...@@ -135,7 +137,7 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
# NOTE: We don't test zero images since the HF processor doesn't support it # NOTE: We don't test zero images since the HF processor doesn't support it
@pytest.mark.parametrize("num_images", [1, 2]) @pytest.mark.parametrize("num_images", [1, 2])
def test_image_mapper_multi(image_assets, mm_registry, num_images): def test_image_mapper_multi(image_assets, mm_registry, num_images):
MODEL_NAME = "llava-hf/llava-1.5-7b-hf" MODEL_NAME = os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf")
model_config = ModelConfig( model_config = ModelConfig(
model=MODEL_NAME, model=MODEL_NAME,
......
...@@ -4,6 +4,7 @@ from unittest.mock import patch ...@@ -4,6 +4,7 @@ from unittest.mock import patch
import pytest import pytest
import torch import torch
import os
from vllm.inputs import InputContext, LLMInputs from vllm.inputs import InputContext, LLMInputs
from vllm.inputs.registry import InputRegistry from vllm.inputs.registry import InputRegistry
...@@ -11,11 +12,12 @@ from vllm.multimodal import MultiModalRegistry ...@@ -11,11 +12,12 @@ from vllm.multimodal import MultiModalRegistry
from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData
from ..models.utils import build_model_context from ..models.utils import build_model_context
from ..utils import models_path_prefix
# Used for fast tests where the model doesn't matter # Used for fast tests where the model doesn't matter
DUMMY_MODEL_ID = "facebook/opt-125m" DUMMY_MODEL_ID = os.path.join(models_path_prefix, "facebook/opt-125m")
# Used for tests that need a multimodal model # Used for tests that need a multimodal model
MULTIMODAL_MODEL_ID = "microsoft/Phi-3.5-vision-instruct" MULTIMODAL_MODEL_ID = os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")
# For mm_processor_kwargs - we test overrides by defining mocks for each place # For mm_processor_kwargs - we test overrides by defining mocks for each place
# it is used, and ensuring that we can pass processor kwargs an override value # it is used, and ensuring that we can pass processor kwargs an override value
......
...@@ -2,21 +2,23 @@ ...@@ -2,21 +2,23 @@
Run `pytest tests/prefix_caching/test_prefix_caching.py`. Run `pytest tests/prefix_caching/test_prefix_caching.py`.
""" """
import os
import pytest import pytest
from tests.conftest import cleanup from tests.conftest import cleanup
from vllm import LLM from vllm import LLM
from ..utils import models_path_prefix
MODEL_LEN_LEN = [ MODEL_LEN_LEN = [
# Example models with sliding window. # Example models with sliding window.
("bigcode/starcoder2-3b", 4096, 16384), (os.path.join(models_path_prefix, "bigcode/starcoder2-3b"), 4096, 16384),
# ("mistralai/Mistral-7B-v0.1", 4096, 32768), << OOM in CI # ("mistralai/Mistral-7B-v0.1", 4096, 32768), << OOM in CI
# Confirm model with sliding window works. # Confirm model with sliding window works.
# config has "use_sliding_window": false # config has "use_sliding_window": false
("Qwen/Qwen1.5-0.5B-Chat", 32768, 32768), (os.path.join(models_path_prefix, "Qwen/Qwen1.5-0.5B-Chat"), 32768, 32768),
# config has no sliding window attribute. # config has no sliding window attribute.
("TinyLlama/TinyLlama-1.1B-Chat-v1.0", 2048, 2048), (os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0"), 2048, 2048),
] ]
......
...@@ -5,6 +5,7 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`. ...@@ -5,6 +5,7 @@ Run `pytest tests/prefix_caching/test_prefix_caching.py`.
from typing import List from typing import List
import pytest import pytest
import os
from tests.kernels.utils import override_backend_env_variable from tests.kernels.utils import override_backend_env_variable
from vllm.block import PhysicalTokenBlock from vllm.block import PhysicalTokenBlock
...@@ -12,9 +13,10 @@ from vllm.core.block_manager_v1 import CachedBlockAllocator ...@@ -12,9 +13,10 @@ from vllm.core.block_manager_v1 import CachedBlockAllocator
from vllm.utils import Device from vllm.utils import Device
from ..models.utils import check_outputs_equal from ..models.utils import check_outputs_equal
from ..utils import models_path_prefix
MODELS = [ MODELS = [
"facebook/opt-125m", os.path.join(models_path_prefix, "facebook/opt-125m"),
] ]
......
import pytest import pytest
import os
import vllm import vllm
from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.prompt_adapter.request import PromptAdapterRequest
from ..utils import models_path_prefix
MODEL_PATH = "bigscience/bloomz-560m" MODEL_PATH = os.path.join(models_path_prefix, "bigscience/bloomz-560m")
PA_PATH = 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM' PA_PATH = os.path.join(models_path_prefix, 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM')
def do_sample(llm, pa_name: str, pa_id: int): def do_sample(llm, pa_name: str, pa_id: int):
......
from vllm import EngineArgs, LLMEngine, SamplingParams from vllm import EngineArgs, LLMEngine, SamplingParams
from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.prompt_adapter.request import PromptAdapterRequest
from ..utils import models_path_prefix
import os
MODEL_PATH = "bigscience/bloomz-560m" MODEL_PATH = os.path.join(models_path_prefix, "bigscience/bloomz-560m")
pa_path = 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM' pa_path = os.path.join(models_path_prefix, 'stevhliu/bloomz-560m_PROMPT_TUNING_CAUSAL_LM')
pa_path2 = 'swapnilbp/angry_tweet_ptune' pa_path2 = os.path.join(models_path_prefix, 'swapnilbp/angry_tweet_ptune')
def do_sample(engine): def do_sample(engine):
......
...@@ -3,10 +3,14 @@ from huggingface_hub import snapshot_download ...@@ -3,10 +3,14 @@ from huggingface_hub import snapshot_download
from vllm import EngineArgs, LLMEngine, SamplingParams from vllm import EngineArgs, LLMEngine, SamplingParams
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.prompt_adapter.request import PromptAdapterRequest
from ..utils import models_path_prefix
import os
MODEL_PATH = "meta-llama/Llama-2-7b-hf" MODEL_PATH = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")
pa_path = snapshot_download(repo_id="swapnilbp/llama_tweet_ptune") # pa_path = snapshot_download(repo_id="swapnilbp/llama_tweet_ptune")
lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") # lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
pa_path = os.path.join(models_path_prefix, "swapnilbp/llama_tweet_ptune")
lora_path = os.path.join(models_path_prefix, "yard1/llama-2-7b-sql-lora-test")
def do_sample(engine): def do_sample(engine):
......
...@@ -4,27 +4,29 @@ Run `pytest tests/quantization/test_bitsandbytes.py`. ...@@ -4,27 +4,29 @@ Run `pytest tests/quantization/test_bitsandbytes.py`.
''' '''
import gc import gc
import os
import pytest import pytest
import torch import torch
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from ..utils import fork_new_process_for_each_test from ..utils import fork_new_process_for_each_test, models_path_prefix
models_4bit_to_test = [ models_4bit_to_test = [
('huggyllama/llama-7b', 'quantize model inflight'), (os.path.join(models_path_prefix, 'huggyllama/llama-7b'), 'quantize model inflight'),
] ]
models_pre_qaunt_4bit_to_test = [ models_pre_qaunt_4bit_to_test = [
('lllyasviel/omost-llama-3-8b-4bits', (os.path.join(models_path_prefix, 'lllyasviel/omost-llama-3-8b-4bits'),
'read pre-quantized 4-bit NF4 model'), 'read pre-quantized 4-bit NF4 model'),
('PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed', (os.path.join(models_path_prefix, 'PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed'),
'read pre-quantized 4-bit FP4 model'), 'read pre-quantized 4-bit FP4 model'),
] ]
models_pre_quant_8bit_to_test = [ models_pre_quant_8bit_to_test = [
('meta-llama/Llama-Guard-3-8B-INT8', 'read pre-quantized 8-bit model'), (os.path.join(models_path_prefix, 'meta-llama/Llama-Guard-3-8B-INT8'), 'read pre-quantized 8-bit model'),
] ]
......
...@@ -5,6 +5,7 @@ Run `pytest tests/quantization/test_compressed_tensors.py`. ...@@ -5,6 +5,7 @@ Run `pytest tests/quantization/test_compressed_tensors.py`.
import pytest import pytest
import torch import torch
import os
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24, CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
...@@ -12,12 +13,13 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso ...@@ -12,12 +13,13 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
CompressedTensorsW8A16Fp8, CompressedTensorsWNA16) CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
QuantizationType) QuantizationType)
from ..utils import models_path_prefix
@pytest.mark.parametrize("model_args", [ @pytest.mark.parametrize("model_args", [
("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", "tensor", (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"), "tensor",
QuantizationType.INT, 2560), QuantizationType.INT, 2560),
("nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", "channel", (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor"), "channel",
QuantizationType.INT, 2560), QuantizationType.INT, 2560),
]) ])
def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
...@@ -61,15 +63,15 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): ...@@ -61,15 +63,15 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
def test_compressed_tensors_no_enforce_eager(vllm_runner): def test_compressed_tensors_no_enforce_eager(vllm_runner):
model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change" model_path = os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change")
with vllm_runner(model_path) as llm: with vllm_runner(model_path) as llm:
output = llm.generate_greedy("Hello my name is", max_tokens=20) output = llm.generate_greedy("Hello my name is", max_tokens=20)
assert output assert output
@pytest.mark.parametrize("model_args", [ @pytest.mark.parametrize("model_args", [
("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"), (os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"), "tensor"),
("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", "channel"), (os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"), "channel"),
]) ])
def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args): def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args):
model_path, strategy = model_args model_path, strategy = model_args
...@@ -91,9 +93,9 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args): ...@@ -91,9 +93,9 @@ def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"wNa16_args", "wNa16_args",
[("nm-testing/tinyllama-oneshot-w4a16-channel-v2", "channel", None, 8), [(os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w4a16-channel-v2"), "channel", None, 8),
("nm-testing/tinyllama-oneshot-w4a16-group128-v2", "group", 128, 8), (os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w4a16-group128-v2"), "group", 128, 8),
("nm-testing/tinyllama-oneshot-w8a16-per-channel", "channel", None, 4)]) (os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w8a16-per-channel"), "channel", None, 4)])
def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
model, strategy, group, pack_factor = wNa16_args model, strategy, group, pack_factor = wNa16_args
with vllm_runner(model) as llm: with vllm_runner(model) as llm:
...@@ -116,7 +118,7 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args): ...@@ -116,7 +118,7 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
def test_compressed_tensors_w4a16_marlin24(vllm_runner): def test_compressed_tensors_w4a16_marlin24(vllm_runner):
model_path = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t" model_path = os.path.join(models_path_prefix,"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t")
with vllm_runner(model_path) as llm: with vllm_runner(model_path) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
layer = model.model.layers[0] layer = model.model.layers[0]
...@@ -132,7 +134,7 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner): ...@@ -132,7 +134,7 @@ def test_compressed_tensors_w4a16_marlin24(vllm_runner):
def test_compressed_tensors_fp8(vllm_runner): def test_compressed_tensors_fp8(vllm_runner):
model_path = "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test" model_path = os.path.join(models_path_prefix,"nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test")
with vllm_runner(model_path) as llm: with vllm_runner(model_path) as llm:
model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501
layer = model.model.layers[0] layer = model.model.layers[0]
...@@ -157,7 +159,7 @@ def test_compressed_tensors_fp8(vllm_runner): ...@@ -157,7 +159,7 @@ def test_compressed_tensors_fp8(vllm_runner):
def test_compressed_tensors_kv_cache(vllm_runner): def test_compressed_tensors_kv_cache(vllm_runner):
model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme" model_path = os.path.join(models_path_prefix,"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")
with vllm_runner(model_path, kv_cache_dtype="fp8") as llm: with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
output = llm.generate_greedy("Hello world!", max_tokens=20) output = llm.generate_greedy("Hello world!", max_tokens=20)
assert output assert output
\ No newline at end of file
...@@ -7,8 +7,10 @@ from dataclasses import dataclass ...@@ -7,8 +7,10 @@ from dataclasses import dataclass
from typing import Tuple from typing import Tuple
import pytest import pytest
import os
from vllm.config import ModelConfig from vllm.config import ModelConfig
from ..utils import models_path_prefix
@dataclass @dataclass
...@@ -22,32 +24,32 @@ MODEL_ARG_EXPTYPES = [ ...@@ -22,32 +24,32 @@ MODEL_ARG_EXPTYPES = [
# AUTOGPTQ # AUTOGPTQ
# compat: autogptq <=0.7.1 is_marlin_format: bool # compat: autogptq <=0.7.1 is_marlin_format: bool
# Model Serialized in Marlin Format should always use Marlin kernel. # Model Serialized in Marlin Format should always use Marlin kernel.
("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", None, "marlin"), (os.path.join(models_path_prefix, "neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"), None, "marlin"),
("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "marlin", "marlin"), (os.path.join(models_path_prefix, "neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"), "marlin", "marlin"),
("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "gptq", "marlin"), (os.path.join(models_path_prefix, "neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"), "gptq", "marlin"),
("neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin", "awq", "ERROR"), (os.path.join(models_path_prefix, "neuralmagic/TinyLlama-1.1B-Chat-v1.0-marlin"), "awq", "ERROR"),
# Model Serialized in Exllama Format. # Model Serialized in Exllama Format.
("TheBloke/Llama-2-7B-Chat-GPTQ", None, "gptq_marlin"), (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), None, "gptq_marlin"),
("TheBloke/Llama-2-7B-Chat-GPTQ", "marlin", "gptq_marlin"), (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "marlin", "gptq_marlin"),
("TheBloke/Llama-2-7B-Chat-GPTQ", "gptq", "gptq"), (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "gptq", "gptq"),
("TheBloke/Llama-2-7B-Chat-GPTQ", "awq", "ERROR"), (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "awq", "ERROR"),
# compat: autogptq >=0.8.0 use checkpoint_format: str # compat: autogptq >=0.8.0 use checkpoint_format: str
# Model Serialized in Marlin Format should always use Marlin kernel. # Model Serialized in Marlin Format should always use Marlin kernel.
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", None, "marlin"), (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"), None, "marlin"),
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "marlin", "marlin"), (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"), "marlin", "marlin"),
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "gptq", "marlin"), (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"), "gptq", "marlin"),
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit", "awq", "ERROR"), (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-Marlin-4bit"), "awq", "ERROR"),
# Model Serialized in Exllama Format. # Model Serialized in Exllama Format.
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", None, "gptq_marlin"), (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), None, "gptq_marlin"),
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "marlin", "gptq_marlin"), (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), "marlin", "gptq_marlin"),
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "gptq", "gptq"), (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), "gptq", "gptq"),
("LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit", "awq", "ERROR"), (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), "awq", "ERROR"),
# AUTOAWQ # AUTOAWQ
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", None, "awq_marlin"), (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), None, "awq_marlin"),
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "awq", "awq"), (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), "awq", "awq"),
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "marlin", "awq_marlin"), (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), "marlin", "awq_marlin"),
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "gptq", "ERROR"), (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), "gptq", "ERROR"),
] ]
......
...@@ -2,22 +2,23 @@ ...@@ -2,22 +2,23 @@
# Base tests: tests/basic_correctness/test_cpu_offload.py # Base tests: tests/basic_correctness/test_cpu_offload.py
import pytest import pytest
import os
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from ..utils import compare_two_settings from ..utils import compare_two_settings, models_path_prefix
@pytest.mark.skipif(not is_quant_method_supported("fp8"), @pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.") reason="fp8 is not supported on this GPU type.")
def test_cpu_offload_fp8(): def test_cpu_offload_fp8():
# Test quantization of an unquantized checkpoint # Test quantization of an unquantized checkpoint
compare_two_settings("meta-llama/Meta-Llama-3-8B-Instruct", compare_two_settings(os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
["--quantization", "fp8"], ["--quantization", "fp8"],
["--quantization", "fp8", "--cpu-offload-gb", "2"], ["--quantization", "fp8", "--cpu-offload-gb", "2"],
max_wait_seconds=480) max_wait_seconds=480)
# Test loading a quantized checkpoint # Test loading a quantized checkpoint
compare_two_settings("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", [], compare_two_settings(os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), [],
["--cpu-offload-gb", "2"], ["--cpu-offload-gb", "2"],
max_wait_seconds=480) max_wait_seconds=480)
...@@ -26,11 +27,11 @@ def test_cpu_offload_fp8(): ...@@ -26,11 +27,11 @@ def test_cpu_offload_fp8():
reason="gptq_marlin is not supported on this GPU type.") reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_gptq(): def test_cpu_offload_gptq():
# Test GPTQ Marlin # Test GPTQ Marlin
compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [], compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"), [],
["--cpu-offload-gb", "1"], ["--cpu-offload-gb", "1"],
max_wait_seconds=480) max_wait_seconds=480)
# Test GPTQ # Test GPTQ
compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"),
["--quantization", "gptq"], ["--quantization", "gptq"],
["--quantization", "gptq", "--cpu-offload-gb", "1"], ["--quantization", "gptq", "--cpu-offload-gb", "1"],
max_wait_seconds=480) max_wait_seconds=480)
...@@ -40,11 +41,11 @@ def test_cpu_offload_gptq(): ...@@ -40,11 +41,11 @@ def test_cpu_offload_gptq():
reason="awq_marlin is not supported on this GPU type.") reason="awq_marlin is not supported on this GPU type.")
def test_cpu_offload_awq(): def test_cpu_offload_awq():
# Test AWQ Marlin # Test AWQ Marlin
compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [], compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-AWQ"), [],
["--cpu-offload-gb", "1"], ["--cpu-offload-gb", "1"],
max_wait_seconds=480) max_wait_seconds=480)
# Test AWQ # Test AWQ
compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-AWQ"),
["--quantization", "awq"], ["--quantization", "awq"],
["--quantization", "awq", "--cpu-offload-gb", "1"], ["--quantization", "awq", "--cpu-offload-gb", "1"],
max_wait_seconds=480) max_wait_seconds=480)
...@@ -54,15 +55,15 @@ def test_cpu_offload_awq(): ...@@ -54,15 +55,15 @@ def test_cpu_offload_awq():
reason="gptq_marlin is not supported on this GPU type.") reason="gptq_marlin is not supported on this GPU type.")
def test_cpu_offload_compressed_tensors(): def test_cpu_offload_compressed_tensors():
# Test wNa16 # Test wNa16
compare_two_settings("nm-testing/tinyllama-oneshot-w4a16-channel-v2", [], compare_two_settings(os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w4a16-channel-v2"), [],
["--cpu-offload-gb", "1"], ["--cpu-offload-gb", "1"],
max_wait_seconds=480) max_wait_seconds=480)
# Test w4a16_marlin24 # Test w4a16_marlin24
compare_two_settings("nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t", compare_two_settings(os.path.join(models_path_prefix, "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"),
[], ["--cpu-offload-gb", "1"], [], ["--cpu-offload-gb", "1"],
max_wait_seconds=480) max_wait_seconds=480)
# Test w8a8 # Test w8a8
compare_two_settings( compare_two_settings(
"nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", [], os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change"), [],
["--cpu-offload-gb", "1"], ["--cpu-offload-gb", "1"],
max_wait_seconds=480) max_wait_seconds=480)
...@@ -3,10 +3,12 @@ ...@@ -3,10 +3,12 @@
doesn't test correctness doesn't test correctness
""" """
import pytest import pytest
import os
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from ..utils import models_path_prefix
MODELS = ["ai21labs/Jamba-tiny-random"] MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-random")]
@pytest.mark.skipif(not is_quant_method_supported("experts_int8"), @pytest.mark.skipif(not is_quant_method_supported("experts_int8"),
......
...@@ -4,17 +4,19 @@ Run `pytest tests/quantization/test_fp8.py --forked`. ...@@ -4,17 +4,19 @@ Run `pytest tests/quantization/test_fp8.py --forked`.
""" """
import pytest import pytest
import torch import torch
import os
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod, from vllm.model_executor.layers.quantization.fp8 import (Fp8KVCacheMethod,
Fp8LinearMethod) Fp8LinearMethod)
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ..utils import models_path_prefix
MODELS = [ MODELS = [
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV", os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"),
"nm-testing/Phi-3-mini-128k-instruct-FP8", os.path.join(models_path_prefix, "nm-testing/Phi-3-mini-128k-instruct-FP8"),
"nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV", os.path.join(models_path_prefix, "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV"),
] ]
...@@ -37,9 +39,9 @@ def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool, ...@@ -37,9 +39,9 @@ def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool,
KV_CACHE_MODELS = [ KV_CACHE_MODELS = [
# Deprecated AutoFP8 format using .kv_scale # Deprecated AutoFP8 format using .kv_scale
"neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV", os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"),
# AutoFP8 format using separate .k_scale and .v_scale # AutoFP8 format using separate .k_scale and .v_scale
"nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", os.path.join(models_path_prefix, "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V"),
] ]
...@@ -73,7 +75,7 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool, ...@@ -73,7 +75,7 @@ def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
if force_marlin: if force_marlin:
monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1") monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
with vllm_runner("facebook/opt-125m", with vllm_runner(os.path.join(models_path_prefix, "facebook/opt-125m"),
quantization="fp8", quantization="fp8",
kv_cache_dtype=kv_cache_dtype) as llm: kv_cache_dtype=kv_cache_dtype) as llm:
......
...@@ -6,6 +6,7 @@ from typing import Tuple ...@@ -6,6 +6,7 @@ from typing import Tuple
import pytest import pytest
import torch import torch
import os
from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
from vllm.model_executor.layers.quantization.gptq_marlin import ( from vllm.model_executor.layers.quantization.gptq_marlin import (
...@@ -13,13 +14,14 @@ from vllm.model_executor.layers.quantization.gptq_marlin import ( ...@@ -13,13 +14,14 @@ from vllm.model_executor.layers.quantization.gptq_marlin import (
from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod from vllm.model_executor.layers.quantization.marlin import MarlinLinearMethod
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
UnquantizedEmbeddingMethod) UnquantizedEmbeddingMethod)
from ..utils import models_path_prefix
PROMPT = "On the surface of Mars, we found" PROMPT = "On the surface of Mars, we found"
MODELS_QUANT = [( MODELS_QUANT = [(
"LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse", os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-intermediate-step-1341k-3T-autoround-lm_head-symFalse"),
True), ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", False), True), (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), False),
("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", False)] (os.path.join(models_path_prefix, "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"), False)]
@pytest.mark.parametrize("model_lm_head_quant", MODELS_QUANT) @pytest.mark.parametrize("model_lm_head_quant", MODELS_QUANT)
......
...@@ -4,6 +4,8 @@ Run `pytest tests/samplers/test_beam_search.py`. ...@@ -4,6 +4,8 @@ Run `pytest tests/samplers/test_beam_search.py`.
""" """
import pytest import pytest
import os
from ..utils import models_path_prefix
# FIXME(zhuohan): The test can not pass if we: # FIXME(zhuohan): The test can not pass if we:
# 1. Increase max_tokens to 256. # 1. Increase max_tokens to 256.
...@@ -11,7 +13,7 @@ import pytest ...@@ -11,7 +13,7 @@ import pytest
# 3. Use the model "huggyllama/llama-7b". # 3. Use the model "huggyllama/llama-7b".
MAX_TOKENS = [64] MAX_TOKENS = [64]
BEAM_WIDTHS = [4] BEAM_WIDTHS = [4]
MODELS = ["TinyLlama/TinyLlama-1.1B-Chat-v1.0"] MODELS = [os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0")]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
......
...@@ -4,12 +4,14 @@ Run `pytest tests/samplers/test_ignore_eos.py`. ...@@ -4,12 +4,14 @@ Run `pytest tests/samplers/test_ignore_eos.py`.
""" """
import pytest import pytest
import os
from vllm import SamplingParams from vllm import SamplingParams
from ..utils import models_path_prefix
# We also test with llama because it has generation_config to specify EOS # We also test with llama because it has generation_config to specify EOS
# (past regression). # (past regression).
MODELS = ["facebook/opt-125m", "meta-llama/Llama-2-7b-hf"] MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m"), os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf")]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
......
import pytest import pytest
import torch import torch
import os
from vllm import SamplingParams from vllm import SamplingParams
from ..utils import models_path_prefix
MODELS = ["facebook/opt-125m"] MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m")]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
......
...@@ -2,12 +2,14 @@ from typing import List ...@@ -2,12 +2,14 @@ from typing import List
import pytest import pytest
import torch import torch
import os
from vllm import SamplingParams from vllm import SamplingParams
from ..conftest import VllmRunner from ..conftest import VllmRunner
from ..utils import models_path_prefix
MODELS = ["facebook/opt-125m"] MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m")]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
...@@ -130,7 +132,7 @@ def test_get_prompt_logprobs( ...@@ -130,7 +132,7 @@ def test_get_prompt_logprobs(
def test_max_logprobs(): def test_max_logprobs():
runner = VllmRunner("facebook/opt-125m", max_logprobs=1) runner = VllmRunner(os.path.join(models_path_prefix, "facebook/opt-125m"), max_logprobs=1)
vllm_sampling_params = SamplingParams(logprobs=1) vllm_sampling_params = SamplingParams(logprobs=1)
# should pass # should pass
runner.generate(["Hello world"], sampling_params=vllm_sampling_params) runner.generate(["Hello world"], sampling_params=vllm_sampling_params)
......
import pytest import pytest
import os
from vllm import SamplingParams from vllm import SamplingParams
from ..utils import models_path_prefix
MODELS = ["facebook/opt-125m"] MODELS = [os.path.join(models_path_prefix, "facebook/opt-125m")]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
......
...@@ -7,11 +7,13 @@ import random ...@@ -7,11 +7,13 @@ import random
from itertools import combinations from itertools import combinations
import pytest import pytest
import os
from vllm import SamplingParams from vllm import SamplingParams
from vllm.model_executor.utils import set_random_seed from vllm.model_executor.utils import set_random_seed
from ..utils import models_path_prefix
MODEL = "facebook/opt-125m" MODEL = os.path.join(models_path_prefix, "facebook/opt-125m")
RANDOM_SEEDS = list(range(5)) RANDOM_SEEDS = list(range(5))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment