Commit 3c9817d2 authored by zhuwenwen
Browse files

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path...

Add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from a local path instead of the Hugging Face Hub
parent 49204f68
......@@ -2,15 +2,16 @@ from typing import List, Optional, Tuple, Type
import numpy as np
import pytest
import os
from transformers import AutoModel, AutoTokenizer, BatchEncoding
from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from ....conftest import HfRunner, VllmRunner
from ...utils import check_logprobs_close
from ...utils import check_logprobs_close, models_path_prefix
MODEL_NAME = "fixie-ai/ultravox-v0_3"
MODEL_NAME = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_3")
AudioTuple = Tuple[np.ndarray, int]
......
......@@ -4,8 +4,10 @@ Run `pytest tests/models/test_aqlm.py`.
"""
import pytest
import os
from tests.quantization.utils import is_quant_method_supported
from ...utils import models_path_prefix
# These ground truth generations were generated using `transformers==4.38.1
# aqlm==1.1.0 torch==2.2.0`
......@@ -40,7 +42,7 @@ ground_truth_generations = [
@pytest.mark.skipif(not is_quant_method_supported("aqlm"),
reason="AQLM is not supported on this GPU type.")
@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf")])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("num_logprobs", [1])
......
......@@ -5,24 +5,26 @@ This tests bigger models and use half precision.
Run `pytest tests/models/test_big_models.py`.
"""
import pytest
import os
from vllm.platforms import current_platform
from ...utils import check_outputs_equal
from ....utils import models_path_prefix
MODELS = [
"meta-llama/Llama-2-7b-hf",
os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
# "mistralai/Mistral-7B-v0.1", # Tested by test_mistral.py
# "Deci/DeciLM-7b", # Broken
# "tiiuae/falcon-7b", # Broken
"EleutherAI/gpt-j-6b",
os.path.join(models_path_prefix, "EleutherAI/gpt-j-6b"),
# "mosaicml/mpt-7b", # Broken
# "Qwen/Qwen1.5-0.5B" # Broken,
]
if not current_platform.is_cpu():
# MiniCPM requires fused_moe which is not supported by CPU
MODELS.append("openbmb/MiniCPM3-4B")
MODELS.append(os.path.join(models_path_prefix, "openbmb/MiniCPM3-4B"))
#TODO: remove this after CPU float16 support ready
target_dtype = "float" if current_platform.is_cpu() else "half"
......
......@@ -5,10 +5,12 @@ This tests danube3 separately because its head size isn't supported on CPU yet.
Run `pytest tests/models/test_danube3_4b.py`.
"""
import pytest
import os
from ...utils import check_outputs_equal
from ....utils import models_path_prefix
MODELS = ["h2oai/h2o-danube3-4b-base"]
MODELS = [os.path.join(models_path_prefix, "h2oai/h2o-danube3-4b-base")]
target_dtype = "half"
......
......@@ -11,6 +11,7 @@ from tests.kernels.utils import override_backend_env_variable
from tests.quantization.utils import is_quant_method_supported
from ...utils import check_logprobs_close
from ....utils import models_path_prefix
os.environ["TOKENIZERS_PARALLELISM"] = "true"
......@@ -21,14 +22,14 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
"kv_cache_dtype,base_model,test_model,scale_path",
[
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct",
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None),
("fp8_e4m3", os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV"), None),
# Test FP16 checkpoint w. fp8_e5m2 kv-cache.
("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct",
"meta-llama/Meta-Llama-3-8B-Instruct", None),
("fp8_e5m2", os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"), None),
# Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
"meta-llama/Llama-2-7b-chat-hf",
("fp8_e4m3", os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
"./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
......
......@@ -12,6 +12,7 @@ from transformers import AutoTokenizer
from tests.quantization.utils import is_quant_method_supported
from ...utils import check_logprobs_close
from ....utils import models_path_prefix
os.environ["TOKENIZERS_PARALLELISM"] = "true"
......@@ -19,16 +20,16 @@ MAX_MODEL_LEN = 1024
# FIXME: Move this to conftest
MODELS = [
("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
(os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0"),
hf_hub_download("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")),
("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
(os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0"),
hf_hub_download("duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF",
filename="TinyLlama-1.1B-Chat-v1.0-IQ4_XS.gguf")),
("Qwen/Qwen2-1.5B-Instruct",
(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"),
hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF",
filename="qwen2-1_5b-instruct-q4_k_m.gguf")),
("Qwen/Qwen2-1.5B-Instruct",
(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"),
hf_hub_download("legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
filename="Qwen2-1.5B-Instruct.IQ4_XS.gguf")),
]
......
......@@ -16,6 +16,7 @@ from tests.quantization.utils import is_quant_method_supported
from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
from ...utils import check_logprobs_close
from ....utils import models_path_prefix
os.environ["TOKENIZERS_PARALLELISM"] = "true"
......@@ -23,26 +24,26 @@ MAX_MODEL_LEN = 1024
MODELS = [
# act_order==False, group_size=channelwise
("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
(os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq"), "main"),
# act_order==False, group_size=128
("TheBloke/Llama-2-7B-GPTQ", "main"),
# act_order==True, group_size=128
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
(os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "main"),
# act_order==True, group_size=64
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"),
(os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-4bit-64g-actorder_True"),
# act_order==True, group_size=32
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"),
(os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-4bit-32g-actorder_True"),
# 8-bit, act_order==True, group_size=channelwise
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
(os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-8bit--1g-actorder_True"),
# 8-bit, act_order==True, group_size=128
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"),
(os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-8bit-128g-actorder_True"),
# 8-bit, act_order==True, group_size=32
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"),
(os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-8bit-32g-actorder_True"),
# 4-bit, act_order==True, group_size=128
("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")
(os.path.join(models_path_prefix, "TechxGenus/gemma-1.1-2b-it-GPTQ"), "main")
]
......
......@@ -9,10 +9,12 @@ Run `pytest tests/models/test_marlin_24.py`.
from dataclasses import dataclass
import pytest
import os
from tests.quantization.utils import is_quant_method_supported
from ...utils import check_logprobs_close
from ....utils import models_path_prefix
@dataclass
......@@ -23,18 +25,18 @@ class ModelPair:
model_pairs = [
# 4-bit, group_size == 128
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
ModelPair(model_marlin=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-4bit-g128"),
model_gptq=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-gptq-4bit-g128")),
# 4-bit, group_size == channelwise
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
ModelPair(model_marlin=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-4bit-channelwise"),
model_gptq=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-gptq-4bit-channelwise")),
# 8-bit, group_size == 128
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
ModelPair(model_marlin=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-8bit-g128"),
model_gptq=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-gptq-8bit-g128")),
# 8-bit, group_size == channelwise
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
ModelPair(model_marlin=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-8bit-channelwise"),
model_gptq=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-gptq-8bit-channelwise")),
]
......
......@@ -3,12 +3,14 @@
Run `pytest tests/models/test_granite.py`.
"""
import pytest
import os
import transformers
from ...utils import check_logprobs_close
from ....utils import models_path_prefix
MODELS = [
"ibm/PowerLM-3b",
os.path.join(models_path_prefix, "ibm/PowerLM-3b"),
]
......
import pytest
import os
from vllm.worker.model_runner import _get_graph_batch_size
from ...utils import check_outputs_equal
from ....utils import models_path_prefix
MODELS = ["ai21labs/Jamba-tiny-random"]
MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-random")]
# Fails due to usage of MoE as MLP (E=1), which is different from the HF impl
......
......@@ -13,10 +13,12 @@ Run `pytest tests/models/test_marlin.py`.
from dataclasses import dataclass
import pytest
import os
from tests.quantization.utils import is_quant_method_supported
from ...utils import check_logprobs_close
from ....utils import models_path_prefix
@dataclass
......@@ -26,12 +28,12 @@ class ModelPair:
model_pairs = [
ModelPair(model_marlin="nm-testing/zephyr-beta-7b-marlin-g128",
model_gptq="nm-testing/zephyr-beta-7b-gptq-g128"),
ModelPair(model_marlin="robertgshaw2/zephyr-7b-beta-channelwise-marlin",
model_gptq="robertgshaw2/zephyr-7b-beta-channelwise-gptq"),
ModelPair(model_marlin="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin",
model_gptq="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq")
ModelPair(model_marlin=os.path.join(models_path_prefix, "nm-testing/zephyr-beta-7b-marlin-g128"),
model_gptq=os.path.join(models_path_prefix, "nm-testing/zephyr-beta-7b-gptq-g128")),
ModelPair(model_marlin=os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-marlin"),
model_gptq=os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq")),
ModelPair(model_marlin=os.path.join(models_path_prefix, "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"),
model_gptq=os.path.join(models_path_prefix, "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq"))
]
......
......@@ -3,14 +3,16 @@
Run `pytest tests/models/test_mistral.py`.
"""
import pytest
import os
from vllm import LLM, SamplingParams
from ...utils import check_logprobs_close
from ....utils import models_path_prefix
MODELS = [
"mistralai/Mistral-7B-Instruct-v0.1",
"mistralai/Mistral-7B-Instruct-v0.3",
os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.1"),
os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.3"),
# Mistral-Nemo is too big for CI, but passes locally
# "mistralai/Mistral-Nemo-Instruct-2407"
]
......
......@@ -10,12 +10,13 @@ from transformers import AutoTokenizer
from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from ....utils import models_path_prefix
os.environ["TOKENIZERS_PARALLELISM"] = "true"
MAX_MODEL_LEN = 1024
MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"]
MODELS = [os.path.join(models_path_prefix, "nvidia/Llama-3.1-8B-Instruct-FP8")]
EXPECTED_STRS_MAP = {
"nvidia/Llama-3.1-8B-Instruct-FP8": [
......
......@@ -6,20 +6,21 @@ test_big_models.py because it could use a larger instance to run tests.
Run `pytest tests/models/test_models.py`.
"""
import pytest
import os
from ...utils import check_outputs_equal
from ...utils import check_outputs_equal, models_path_prefix
MODELS = [
"facebook/opt-125m",
"gpt2",
"bigcode/tiny_starcoder_py",
"EleutherAI/pythia-70m",
"bigscience/bloom-560m", # Testing alibi slopes.
"microsoft/phi-2",
"stabilityai/stablelm-3b-4e1t",
os.path.join(models_path_prefix, "facebook/opt-125m"),
os.path.join(models_path_prefix, "gpt2"),
os.path.join(models_path_prefix, "bigcode/tiny_starcoder_py"),
os.path.join(models_path_prefix, "EleutherAI/pythia-70m"),
os.path.join(models_path_prefix, "bigscience/bloom-560m"), # Testing alibi slopes.
os.path.join(models_path_prefix, "microsoft/phi-2"),
os.path.join(models_path_prefix, "stabilityai/stablelm-3b-4e1t"),
# "allenai/OLMo-1B", # Broken
"bigcode/starcoder2-3b",
"google/gemma-1.1-2b-it",
os.path.join(models_path_prefix, "bigcode/starcoder2-3b"),
os.path.join(models_path_prefix, "google/gemma-1.1-2b-it"),
]
......
......@@ -4,13 +4,15 @@ Run `pytest tests/models/test_phimoe.py`.
"""
import pytest
import torch
import os
from vllm.utils import is_cpu
from ...utils import check_logprobs_close
from ....utils import models_path_prefix
MODELS = [
"microsoft/Phi-3.5-MoE-instruct",
os.path.join(models_path_prefix, "microsoft/Phi-3.5-MoE-instruct"),
]
......
from typing import List, Optional, Tuple
import pytest
import os
from transformers import AutoModelForVision2Seq, AutoTokenizer
from vllm.multimodal.utils import rescale_image_size
......@@ -8,6 +9,7 @@ from vllm.sequence import SampleLogprobs
from ....conftest import IMAGE_ASSETS
from ...utils import check_logprobs_close
from ....utils import models_path_prefix
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
......@@ -33,7 +35,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
return hf_output_ids, hf_output_str, out_logprobs
@pytest.mark.parametrize("model", ["Salesforce/blip2-opt-2.7b"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b")])
@pytest.mark.parametrize(
"size_factors",
[
......
import pytest
import os
from ....utils import multi_gpu_test
from ....utils import models_path_prefix
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", [
"llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-v1.6-mistral-7b-hf",
"facebook/chameleon-7b",
os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf"),
os.path.join(models_path_prefix, "facebook/chameleon-7b"),
])
def test_models(hf_runner, vllm_runner, image_assets,
distributed_executor_backend, model) -> None:
......
from typing import List, Optional, Type
import pytest
import os
from transformers import AutoModelForVision2Seq, BatchEncoding
from vllm.multimodal.utils import rescale_image_size
......@@ -8,6 +9,7 @@ from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from ...utils import check_outputs_equal
from ....utils import models_path_prefix
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
......@@ -16,7 +18,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"USER: <image>\nWhat is the season?\nASSISTANT:",
})
models = ["facebook/chameleon-7b"]
models = [os.path.join(models_path_prefix, "facebook/chameleon-7b")]
def run_test(
......
from typing import List, Optional, Tuple, Type
import pytest
import os
from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs
......@@ -8,6 +9,7 @@ from vllm.utils import is_cpu
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from ...utils import check_logprobs_close
from ....utils import models_path_prefix
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
......@@ -16,7 +18,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"What is the season?\n",
})
models = ["adept/fuyu-8b"]
models = [os.path.join(models_path_prefix, "adept/fuyu-8b")]
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
......
from typing import List, Optional, Tuple, Type
import pytest
import os
from vllm.multimodal.utils import rescale_image_size
from vllm.transformers_utils.tokenizer import patch_padding_side
......@@ -8,6 +9,7 @@ from vllm.transformers_utils.tokenizer import patch_padding_side
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
from ....utils import models_path_prefix
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
......@@ -16,7 +18,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"What is the season?",
})
models = ["THUDM/glm-4v-9b"]
models = [os.path.join(models_path_prefix, "THUDM/glm-4v-9b")]
target_dtype = "bfloat16"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment