Commit 3c9817d2 authored by zhuwenwen
Browse files

add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from local path...

Add VLLM_OPTEST_MODELS_PATH/OPTEST_MODELS_PATH to load models from a local path instead of the Hugging Face Hub
parent 49204f68
...@@ -2,15 +2,16 @@ from typing import List, Optional, Tuple, Type ...@@ -2,15 +2,16 @@ from typing import List, Optional, Tuple, Type
import numpy as np import numpy as np
import pytest import pytest
import os
from transformers import AutoModel, AutoTokenizer, BatchEncoding from transformers import AutoModel, AutoTokenizer, BatchEncoding
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from ....conftest import HfRunner, VllmRunner from ....conftest import HfRunner, VllmRunner
from ...utils import check_logprobs_close from ...utils import check_logprobs_close, models_path_prefix
MODEL_NAME = "fixie-ai/ultravox-v0_3" MODEL_NAME = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_3")
AudioTuple = Tuple[np.ndarray, int] AudioTuple = Tuple[np.ndarray, int]
......
...@@ -4,8 +4,10 @@ Run `pytest tests/models/test_aqlm.py`. ...@@ -4,8 +4,10 @@ Run `pytest tests/models/test_aqlm.py`.
""" """
import pytest import pytest
import os
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from ...utils import models_path_prefix
# These ground truth generations were generated using `transformers==4.38.1 # These ground truth generations were generated using `transformers==4.38.1
# aqlm==1.1.0 torch==2.2.0` # aqlm==1.1.0 torch==2.2.0`
...@@ -40,7 +42,7 @@ ground_truth_generations = [ ...@@ -40,7 +42,7 @@ ground_truth_generations = [
@pytest.mark.skipif(not is_quant_method_supported("aqlm"), @pytest.mark.skipif(not is_quant_method_supported("aqlm"),
reason="AQLM is not supported on this GPU type.") reason="AQLM is not supported on this GPU type.")
@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf")])
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [16]) @pytest.mark.parametrize("max_tokens", [16])
@pytest.mark.parametrize("num_logprobs", [1]) @pytest.mark.parametrize("num_logprobs", [1])
......
...@@ -5,24 +5,26 @@ This tests bigger models and use half precision. ...@@ -5,24 +5,26 @@ This tests bigger models and use half precision.
Run `pytest tests/models/test_big_models.py`. Run `pytest tests/models/test_big_models.py`.
""" """
import pytest import pytest
import os
from vllm.platforms import current_platform from vllm.platforms import current_platform
from ...utils import check_outputs_equal from ...utils import check_outputs_equal
from ....utils import models_path_prefix
MODELS = [ MODELS = [
"meta-llama/Llama-2-7b-hf", os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf"),
# "mistralai/Mistral-7B-v0.1", # Tested by test_mistral.py # "mistralai/Mistral-7B-v0.1", # Tested by test_mistral.py
# "Deci/DeciLM-7b", # Broken # "Deci/DeciLM-7b", # Broken
# "tiiuae/falcon-7b", # Broken # "tiiuae/falcon-7b", # Broken
"EleutherAI/gpt-j-6b", os.path.join(models_path_prefix, "EleutherAI/gpt-j-6b"),
# "mosaicml/mpt-7b", # Broken # "mosaicml/mpt-7b", # Broken
# "Qwen/Qwen1.5-0.5B" # Broken, # "Qwen/Qwen1.5-0.5B" # Broken,
] ]
if not current_platform.is_cpu(): if not current_platform.is_cpu():
# MiniCPM requires fused_moe which is not supported by CPU # MiniCPM requires fused_moe which is not supported by CPU
MODELS.append("openbmb/MiniCPM3-4B") MODELS.append(os.path.join(models_path_prefix, "openbmb/MiniCPM3-4B"))
#TODO: remove this after CPU float16 support ready #TODO: remove this after CPU float16 support ready
target_dtype = "float" if current_platform.is_cpu() else "half" target_dtype = "float" if current_platform.is_cpu() else "half"
......
...@@ -5,10 +5,12 @@ This tests danube3 separately because its head size isn't supported on CPU yet. ...@@ -5,10 +5,12 @@ This tests danube3 separately because its head size isn't supported on CPU yet.
Run `pytest tests/models/test_danube3_4b.py`. Run `pytest tests/models/test_danube3_4b.py`.
""" """
import pytest import pytest
import os
from ...utils import check_outputs_equal from ...utils import check_outputs_equal
from ....utils import models_path_prefix
MODELS = ["h2oai/h2o-danube3-4b-base"] MODELS = [os.path.join(models_path_prefix, "h2oai/h2o-danube3-4b-base")]
target_dtype = "half" target_dtype = "half"
......
...@@ -11,6 +11,7 @@ from tests.kernels.utils import override_backend_env_variable ...@@ -11,6 +11,7 @@ from tests.kernels.utils import override_backend_env_variable
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
from ....utils import models_path_prefix
os.environ["TOKENIZERS_PARALLELISM"] = "true" os.environ["TOKENIZERS_PARALLELISM"] = "true"
...@@ -21,14 +22,14 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true" ...@@ -21,14 +22,14 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true"
"kv_cache_dtype,base_model,test_model,scale_path", "kv_cache_dtype,base_model,test_model,scale_path",
[ [
# Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors. # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct", ("fp8_e4m3", os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
"nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None), os.path.join(models_path_prefix, "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV"), None),
# Test FP16 checkpoint w. fp8_e5m2 kv-cache. # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct", ("fp8_e5m2", os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"),
"meta-llama/Meta-Llama-3-8B-Instruct", None), os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct"), None),
# Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json. # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf", ("fp8_e4m3", os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
"meta-llama/Llama-2-7b-chat-hf", os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-chat-hf"),
"./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json") "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
]) ])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens # Due to low-precision numerical divergence, we only test logprob of 4 tokens
......
...@@ -12,6 +12,7 @@ from transformers import AutoTokenizer ...@@ -12,6 +12,7 @@ from transformers import AutoTokenizer
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
from ....utils import models_path_prefix
os.environ["TOKENIZERS_PARALLELISM"] = "true" os.environ["TOKENIZERS_PARALLELISM"] = "true"
...@@ -19,16 +20,16 @@ MAX_MODEL_LEN = 1024 ...@@ -19,16 +20,16 @@ MAX_MODEL_LEN = 1024
# FIXME: Move this to conftest # FIXME: Move this to conftest
MODELS = [ MODELS = [
("TinyLlama/TinyLlama-1.1B-Chat-v1.0", (os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0"),
hf_hub_download("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", hf_hub_download("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")), filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")),
("TinyLlama/TinyLlama-1.1B-Chat-v1.0", (os.path.join(models_path_prefix, "TinyLlama/TinyLlama-1.1B-Chat-v1.0"),
hf_hub_download("duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF", hf_hub_download("duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF",
filename="TinyLlama-1.1B-Chat-v1.0-IQ4_XS.gguf")), filename="TinyLlama-1.1B-Chat-v1.0-IQ4_XS.gguf")),
("Qwen/Qwen2-1.5B-Instruct", (os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"),
hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF", hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF",
filename="qwen2-1_5b-instruct-q4_k_m.gguf")), filename="qwen2-1_5b-instruct-q4_k_m.gguf")),
("Qwen/Qwen2-1.5B-Instruct", (os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct"),
hf_hub_download("legraphista/Qwen2-1.5B-Instruct-IMat-GGUF", hf_hub_download("legraphista/Qwen2-1.5B-Instruct-IMat-GGUF",
filename="Qwen2-1.5B-Instruct.IQ4_XS.gguf")), filename="Qwen2-1.5B-Instruct.IQ4_XS.gguf")),
] ]
......
...@@ -16,6 +16,7 @@ from tests.quantization.utils import is_quant_method_supported ...@@ -16,6 +16,7 @@ from tests.quantization.utils import is_quant_method_supported
from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
from ....utils import models_path_prefix
os.environ["TOKENIZERS_PARALLELISM"] = "true" os.environ["TOKENIZERS_PARALLELISM"] = "true"
...@@ -23,26 +24,26 @@ MAX_MODEL_LEN = 1024 ...@@ -23,26 +24,26 @@ MAX_MODEL_LEN = 1024
MODELS = [ MODELS = [
# act_order==False, group_size=channelwise # act_order==False, group_size=channelwise
("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"), (os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq"), "main"),
# act_order==False, group_size=128 # act_order==False, group_size=128
("TheBloke/Llama-2-7B-GPTQ", "main"), ("TheBloke/Llama-2-7B-GPTQ", "main"),
# act_order==True, group_size=128 # act_order==True, group_size=128
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"), (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "main"),
# act_order==True, group_size=64 # act_order==True, group_size=64
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"), (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-4bit-64g-actorder_True"),
# act_order==True, group_size=32 # act_order==True, group_size=32
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"), (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-4bit-32g-actorder_True"),
# 8-bit, act_order==True, group_size=channelwise # 8-bit, act_order==True, group_size=channelwise
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"), (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-8bit--1g-actorder_True"),
# 8-bit, act_order==True, group_size=128 # 8-bit, act_order==True, group_size=128
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"), (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-8bit-128g-actorder_True"),
# 8-bit, act_order==True, group_size=32 # 8-bit, act_order==True, group_size=32
("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"), (os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"), "gptq-8bit-32g-actorder_True"),
# 4-bit, act_order==True, group_size=128 # 4-bit, act_order==True, group_size=128
("TechxGenus/gemma-1.1-2b-it-GPTQ", "main") (os.path.join(models_path_prefix, "TechxGenus/gemma-1.1-2b-it-GPTQ"), "main")
] ]
......
...@@ -9,10 +9,12 @@ Run `pytest tests/models/test_marlin_24.py`. ...@@ -9,10 +9,12 @@ Run `pytest tests/models/test_marlin_24.py`.
from dataclasses import dataclass from dataclasses import dataclass
import pytest import pytest
import os
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
from ....utils import models_path_prefix
@dataclass @dataclass
...@@ -23,18 +25,18 @@ class ModelPair: ...@@ -23,18 +25,18 @@ class ModelPair:
model_pairs = [ model_pairs = [
# 4-bit, group_size == 128 # 4-bit, group_size == 128
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128", ModelPair(model_marlin=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-4bit-g128"),
model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"), model_gptq=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-gptq-4bit-g128")),
# 4-bit, group_size == channelwise # 4-bit, group_size == channelwise
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise", ModelPair(model_marlin=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-4bit-channelwise"),
model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"), model_gptq=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-gptq-4bit-channelwise")),
# 8-bit, group_size == 128 # 8-bit, group_size == 128
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128", ModelPair(model_marlin=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-8bit-g128"),
model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"), model_gptq=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-gptq-8bit-g128")),
# 8-bit, group_size == channelwise # 8-bit, group_size == channelwise
ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise", ModelPair(model_marlin=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-marlin24-8bit-channelwise"),
model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"), model_gptq=os.path.join(models_path_prefix, "alexm-nm/tinyllama-24-gptq-8bit-channelwise")),
] ]
......
...@@ -3,12 +3,14 @@ ...@@ -3,12 +3,14 @@
Run `pytest tests/models/test_granite.py`. Run `pytest tests/models/test_granite.py`.
""" """
import pytest import pytest
import os
import transformers import transformers
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
from ....utils import models_path_prefix
MODELS = [ MODELS = [
"ibm/PowerLM-3b", os.path.join(models_path_prefix, "ibm/PowerLM-3b"),
] ]
......
import pytest import pytest
import os
from vllm.worker.model_runner import _get_graph_batch_size from vllm.worker.model_runner import _get_graph_batch_size
from ...utils import check_outputs_equal from ...utils import check_outputs_equal
from ....utils import models_path_prefix
MODELS = ["ai21labs/Jamba-tiny-random"] MODELS = [os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-random")]
# Fails due to usage of MoE as MLP(E=1_, which is different than the HF impl # Fails due to usage of MoE as MLP(E=1_, which is different than the HF impl
......
...@@ -13,10 +13,12 @@ Run `pytest tests/models/test_marlin.py`. ...@@ -13,10 +13,12 @@ Run `pytest tests/models/test_marlin.py`.
from dataclasses import dataclass from dataclasses import dataclass
import pytest import pytest
import os
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
from ....utils import models_path_prefix
@dataclass @dataclass
...@@ -26,12 +28,12 @@ class ModelPair: ...@@ -26,12 +28,12 @@ class ModelPair:
model_pairs = [ model_pairs = [
ModelPair(model_marlin="nm-testing/zephyr-beta-7b-marlin-g128", ModelPair(model_marlin=os.path.join(models_path_prefix, "nm-testing/zephyr-beta-7b-marlin-g128"),
model_gptq="nm-testing/zephyr-beta-7b-gptq-g128"), model_gptq=os.path.join(models_path_prefix, "nm-testing/zephyr-beta-7b-gptq-g128")),
ModelPair(model_marlin="robertgshaw2/zephyr-7b-beta-channelwise-marlin", ModelPair(model_marlin=os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-marlin"),
model_gptq="robertgshaw2/zephyr-7b-beta-channelwise-gptq"), model_gptq=os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq")),
ModelPair(model_marlin="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin", ModelPair(model_marlin=os.path.join(models_path_prefix, "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-marlin"),
model_gptq="robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq") model_gptq=os.path.join(models_path_prefix, "robertgshaw2/TinyLlama-1.1B-Chat-v1.0-g128-gptq"))
] ]
......
...@@ -3,14 +3,16 @@ ...@@ -3,14 +3,16 @@
Run `pytest tests/models/test_mistral.py`. Run `pytest tests/models/test_mistral.py`.
""" """
import pytest import pytest
import os
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
from ....utils import models_path_prefix
MODELS = [ MODELS = [
"mistralai/Mistral-7B-Instruct-v0.1", os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.1"),
"mistralai/Mistral-7B-Instruct-v0.3", os.path.join(models_path_prefix, "mistralai/Mistral-7B-Instruct-v0.3"),
# Mistral-Nemo is too big for CI, but passes locally # Mistral-Nemo is too big for CI, but passes locally
# "mistralai/Mistral-Nemo-Instruct-2407" # "mistralai/Mistral-Nemo-Instruct-2407"
] ]
......
...@@ -10,12 +10,13 @@ from transformers import AutoTokenizer ...@@ -10,12 +10,13 @@ from transformers import AutoTokenizer
from tests.quantization.utils import is_quant_method_supported from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from ....utils import models_path_prefix
os.environ["TOKENIZERS_PARALLELISM"] = "true" os.environ["TOKENIZERS_PARALLELISM"] = "true"
MAX_MODEL_LEN = 1024 MAX_MODEL_LEN = 1024
MODELS = ["nvidia/Llama-3.1-8B-Instruct-FP8"] MODELS = [os.path.join(models_path_prefix, "nvidia/Llama-3.1-8B-Instruct-FP8")]
EXPECTED_STRS_MAP = { EXPECTED_STRS_MAP = {
"nvidia/Llama-3.1-8B-Instruct-FP8": [ "nvidia/Llama-3.1-8B-Instruct-FP8": [
......
...@@ -6,20 +6,21 @@ test_big_models.py because it could use a larger instance to run tests. ...@@ -6,20 +6,21 @@ test_big_models.py because it could use a larger instance to run tests.
Run `pytest tests/models/test_models.py`. Run `pytest tests/models/test_models.py`.
""" """
import pytest import pytest
import os
from ...utils import check_outputs_equal from ...utils import check_outputs_equal, models_path_prefix
MODELS = [ MODELS = [
"facebook/opt-125m", os.path.join(models_path_prefix, "facebook/opt-125m"),
"gpt2", os.path.join(models_path_prefix, "gpt2"),
"bigcode/tiny_starcoder_py", os.path.join(models_path_prefix, "bigcode/tiny_starcoder_py"),
"EleutherAI/pythia-70m", os.path.join(models_path_prefix, "EleutherAI/pythia-70m"),
"bigscience/bloom-560m", # Testing alibi slopes. os.path.join(models_path_prefix, "bigscience/bloom-560m"), # Testing alibi slopes.
"microsoft/phi-2", os.path.join(models_path_prefix, "microsoft/phi-2"),
"stabilityai/stablelm-3b-4e1t", os.path.join(models_path_prefix, "stabilityai/stablelm-3b-4e1t"),
# "allenai/OLMo-1B", # Broken # "allenai/OLMo-1B", # Broken
"bigcode/starcoder2-3b", os.path.join(models_path_prefix, "bigcode/starcoder2-3b"),
"google/gemma-1.1-2b-it", os.path.join(models_path_prefix, "google/gemma-1.1-2b-it"),
] ]
......
...@@ -4,13 +4,15 @@ Run `pytest tests/models/test_phimoe.py`. ...@@ -4,13 +4,15 @@ Run `pytest tests/models/test_phimoe.py`.
""" """
import pytest import pytest
import torch import torch
import os
from vllm.utils import is_cpu from vllm.utils import is_cpu
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
from ....utils import models_path_prefix
MODELS = [ MODELS = [
"microsoft/Phi-3.5-MoE-instruct", os.path.join(models_path_prefix, "microsoft/Phi-3.5-MoE-instruct"),
] ]
......
from typing import List, Optional, Tuple from typing import List, Optional, Tuple
import pytest import pytest
import os
from transformers import AutoModelForVision2Seq, AutoTokenizer from transformers import AutoModelForVision2Seq, AutoTokenizer
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
...@@ -8,6 +9,7 @@ from vllm.sequence import SampleLogprobs ...@@ -8,6 +9,7 @@ from vllm.sequence import SampleLogprobs
from ....conftest import IMAGE_ASSETS from ....conftest import IMAGE_ASSETS
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
from ....utils import models_path_prefix
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign": "stop_sign":
...@@ -33,7 +35,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, ...@@ -33,7 +35,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
return hf_output_ids, hf_output_str, out_logprobs return hf_output_ids, hf_output_str, out_logprobs
@pytest.mark.parametrize("model", ["Salesforce/blip2-opt-2.7b"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b")])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"size_factors", "size_factors",
[ [
......
import pytest import pytest
import os
from ....utils import multi_gpu_test from ....utils import multi_gpu_test
from ....utils import models_path_prefix
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", [ @pytest.mark.parametrize("model", [
"llava-hf/llava-1.5-7b-hf", os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
"llava-hf/llava-v1.6-mistral-7b-hf", os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf"),
"facebook/chameleon-7b", os.path.join(models_path_prefix, "facebook/chameleon-7b"),
]) ])
def test_models(hf_runner, vllm_runner, image_assets, def test_models(hf_runner, vllm_runner, image_assets,
distributed_executor_backend, model) -> None: distributed_executor_backend, model) -> None:
......
from typing import List, Optional, Type from typing import List, Optional, Type
import pytest import pytest
import os
from transformers import AutoModelForVision2Seq, BatchEncoding from transformers import AutoModelForVision2Seq, BatchEncoding
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
...@@ -8,6 +9,7 @@ from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE ...@@ -8,6 +9,7 @@ from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from ...utils import check_outputs_equal from ...utils import check_outputs_equal
from ....utils import models_path_prefix
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign": "stop_sign":
...@@ -16,7 +18,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ ...@@ -16,7 +18,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"USER: <image>\nWhat is the season?\nASSISTANT:", "USER: <image>\nWhat is the season?\nASSISTANT:",
}) })
models = ["facebook/chameleon-7b"] models = [os.path.join(models_path_prefix, "facebook/chameleon-7b")]
def run_test( def run_test(
......
from typing import List, Optional, Tuple, Type from typing import List, Optional, Tuple, Type
import pytest import pytest
import os
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
...@@ -8,6 +9,7 @@ from vllm.utils import is_cpu ...@@ -8,6 +9,7 @@ from vllm.utils import is_cpu
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
from ....utils import models_path_prefix
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign": "stop_sign":
...@@ -16,7 +18,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ ...@@ -16,7 +18,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"What is the season?\n", "What is the season?\n",
}) })
models = ["adept/fuyu-8b"] models = [os.path.join(models_path_prefix, "adept/fuyu-8b")]
def vllm_to_hf_output(vllm_output: Tuple[List[int], str, def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
......
from typing import List, Optional, Tuple, Type from typing import List, Optional, Tuple, Type
import pytest import pytest
import os
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
from vllm.transformers_utils.tokenizer import patch_padding_side from vllm.transformers_utils.tokenizer import patch_padding_side
...@@ -8,6 +9,7 @@ from vllm.transformers_utils.tokenizer import patch_padding_side ...@@ -8,6 +9,7 @@ from vllm.transformers_utils.tokenizer import patch_padding_side
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test from ....utils import large_gpu_test
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
from ....utils import models_path_prefix
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign": "stop_sign":
...@@ -16,7 +18,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ ...@@ -16,7 +18,7 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"What is the season?", "What is the season?",
}) })
models = ["THUDM/glm-4v-9b"] models = [os.path.join(models_path_prefix, "THUDM/glm-4v-9b")]
target_dtype = "bfloat16" target_dtype = "bfloat16"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment