Commit 2216a4e5 authored by zhuwenwen
Browse files

Merge remote-tracking branch 'mirror/main'

parents ad385667 51c24c97
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for multimodal embedding.
For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
from argparse import Namespace
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
from PIL.Image import Image
from vllm import LLM from vllm import LLM
from vllm.assets.image import ImageAsset from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
prompt = "<|image_1|> Represent the given image with the following question: What is in the image" # noqa: E501
class TextQuery(TypedDict):
# Create an LLM. modality: Literal["text"]
llm = LLM( text: str
model="TIGER-Lab/VLM2Vec-Full",
trust_remote_code=True,
max_model_len=4096, class ImageQuery(TypedDict):
max_num_seqs=2, modality: Literal["image"]
mm_processor_kwargs={"num_crops": 16}, image: Image
)
# Generate embedding. The output is a list of EmbeddingRequestOutputs. class TextImageQuery(TypedDict):
outputs = llm.encode({"prompt": prompt, "multi_modal_data": {"image": image}}) modality: Literal["text+image"]
text: str
# Print the outputs. image: Image
for output in outputs:
print(output.outputs.embedding) # list of 3072 floats
# The modality tags a query may carry; also used (via `get_args`) as the
# allowed values of the `--modality` command-line option.
QueryModality = Literal["text", "image", "text+image"]
# A query is one of the three TypedDicts, discriminated by its "modality" key.
Query = Union[TextQuery, ImageQuery, TextImageQuery]
class ModelRequestData(NamedTuple):
    """Bundle returned by each ``run_*`` helper for a single encode request."""

    # Engine instance that will be asked to encode the prompt.
    llm: LLM
    # Fully formatted prompt text for the chosen model.
    prompt: str
    # Image to attach to the prompt, or None when the query has no image.
    image: Optional[Image]
def run_e5_v(query: Query):
    """Prepare the engine, prompt and image for the ``royokong/e5-v`` model.

    Args:
        query: Query dict whose ``"modality"`` is ``"text"`` or ``"image"``.

    Returns:
        A ``ModelRequestData`` holding the LLM, the formatted prompt and the
        image (``None`` for a text query).

    Raises:
        ValueError: If the query modality is neither ``"text"`` nor
            ``"image"``.
    """
    # Llama-3 style chat wrapper; the instruction is substituted for `{}`.
    chat_wrapper = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501

    if query["modality"] == "text":
        sentence = query["text"]
        instruction = f"{sentence}\nSummary above sentence in one word: "
        image = None
    elif query["modality"] == "image":
        instruction = "<image>\nSummary above image in one word: "
        image = query["image"]
    else:
        unsupported = query['modality']
        raise ValueError(f"Unsupported query modality: '{unsupported}'")

    prompt = chat_wrapper.format(instruction)

    # The engine is only constructed once the query has been validated.
    return ModelRequestData(
        llm=LLM(
            model="royokong/e5-v",
            task="embedding",
            max_model_len=4096,
        ),
        prompt=prompt,
        image=image,
    )
def run_vlm2vec(query: Query):
    """Prepare the engine, prompt and image for ``TIGER-Lab/VLM2Vec-Full``.

    Args:
        query: Query dict whose ``"modality"`` is ``"text"``, ``"image"`` or
            ``"text+image"``.

    Returns:
        A ``ModelRequestData`` holding the LLM, the prompt and the image
        (``None`` for a text query).

    Raises:
        ValueError: If the query modality is not one of the three above.
    """
    if query["modality"] == "text":
        caption = query["text"]
        prompt = f"Find me an everyday image that matches the given caption: {caption}"  # noqa: E501
        image = None
    elif query["modality"] == "image":
        prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image."  # noqa: E501
        image = query["image"]
    elif query["modality"] == "text+image":
        question = query["text"]
        prompt = f"<|image_1|> Represent the given image with the following question: {question}"  # noqa: E501
        image = query["image"]
    else:
        unsupported = query['modality']
        raise ValueError(f"Unsupported query modality: '{unsupported}'")

    # The engine is only constructed once the query has been validated.
    return ModelRequestData(
        llm=LLM(
            model="TIGER-Lab/VLM2Vec-Full",
            task="embedding",
            trust_remote_code=True,
            mm_processor_kwargs={"num_crops": 4},
        ),
        prompt=prompt,
        image=image,
    )
def get_query(modality: QueryModality):
    """Return a canned example query for the requested modality.

    Args:
        modality: One of ``"text"``, ``"image"`` or ``"text+image"``.

    Returns:
        A ``TextQuery``, ``ImageQuery`` or ``TextImageQuery`` matching
        ``modality``. Image queries fetch their picture via ``fetch_image``.

    Raises:
        ValueError: If ``modality`` is not one of the supported values.
    """
    if modality == "text":
        return TextQuery(modality="text", text="A dog sitting in the grass")

    if modality == "image":
        dog_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg"  # noqa: E501
        return ImageQuery(modality="image", image=fetch_image(dog_url))

    if modality == "text+image":
        cat_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg"  # noqa: E501
        return TextImageQuery(
            modality="text+image",
            text="A cat standing in the snow.",
            image=fetch_image(cat_url),
        )

    raise ValueError(f"Modality {modality} is not supported.")
def run_encode(model: str, modality: QueryModality):
    """Encode one example query with the chosen model and print embeddings.

    Args:
        model: Key into ``model_example_map`` selecting the request builder.
        modality: Modality of the example query to build.
    """
    # Build the query first, then look up and invoke the model's builder.
    query = get_query(modality)
    request = model_example_map[model](query)

    # Only attach an "image" entry when the request actually carries one.
    mm_data = {} if request.image is None else {"image": request.image}

    encode_input = {
        "prompt": request.prompt,
        "multi_modal_data": mm_data,
    }
    for result in request.llm.encode(encode_input):
        print(result.outputs.embedding)
def main(args: Namespace):
    """Run the embedding demo using the parsed command-line arguments.

    Args:
        args: Namespace providing ``model_name`` and ``modality``.
    """
    run_encode(model=args.model_name, modality=args.modality)
# Maps each `--model-name` choice to the function that builds its
# ModelRequestData; `run_encode` looks the builder up here by name.
model_example_map = {
"e5_v": run_e5_v,
"vlm2vec": run_vlm2vec,
}
if __name__ == "__main__":
    # Command-line entry point: choose the model and the query modality.
    parser = FlexibleArgumentParser(
        description=("Demo on using vLLM for offline inference with "
                     "vision language models for multimodal embedding"))
    parser.add_argument(
        "--model-name",
        "-m",
        type=str,
        default="vlm2vec",
        choices=model_example_map.keys(),
        help="The name of the embedding model.",
    )
    parser.add_argument(
        "--modality",
        type=str,
        default="image",
        choices=get_args(QueryModality),
        help="Modality of the input.",
    )

    args = parser.parse_args()
    main(args)
""" """
This example shows how to use vLLM for running offline inference with This example shows how to use vLLM for running offline inference with
multi-image input on vision language models, using the chat template defined multi-image input on vision language models for text generation,
by the model. using the chat template defined by the model.
""" """
from argparse import Namespace from argparse import Namespace
from typing import List, NamedTuple, Optional from typing import List, NamedTuple, Optional
...@@ -334,7 +334,8 @@ def main(args: Namespace): ...@@ -334,7 +334,8 @@ def main(args: Namespace):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with ' description='Demo on using vLLM for offline inference with '
'vision language models that support multi-image input') 'vision language models that support multi-image input for text '
'generation')
parser.add_argument('--model-type', parser.add_argument('--model-type',
'-m', '-m',
type=str, type=str,
......
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory
# NOTE: This is just a running example. For benchmarking purpose, # NOTE: This is just a running example. For benchmarking purpose,
# please see benchmarks/benchmark_prefix_caching.py # please see benchmarks/benchmark_prefix_caching.py
...@@ -28,12 +29,9 @@ generating_prompts = [prefix + prompt for prompt in prompts] ...@@ -28,12 +29,9 @@ generating_prompts = [prefix + prompt for prompt in prompts]
# Create a sampling params object. # Create a sampling params object.
sampling_params = SamplingParams(temperature=0.0) sampling_params = SamplingParams(temperature=0.0)
# Create an LLM. # Create an LLM without prefix caching as a baseline.
regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4) regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
prefix_cached_llm = LLM(model="facebook/opt-125m",
enable_prefix_caching=True,
gpu_memory_utilization=0.4)
print("Results without `enable_prefix_caching`") print("Results without `enable_prefix_caching`")
# Generate texts from the prompts. The output is a list of RequestOutput objects # Generate texts from the prompts. The output is a list of RequestOutput objects
...@@ -50,6 +48,15 @@ for output in outputs: ...@@ -50,6 +48,15 @@ for output in outputs:
print("-" * 80) print("-" * 80)
# Destroy the LLM object and free up the GPU memory.
del regular_llm
cleanup_dist_env_and_memory()
# Create an LLM with prefix caching enabled.
prefix_cached_llm = LLM(model="facebook/opt-125m",
enable_prefix_caching=True,
gpu_memory_utilization=0.4)
# Warmup so that the shared prompt's KV cache is computed. # Warmup so that the shared prompt's KV cache is computed.
prefix_cached_llm.generate(generating_prompts[0], sampling_params) prefix_cached_llm.generate(generating_prompts[0], sampling_params)
......
...@@ -7,8 +7,8 @@ Launch the vLLM server with the following command: ...@@ -7,8 +7,8 @@ Launch the vLLM server with the following command:
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
(multi-image inference with Phi-3.5-vision-instruct) (multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
--trust-remote-code --limit-mm-per-prompt image=2 --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
(audio inference with Ultravox) (audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096 vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096
......
...@@ -21,6 +21,20 @@ builtin cd "$(dirname "${BASH_SOURCE:-$0}")" ...@@ -21,6 +21,20 @@ builtin cd "$(dirname "${BASH_SOURCE:-$0}")"
ROOT="$(git rev-parse --show-toplevel)" ROOT="$(git rev-parse --show-toplevel)"
builtin cd "$ROOT" || exit 1 builtin cd "$ROOT" || exit 1
check_command() {
if ! command -v "$1" &> /dev/null; then
echo "❓❓$1 is not installed, please run \`pip install -r requirements-lint.txt\`"
exit 1
fi
}
check_command yapf
check_command ruff
check_command mypy
check_command codespell
check_command isort
check_command clang-format
YAPF_VERSION=$(yapf --version | awk '{print $2}') YAPF_VERSION=$(yapf --version | awk '{print $2}')
RUFF_VERSION=$(ruff --version | awk '{print $2}') RUFF_VERSION=$(ruff --version | awk '{print $2}')
MYPY_VERSION=$(mypy --version | awk '{print $2}') MYPY_VERSION=$(mypy --version | awk '{print $2}')
...@@ -31,7 +45,7 @@ CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') ...@@ -31,7 +45,7 @@ CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}')
# # params: tool name, tool version, required version # # params: tool name, tool version, required version
tool_version_check() { tool_version_check() {
if [[ $2 != $3 ]]; then if [[ $2 != $3 ]]; then
echo "Wrong $1 version installed: $3 is required, not $2." echo "❓❓Wrong $1 version installed: $3 is required, not $2."
exit 1 exit 1
fi fi
} }
...@@ -281,10 +295,12 @@ tools/actionlint.sh -color ...@@ -281,10 +295,12 @@ tools/actionlint.sh -color
echo 'vLLM actionlint: Done' echo 'vLLM actionlint: Done'
if ! git diff --quiet &>/dev/null; then if ! git diff --quiet &>/dev/null; then
echo 'Reformatted files. Please review and stage the changes.' echo
echo 'Changes not staged for commit:' echo "🔍🔍There are files changed by the format checker or by you that are not added and committed:"
echo
git --no-pager diff --name-only git --no-pager diff --name-only
echo "🔍🔍Format checker passed, but please add, commit and push all the files above to include changes made by the format checker."
exit 1 exit 1
else
echo "✨🎉 Format check passed! Congratulations! 🎉✨"
fi fi
...@@ -39,7 +39,6 @@ assert cwd != package_path, "should not import from the current directory" ...@@ -39,7 +39,6 @@ assert cwd != package_path, "should not import from the current directory"
files_to_copy = [ files_to_copy = [
"vllm/_C.abi3.so", "vllm/_C.abi3.so",
"vllm/_core_C.abi3.so",
"vllm/_moe_C.abi3.so", "vllm/_moe_C.abi3.so",
"vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so", "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so",
"vllm/vllm_flash_attn/flash_attn_interface.py", "vllm/vllm_flash_attn/flash_attn_interface.py",
......
...@@ -31,4 +31,4 @@ pyyaml ...@@ -31,4 +31,4 @@ pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL. einops # Required for Qwen2-VL.
compressed-tensors == 0.6.0 # required for compressed-tensors compressed-tensors == 0.7.1 # required for compressed-tensors
...@@ -164,6 +164,14 @@ class cmake_build_ext(build_ext): ...@@ -164,6 +164,14 @@ class cmake_build_ext(build_ext):
# on subsequent calls to python. # on subsequent calls to python.
cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))] cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))]
# Override the base directory for FetchContent downloads to $ROOT/.deps
# This allows sharing dependencies between profiles,
# and plays more nicely with sccache.
# To override this, set the FETCHCONTENT_BASE_DIR environment variable.
fc_base_dir = os.path.join(ROOT_DIR, ".deps")
fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir)
cmake_args += ['-DFETCHCONTENT_BASE_DIR={}'.format(fc_base_dir)]
# #
# Setup parallelism and build tool # Setup parallelism and build tool
# #
...@@ -297,10 +305,6 @@ def _build_custom_ops() -> bool: ...@@ -297,10 +305,6 @@ def _build_custom_ops() -> bool:
return _is_cuda() or _is_hip() or _is_cpu() return _is_cuda() or _is_hip() or _is_cpu()
def _build_core_ext() -> bool:
return not (_is_neuron() or _is_tpu() or _is_openvino() or _is_xpu())
def get_hipcc_rocm_version(): def get_hipcc_rocm_version():
# Run the hipcc --version command # Run the hipcc --version command
result = subprocess.run(['hipcc', '--version'], result = subprocess.run(['hipcc', '--version'],
...@@ -530,9 +534,6 @@ def get_requirements() -> List[str]: ...@@ -530,9 +534,6 @@ def get_requirements() -> List[str]:
ext_modules = [] ext_modules = []
if _build_core_ext():
ext_modules.append(CMakeExtension(name="vllm._core_C"))
if _is_cuda() or _is_hip(): if _is_cuda() or _is_hip():
ext_modules.append(CMakeExtension(name="vllm._moe_C")) ext_modules.append(CMakeExtension(name="vllm._moe_C"))
......
...@@ -12,11 +12,11 @@ import torch ...@@ -12,11 +12,11 @@ import torch
from vllm import SamplingParams from vllm import SamplingParams
from vllm.config import ParallelConfig from vllm.config import ParallelConfig
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
from vllm.outputs import RequestOutput as RealRequestOutput from vllm.outputs import RequestOutput as RealRequestOutput
from vllm.sampling_params import RequestOutputKind from vllm.sampling_params import RequestOutputKind
from ..conftest import cleanup
from ..utils import wait_for_gpu_memory_to_clear from ..utils import wait_for_gpu_memory_to_clear
...@@ -157,7 +157,7 @@ async def async_engine(): ...@@ -157,7 +157,7 @@ async def async_engine():
engine.shutdown_background_loop() engine.shutdown_background_loop()
del engine del engine
await asyncio.sleep(0.1) await asyncio.sleep(0.1)
cleanup() cleanup_dist_env_and_memory()
@pytest.fixture() @pytest.fixture()
......
...@@ -19,7 +19,7 @@ from ..utils import multi_gpu_test ...@@ -19,7 +19,7 @@ from ..utils import multi_gpu_test
MODELS = [ MODELS = [
"facebook/opt-125m", "facebook/opt-125m",
"meta-llama/Llama-2-7b-hf", "meta-llama/Llama-3.2-1B",
] ]
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4") TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
......
...@@ -16,7 +16,7 @@ from ..utils import multi_gpu_test ...@@ -16,7 +16,7 @@ from ..utils import multi_gpu_test
MODELS = [ MODELS = [
"facebook/opt-125m", "facebook/opt-125m",
"meta-llama/Llama-2-7b-hf", "meta-llama/Llama-3.2-1B",
] ]
......
...@@ -2,5 +2,5 @@ from ..utils import compare_two_settings ...@@ -2,5 +2,5 @@ from ..utils import compare_two_settings
def test_cpu_offload(): def test_cpu_offload():
compare_two_settings("meta-llama/Llama-2-7b-hf", [], compare_two_settings("meta-llama/Llama-3.2-1B", [],
["--cpu-offload-gb", "4"]) ["--cpu-offload-gb", "1"])
...@@ -13,8 +13,7 @@ from ..utils import compare_all_settings ...@@ -13,8 +13,7 @@ from ..utils import compare_all_settings
@pytest.mark.parametrize( @pytest.mark.parametrize(
"model, model_args, pp_size, tp_size, attn_backend, method, fullgraph", "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph",
[ [
("meta-llama/Meta-Llama-3-8B", [], 2, 2, "FLASH_ATTN", "generate", ("meta-llama/Llama-3.2-1B", [], 2, 2, "FLASH_ATTN", "generate", True),
True),
("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples",
["--quantization", "compressed-tensors" ["--quantization", "compressed-tensors"
], 1, 1, "FLASH_ATTN", "generate", True), ], 1, 1, "FLASH_ATTN", "generate", True),
......
...@@ -69,11 +69,11 @@ def check_full_graph_support(model, ...@@ -69,11 +69,11 @@ def check_full_graph_support(model,
os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level) os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level)
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1" os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
# Inductor doesn't support fp8/gptq_marlin_24 yet. # Inductor doesn't support fp8 and the base meta llama uses too
# much memory.
quantization = model_kwargs.get("quantization") quantization = model_kwargs.get("quantization")
if (quantization == "fp8" or quantization == "gptq_marlin" if ((quantization == "fp8" or model == "meta-llama/Meta-Llama-3-8B")
or quantization == "gptq_marlin_24" and optimization_level >= CompilationLevel.INDUCTOR):
) and optimization_level >= CompilationLevel.INDUCTOR:
return return
prompts = [ prompts = [
......
import contextlib
import gc
import json import json
import os import os
import sys import sys
...@@ -25,19 +23,19 @@ from tests.models.utils import (TokensTextLogprobs, ...@@ -25,19 +23,19 @@ from tests.models.utils import (TokensTextLogprobs,
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset from vllm.assets.video import VideoAsset
from vllm.config import TokenizerPoolConfig from vllm.config import TaskOption, TokenizerPoolConfig
from vllm.connections import global_http_connection from vllm.connections import global_http_connection
from vllm.distributed import (destroy_distributed_environment, from vllm.distributed import (cleanup_dist_env_and_memory,
destroy_model_parallel,
init_distributed_environment, init_distributed_environment,
initialize_model_parallel) initialize_model_parallel)
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
to_enc_dec_tuple_list, zip_enc_dec_prompts) to_enc_dec_tuple_list, zip_enc_dec_prompts)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.platforms import current_platform
from vllm.sampling_params import BeamSearchParams from vllm.sampling_params import BeamSearchParams
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless, from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
identity, is_cpu) identity)
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -45,10 +43,12 @@ _TEST_DIR = os.path.dirname(__file__) ...@@ -45,10 +43,12 @@ _TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]] _M = TypeVar("_M")
PromptAudioInput = Union[List[Tuple[np.ndarray, int]], _PromptMultiModalInput = Union[List[_M], List[List[_M]]]
List[List[Tuple[np.ndarray, int]]]]
PromptVideoInput = Union[List[np.ndarray], List[List[np.ndarray]]] PromptImageInput = _PromptMultiModalInput[Image.Image]
PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
PromptVideoInput = _PromptMultiModalInput[np.ndarray]
def _read_prompts(filename: str) -> List[str]: def _read_prompts(filename: str) -> List[str]:
...@@ -140,17 +140,7 @@ def dist_init(): ...@@ -140,17 +140,7 @@ def dist_init():
) )
initialize_model_parallel(1, 1) initialize_model_parallel(1, 1)
yield yield
cleanup() cleanup_dist_env_and_memory()
def cleanup():
destroy_model_parallel()
destroy_distributed_environment()
with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group()
gc.collect()
if not is_cpu():
torch.cuda.empty_cache()
@pytest.fixture() @pytest.fixture()
...@@ -167,7 +157,7 @@ def should_do_global_cleanup_after_test(request) -> bool: ...@@ -167,7 +157,7 @@ def should_do_global_cleanup_after_test(request) -> bool:
def cleanup_fixture(should_do_global_cleanup_after_test: bool): def cleanup_fixture(should_do_global_cleanup_after_test: bool):
yield yield
if should_do_global_cleanup_after_test: if should_do_global_cleanup_after_test:
cleanup() cleanup_dist_env_and_memory()
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
...@@ -249,7 +239,8 @@ class HfRunner: ...@@ -249,7 +239,8 @@ class HfRunner:
def wrap_device(self, input: _T, device: Optional[str] = None) -> _T: def wrap_device(self, input: _T, device: Optional[str] = None) -> _T:
if device is None: if device is None:
return self.wrap_device(input, "cpu" if is_cpu() else "cuda") return self.wrap_device(
input, "cpu" if current_platform.is_cpu() else "cuda")
if hasattr(input, "device") and input.device.type == device: if hasattr(input, "device") and input.device.type == device:
return input return input
...@@ -329,12 +320,12 @@ class HfRunner: ...@@ -329,12 +320,12 @@ class HfRunner:
"text": prompt, "text": prompt,
"return_tensors": "pt", "return_tensors": "pt",
} }
if images is not None and images[i] is not None: if images is not None and (image := images[i]) is not None:
processor_kwargs["images"] = images[i] processor_kwargs["images"] = image
if videos is not None and videos[i] is not None: if videos is not None and (video := videos[i]) is not None:
processor_kwargs["videos"] = videos[i] processor_kwargs["videos"] = video
if audios is not None and audios[i] is not None: if audios is not None and (audio_tuple := audios[i]) is not None:
audio, sr = audios[i] audio, sr = audio_tuple
processor_kwargs["audio"] = audio processor_kwargs["audio"] = audio
processor_kwargs["sampling_rate"] = sr processor_kwargs["sampling_rate"] = sr
...@@ -349,7 +340,7 @@ class HfRunner: ...@@ -349,7 +340,7 @@ class HfRunner:
self, self,
prompts: List[str], prompts: List[str],
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
videos: Optional[List[np.ndarray]] = None, videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Tuple[List[List[int]], List[str]]]: ) -> List[Tuple[List[List[int]], List[str]]]:
...@@ -379,7 +370,7 @@ class HfRunner: ...@@ -379,7 +370,7 @@ class HfRunner:
prompts: List[str], prompts: List[str],
max_tokens: int, max_tokens: int,
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
videos: Optional[List[np.ndarray]] = None, videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
**kwargs: Any, **kwargs: Any,
) -> List[Tuple[List[int], str]]: ) -> List[Tuple[List[int], str]]:
...@@ -420,7 +411,7 @@ class HfRunner: ...@@ -420,7 +411,7 @@ class HfRunner:
prompts: List[str], prompts: List[str],
max_tokens: int, max_tokens: int,
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
videos: Optional[List[np.ndarray]] = None, videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
**kwargs: Any, **kwargs: Any,
) -> List[List[torch.Tensor]]: ) -> List[List[torch.Tensor]]:
...@@ -499,7 +490,7 @@ class HfRunner: ...@@ -499,7 +490,7 @@ class HfRunner:
num_logprobs: int, num_logprobs: int,
images: Optional[PromptImageInput] = None, images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None, audios: Optional[PromptAudioInput] = None,
videos: Optional[List[np.ndarray]] = None, videos: Optional[PromptVideoInput] = None,
**kwargs: Any, **kwargs: Any,
) -> List[TokensTextLogprobs]: ) -> List[TokensTextLogprobs]:
all_inputs = self.get_inputs(prompts, all_inputs = self.get_inputs(prompts,
...@@ -606,7 +597,7 @@ class HfRunner: ...@@ -606,7 +597,7 @@ class HfRunner:
def __exit__(self, exc_type, exc_value, traceback): def __exit__(self, exc_type, exc_value, traceback):
del self.model del self.model
cleanup() cleanup_dist_env_and_memory()
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
...@@ -619,6 +610,7 @@ class VllmRunner: ...@@ -619,6 +610,7 @@ class VllmRunner:
def __init__( def __init__(
self, self,
model_name: str, model_name: str,
task: TaskOption = "auto",
tokenizer_name: Optional[str] = None, tokenizer_name: Optional[str] = None,
# Use smaller max model length, otherwise bigger model cannot run due # Use smaller max model length, otherwise bigger model cannot run due
# to kv cache size limit. # to kv cache size limit.
...@@ -634,6 +626,7 @@ class VllmRunner: ...@@ -634,6 +626,7 @@ class VllmRunner:
) -> None: ) -> None:
self.model = LLM( self.model = LLM(
model=model_name, model=model_name,
task=task,
tokenizer=tokenizer_name, tokenizer=tokenizer_name,
trust_remote_code=True, trust_remote_code=True,
dtype=dtype, dtype=dtype,
...@@ -666,15 +659,18 @@ class VllmRunner: ...@@ -666,15 +659,18 @@ class VllmRunner:
inputs = [TextPrompt(prompt=prompt) for prompt in prompts] inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
if images is not None: if images is not None:
for i, image in enumerate(images): for i, image in enumerate(images):
inputs[i]["multi_modal_data"] = {"image": image} if image is not None:
inputs[i]["multi_modal_data"] = {"image": image}
if videos is not None: if videos is not None:
for i, video in enumerate(videos): for i, video in enumerate(videos):
inputs[i]["multi_modal_data"] = {"video": video} if video is not None:
inputs[i]["multi_modal_data"] = {"video": video}
if audios is not None: if audios is not None:
for i, audio in enumerate(audios): for i, audio in enumerate(audios):
inputs[i]["multi_modal_data"] = {"audio": audio} if audio is not None:
inputs[i]["multi_modal_data"] = {"audio": audio}
return inputs return inputs
...@@ -846,20 +842,27 @@ class VllmRunner: ...@@ -846,20 +842,27 @@ class VllmRunner:
returned_outputs.append((token_ids, texts)) returned_outputs.append((token_ids, texts))
return returned_outputs return returned_outputs
def encode(self, prompts: List[str]) -> List[List[float]]: def encode(
req_outputs = self.model.encode(prompts) self,
outputs = [] prompts: List[str],
for req_output in req_outputs: images: Optional[PromptImageInput] = None,
embedding = req_output.outputs.embedding videos: Optional[PromptVideoInput] = None,
outputs.append(embedding) audios: Optional[PromptAudioInput] = None,
return outputs ) -> List[List[float]]:
inputs = self.get_inputs(prompts,
images=images,
videos=videos,
audios=audios)
req_outputs = self.model.encode(inputs)
return [req_output.outputs.embedding for req_output in req_outputs]
def __enter__(self): def __enter__(self):
return self return self
def __exit__(self, exc_type, exc_value, traceback): def __exit__(self, exc_type, exc_value, traceback):
del self.model del self.model
cleanup() cleanup_dist_env_and_memory()
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
......
...@@ -3,10 +3,9 @@ from typing import Callable, Iterable, Optional ...@@ -3,10 +3,9 @@ from typing import Callable, Iterable, Optional
import pytest import pytest
from vllm import LLM from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.model_executor.utils import set_random_seed from vllm.model_executor.utils import set_random_seed
from ....conftest import cleanup
@pytest.fixture @pytest.fixture
def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
...@@ -37,7 +36,7 @@ def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs, ...@@ -37,7 +36,7 @@ def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
yield llm yield llm
del llm del llm
cleanup() cleanup_dist_env_and_memory()
for llm in generator_inner(): for llm in generator_inner():
yield llm yield llm
......
...@@ -33,7 +33,8 @@ def test_simple(): ...@@ -33,7 +33,8 @@ def test_simple():
num_seq_group = 4 num_seq_group = 4
max_model_len = 16 max_model_len = 16
max_num_batched_tokens = 64 max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(max_num_batched_tokens, scheduler_config = SchedulerConfig("generate",
max_num_batched_tokens,
num_seq_group, num_seq_group,
max_model_len, max_model_len,
enable_chunked_prefill=True) enable_chunked_prefill=True)
...@@ -78,6 +79,7 @@ def test_chunk(): ...@@ -78,6 +79,7 @@ def test_chunk():
max_model_len = 80 max_model_len = 80
max_num_batched_tokens = 64 max_num_batched_tokens = 64
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens, max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
...@@ -126,6 +128,7 @@ def test_complex(): ...@@ -126,6 +128,7 @@ def test_complex():
max_model_len = 80 max_model_len = 80
max_num_batched_tokens = 64 max_num_batched_tokens = 64
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens, max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
...@@ -196,6 +199,7 @@ def test_maximal_decoding(): ...@@ -196,6 +199,7 @@ def test_maximal_decoding():
max_model_len = 8 max_model_len = 8
max_num_batched_tokens = 2 max_num_batched_tokens = 2
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens, max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
...@@ -289,6 +293,7 @@ def test_prompt_limit(): ...@@ -289,6 +293,7 @@ def test_prompt_limit():
max_model_len = 64 max_model_len = 64
max_num_batched_tokens = 32 max_num_batched_tokens = 32
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens, max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
...@@ -321,7 +326,8 @@ def test_prompt_limit_exceed(): ...@@ -321,7 +326,8 @@ def test_prompt_limit_exceed():
max_seqs = 64 max_seqs = 64
max_model_len = 32 max_model_len = 32
max_num_batched_tokens = 64 max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(max_num_batched_tokens, scheduler_config = SchedulerConfig("generate",
max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
enable_chunked_prefill=True) enable_chunked_prefill=True)
...@@ -348,6 +354,7 @@ def test_swap(): ...@@ -348,6 +354,7 @@ def test_swap():
max_model_len = 200 max_model_len = 200
max_num_batched_tokens = 30 max_num_batched_tokens = 30
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens, max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
...@@ -404,6 +411,7 @@ def test_running_prefill_prioritized_over_swap(): ...@@ -404,6 +411,7 @@ def test_running_prefill_prioritized_over_swap():
max_model_len = 200 max_model_len = 200
max_num_batched_tokens = 30 max_num_batched_tokens = 30
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens, max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
...@@ -498,6 +506,7 @@ def test_chunked_prefill_preempt(): ...@@ -498,6 +506,7 @@ def test_chunked_prefill_preempt():
max_model_len = 200 max_model_len = 200
max_num_batched_tokens = 30 max_num_batched_tokens = 30
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens, max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
...@@ -563,6 +572,7 @@ def test_chunked_prefill_max_seqs(): ...@@ -563,6 +572,7 @@ def test_chunked_prefill_max_seqs():
max_model_len = 80 max_model_len = 80
max_num_batched_tokens = 64 max_num_batched_tokens = 64
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens, max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
...@@ -617,6 +627,7 @@ def test_perfix_caching(): ...@@ -617,6 +627,7 @@ def test_perfix_caching():
max_model_len = 80 max_model_len = 80
max_num_batched_tokens = 64 max_num_batched_tokens = 64
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens, max_num_batched_tokens,
max_seqs, max_seqs,
max_model_len, max_model_len,
......
...@@ -20,9 +20,10 @@ from .utils import (append_new_token, append_new_token_seq_group, ...@@ -20,9 +20,10 @@ from .utils import (append_new_token, append_new_token_seq_group,
def test_scheduler_add_seq_group(): def test_scheduler_add_seq_group():
block_size = 4 block_size = 4
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
100, "generate",
64, max_num_batched_tokens=100,
1, max_num_seqs=64,
max_model_len=1,
) )
cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto") cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
cache_config.num_cpu_blocks = 4 cache_config.num_cpu_blocks = 4
...@@ -42,9 +43,10 @@ def test_scheduler_add_seq_group(): ...@@ -42,9 +43,10 @@ def test_scheduler_add_seq_group():
def test_scheduler_abort_seq_group(): def test_scheduler_abort_seq_group():
block_size = 4 block_size = 4
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
100, "generate",
64, max_num_batched_tokens=100,
1, max_num_seqs=64,
max_model_len=1,
) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 4 cache_config.num_cpu_blocks = 4
...@@ -70,9 +72,10 @@ def test_scheduler_schedule_simple(): ...@@ -70,9 +72,10 @@ def test_scheduler_schedule_simple():
num_seq_group = 4 num_seq_group = 4
max_model_len = 16 max_model_len = 16
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
64, "generate",
num_seq_group, max_num_batched_tokens=64,
max_model_len, max_num_seqs=num_seq_group,
max_model_len=max_model_len,
) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 8
...@@ -114,9 +117,10 @@ def test_scheduler_prefill_prioritized(): ...@@ -114,9 +117,10 @@ def test_scheduler_prefill_prioritized():
max_model_len = 30 max_model_len = 30
max_batched_num_tokens = 30 max_batched_num_tokens = 30
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
max_batched_num_tokens, "generate",
2, max_num_batched_tokens=max_batched_num_tokens,
max_model_len, max_num_seqs=2,
max_model_len=max_model_len,
) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16 cache_config.num_cpu_blocks = 16
...@@ -145,9 +149,10 @@ def test_scheduler_schedule_preempt_abort(): ...@@ -145,9 +149,10 @@ def test_scheduler_schedule_preempt_abort():
block_size = 4 block_size = 4
max_model_len = 16 max_model_len = 16
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
64, "generate",
2, max_num_batched_tokens=64,
max_model_len, max_num_seqs=2,
max_model_len=max_model_len,
) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 2 cache_config.num_cpu_blocks = 2
...@@ -204,9 +209,10 @@ def test_scheduler_max_seqs(): ...@@ -204,9 +209,10 @@ def test_scheduler_max_seqs():
max_seq_group = 2 max_seq_group = 2
max_model_len = 16 max_model_len = 16
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
64, "generate",
max_seq_group, max_num_batched_tokens=64,
max_model_len, max_num_seqs=max_seq_group,
max_model_len=max_model_len,
) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8 cache_config.num_cpu_blocks = 8
...@@ -248,9 +254,10 @@ def test_scheduler_max_seqs(): ...@@ -248,9 +254,10 @@ def test_scheduler_max_seqs():
def test_scheduler_delay_factor(): def test_scheduler_delay_factor():
block_size = 4 block_size = 4
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
100, "generate",
64, max_num_batched_tokens=100,
16, max_num_seqs=64,
max_model_len=16,
delay_factor=0.5, delay_factor=0.5,
) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
...@@ -350,9 +357,10 @@ def initialize_scheduler( ...@@ -350,9 +357,10 @@ def initialize_scheduler(
): ):
block_size = block_size block_size = block_size
scheduler_config = SchedulerConfig( scheduler_config = SchedulerConfig(
max_token_budget, "generate",
max_num_seqs, max_num_batched_tokens=max_token_budget,
max_model_len, max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
) )
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = num_cpu_blocks cache_config.num_cpu_blocks = num_cpu_blocks
......
...@@ -36,7 +36,12 @@ def test_scheduler_schedule_simple_encoder_decoder(): ...@@ -36,7 +36,12 @@ def test_scheduler_schedule_simple_encoder_decoder():
block_size = 4 block_size = 4
num_seq_group = 4 num_seq_group = 4
max_model_len = 16 max_model_len = 16
scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len) scheduler_config = SchedulerConfig(
task="generate",
max_num_batched_tokens=64,
max_num_seqs=num_seq_group,
max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto") cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16 # enc and dec prompts per seq_group cache_config.num_cpu_blocks = 16 # enc and dec prompts per seq_group
cache_config.num_gpu_blocks = 16 # enc and dec prompts per seq_group cache_config.num_gpu_blocks = 16 # enc and dec prompts per seq_group
......
...@@ -11,6 +11,7 @@ from typing import List, Literal, NamedTuple, Optional ...@@ -11,6 +11,7 @@ from typing import List, Literal, NamedTuple, Optional
import pytest import pytest
from vllm.config import TaskOption
from vllm.logger import init_logger from vllm.logger import init_logger
from ..utils import compare_two_settings, fork_new_process_for_each_test from ..utils import compare_two_settings, fork_new_process_for_each_test
...@@ -27,18 +28,26 @@ class ParallelSetup(NamedTuple): ...@@ -27,18 +28,26 @@ class ParallelSetup(NamedTuple):
chunked_prefill: bool chunked_prefill: bool
class PPTestOptions(NamedTuple):
    """Per-model switches that travel with each parametrized test case.

    The field order is significant: the test helper unpacks this tuple
    positionally as ``multi_node_only, trust_remote_code, tokenizer_mode``,
    so new fields must be appended and the unpacking updated together.
    """

    # When true, the test is skipped unless the multi-node flag is set.
    multi_node_only: bool
    # When true, ``--trust-remote-code`` is added to the common CLI args.
    trust_remote_code: bool
    # Optional tokenizer mode; only acted on when truthy.
    # NOTE(review): presumably forwarded as a tokenizer-mode CLI argument;
    # the consuming lines are outside the visible hunk, so confirm there.
    tokenizer_mode: Optional[str]
@dataclass @dataclass
class PPTestSettings: class PPTestSettings:
parallel_setups: List[ParallelSetup] parallel_setups: List[ParallelSetup]
distributed_backends: List[str] distributed_backends: List[str]
trust_remote_code: bool task: TaskOption
tokenizer_mode: Optional[str] test_options: PPTestOptions
@staticmethod @staticmethod
def detailed( def detailed(
*, *,
tp_base: int = 1, tp_base: int = 1,
pp_base: int = 2, pp_base: int = 2,
multi_node_only: bool = False,
task: TaskOption = "auto",
trust_remote_code: bool = False, trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None, tokenizer_mode: Optional[str] = None,
): ):
...@@ -66,8 +75,10 @@ class PPTestSettings: ...@@ -66,8 +75,10 @@ class PPTestSettings:
chunked_prefill=False), chunked_prefill=False),
], ],
distributed_backends=["mp", "ray"], distributed_backends=["mp", "ray"],
trust_remote_code=trust_remote_code, task=task,
tokenizer_mode=tokenizer_mode, test_options=PPTestOptions(multi_node_only=multi_node_only,
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode),
) )
@staticmethod @staticmethod
...@@ -75,6 +86,8 @@ class PPTestSettings: ...@@ -75,6 +86,8 @@ class PPTestSettings:
*, *,
tp_base: int = 1, tp_base: int = 1,
pp_base: int = 2, pp_base: int = 2,
task: TaskOption = "auto",
multi_node_only: bool = False,
trust_remote_code: bool = False, trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None, tokenizer_mode: Optional[str] = None,
): ):
...@@ -86,15 +99,19 @@ class PPTestSettings: ...@@ -86,15 +99,19 @@ class PPTestSettings:
chunked_prefill=False), chunked_prefill=False),
], ],
distributed_backends=["mp"], distributed_backends=["mp"],
trust_remote_code=trust_remote_code, task=task,
tokenizer_mode=tokenizer_mode, test_options=PPTestOptions(multi_node_only=multi_node_only,
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode),
) )
def iter_params(self, model_name: str): def iter_params(self, model_name: str):
opts = self.test_options
for parallel_setup in self.parallel_setups: for parallel_setup in self.parallel_setups:
for distributed_backend in self.distributed_backends: for distributed_backend in self.distributed_backends:
yield (model_name, parallel_setup, distributed_backend, yield (model_name, parallel_setup, distributed_backend,
self.trust_remote_code, self.tokenizer_mode) self.task, opts)
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU # NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
...@@ -104,6 +121,7 @@ class PPTestSettings: ...@@ -104,6 +121,7 @@ class PPTestSettings:
GENERATION_MODEL_SETTINGS = { GENERATION_MODEL_SETTINGS = {
# [DETAILED TESTS] # [DETAILED TESTS]
"meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(), "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
"microsoft/Phi-3-mini-4k-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True), # noqa: E501
# [FAST TESTS] # [FAST TESTS]
# Uses Llama # Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(), # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
...@@ -145,10 +163,8 @@ GENERATION_MODEL_SETTINGS = { ...@@ -145,10 +163,8 @@ GENERATION_MODEL_SETTINGS = {
"facebook/opt-iml-max-1.3b": PPTestSettings.fast(), "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
"OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True), "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
"microsoft/phi-2": PPTestSettings.fast(), "microsoft/phi-2": PPTestSettings.fast(),
"microsoft/Phi-3-mini-4k-instruct": PPTestSettings.fast(),
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
# FIXME: https://github.com/vllm-project/vllm/issues/8553 "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
# "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"adept/persimmon-8b-chat": PPTestSettings.fast(), "adept/persimmon-8b-chat": PPTestSettings.fast(),
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True), "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
"Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(), "Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(),
...@@ -199,6 +215,7 @@ TEST_MODELS = [ ...@@ -199,6 +215,7 @@ TEST_MODELS = [
# [LANGUAGE GENERATION] # [LANGUAGE GENERATION]
"meta-llama/Meta-Llama-3-8B", "meta-llama/Meta-Llama-3-8B",
"ibm/PowerLM-3b", "ibm/PowerLM-3b",
"microsoft/Phi-3-mini-4k-instruct",
# [LANGUAGE EMBEDDING] # [LANGUAGE EMBEDDING]
"intfloat/e5-mistral-7b-instruct", "intfloat/e5-mistral-7b-instruct",
"BAAI/bge-multilingual-gemma2", "BAAI/bge-multilingual-gemma2",
...@@ -213,19 +230,22 @@ def _compare_tp( ...@@ -213,19 +230,22 @@ def _compare_tp(
model_name: str, model_name: str,
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
trust_remote_code: bool, task: TaskOption,
tokenizer_mode: Optional[str], test_options: PPTestOptions,
num_gpus_available: int, num_gpus_available: int,
*, *,
method: Literal["generate", "encode"] = "encode", method: Literal["generate", "encode"],
): ):
tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup
multi_node_only, trust_remote_code, tokenizer_mode = test_options
if num_gpus_available < tp_size * pp_size: if num_gpus_available < tp_size * pp_size:
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs") pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
if VLLM_MULTI_NODE and distributed_backend == "mp": if VLLM_MULTI_NODE and distributed_backend == "mp":
pytest.skip("Skipping multi-node pipeline parallel test for " pytest.skip("Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend") "multiprocessing distributed backend")
if multi_node_only and not VLLM_MULTI_NODE:
pytest.skip("Not in multi-node setting")
common_args = [ common_args = [
# use half precision for speed and memory savings in CI environment # use half precision for speed and memory savings in CI environment
...@@ -240,6 +260,8 @@ def _compare_tp( ...@@ -240,6 +260,8 @@ def _compare_tp(
common_args.append("--enable-chunked-prefill") common_args.append("--enable-chunked-prefill")
if eager_mode: if eager_mode:
common_args.append("--enforce-eager") common_args.append("--enforce-eager")
if task != "auto":
common_args.extend(["--task", task])
if trust_remote_code: if trust_remote_code:
common_args.append("--trust-remote-code") common_args.append("--trust-remote-code")
if tokenizer_mode: if tokenizer_mode:
...@@ -297,8 +319,8 @@ def _compare_tp( ...@@ -297,8 +319,8 @@ def _compare_tp(
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend", ("model_name", "parallel_setup", "distributed_backend", "task",
"trust_remote_code", "tokenizer_mode"), "test_options"),
[ [
params for model_name, settings in GENERATION_MODEL_SETTINGS.items() params for model_name, settings in GENERATION_MODEL_SETTINGS.items()
for params in settings.iter_params(model_name) for params in settings.iter_params(model_name)
...@@ -310,22 +332,22 @@ def test_tp_language_generation( ...@@ -310,22 +332,22 @@ def test_tp_language_generation(
model_name: str, model_name: str,
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
trust_remote_code: bool, task: TaskOption,
tokenizer_mode: Optional[str], test_options: PPTestOptions,
num_gpus_available, num_gpus_available,
): ):
_compare_tp(model_name, _compare_tp(model_name,
parallel_setup, parallel_setup,
distributed_backend, distributed_backend,
trust_remote_code, task,
tokenizer_mode, test_options,
num_gpus_available, num_gpus_available,
method="generate") method="generate")
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend", ("model_name", "parallel_setup", "distributed_backend", "task",
"trust_remote_code", "tokenizer_mode"), "test_options"),
[ [
params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items() params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items()
for params in settings.iter_params(model_name) for params in settings.iter_params(model_name)
...@@ -337,22 +359,22 @@ def test_tp_language_embedding( ...@@ -337,22 +359,22 @@ def test_tp_language_embedding(
model_name: str, model_name: str,
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
trust_remote_code: bool, task: TaskOption,
tokenizer_mode: Optional[str], test_options: PPTestOptions,
num_gpus_available, num_gpus_available,
): ):
_compare_tp(model_name, _compare_tp(model_name,
parallel_setup, parallel_setup,
distributed_backend, distributed_backend,
trust_remote_code, task,
tokenizer_mode, test_options,
num_gpus_available, num_gpus_available,
method="encode") method="encode")
@pytest.mark.parametrize( @pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend", ("model_name", "parallel_setup", "distributed_backend", "task",
"trust_remote_code", "tokenizer_mode"), "test_options"),
[ [
params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items() params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items()
for params in settings.iter_params(model_name) for params in settings.iter_params(model_name)
...@@ -364,14 +386,14 @@ def test_tp_multimodal_generation( ...@@ -364,14 +386,14 @@ def test_tp_multimodal_generation(
model_name: str, model_name: str,
parallel_setup: ParallelSetup, parallel_setup: ParallelSetup,
distributed_backend: str, distributed_backend: str,
trust_remote_code: bool, task: TaskOption,
tokenizer_mode: Optional[str], test_options: PPTestOptions,
num_gpus_available, num_gpus_available,
): ):
_compare_tp(model_name, _compare_tp(model_name,
parallel_setup, parallel_setup,
distributed_backend, distributed_backend,
trust_remote_code, task,
tokenizer_mode, test_options,
num_gpus_available, num_gpus_available,
method="generate") method="generate")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment