Commit 2216a4e5 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge remote-tracking branch 'mirror/main'

parents ad385667 51c24c97
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for multimodal embedding.
For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
from argparse import Namespace
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
from PIL.Image import Image
from vllm import LLM
from vllm.assets.image import ImageAsset
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
prompt = "<|image_1|> Represent the given image with the following question: What is in the image" # noqa: E501
# Create an LLM.
llm = LLM(
model="TIGER-Lab/VLM2Vec-Full",
trust_remote_code=True,
max_model_len=4096,
max_num_seqs=2,
mm_processor_kwargs={"num_crops": 16},
)
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = llm.encode({"prompt": prompt, "multi_modal_data": {"image": image}})
# Print the outputs.
for output in outputs:
print(output.outputs.embedding) # list of 3072 floats
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser
class TextQuery(TypedDict):
modality: Literal["text"]
text: str
class ImageQuery(TypedDict):
modality: Literal["image"]
image: Image
class TextImageQuery(TypedDict):
modality: Literal["text+image"]
text: str
image: Image
QueryModality = Literal["text", "image", "text+image"]
Query = Union[TextQuery, ImageQuery, TextImageQuery]
class ModelRequestData(NamedTuple):
llm: LLM
prompt: str
image: Optional[Image]
def run_e5_v(query: Query):
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501
if query["modality"] == "text":
text = query["text"]
prompt = llama3_template.format(
f"{text}\nSummary above sentence in one word: ")
image = None
elif query["modality"] == "image":
prompt = llama3_template.format(
"<image>\nSummary above image in one word: ")
image = query["image"]
else:
modality = query['modality']
raise ValueError(f"Unsupported query modality: '{modality}'")
llm = LLM(
model="royokong/e5-v",
task="embedding",
max_model_len=4096,
)
return ModelRequestData(
llm=llm,
prompt=prompt,
image=image,
)
def run_vlm2vec(query: Query):
if query["modality"] == "text":
text = query["text"]
prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501
image = None
elif query["modality"] == "image":
prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image." # noqa: E501
image = query["image"]
elif query["modality"] == "text+image":
text = query["text"]
prompt = f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501
image = query["image"]
else:
modality = query['modality']
raise ValueError(f"Unsupported query modality: '{modality}'")
llm = LLM(
model="TIGER-Lab/VLM2Vec-Full",
task="embedding",
trust_remote_code=True,
mm_processor_kwargs={"num_crops": 4},
)
return ModelRequestData(
llm=llm,
prompt=prompt,
image=image,
)
def get_query(modality: QueryModality):
if modality == "text":
return TextQuery(modality="text", text="A dog sitting in the grass")
if modality == "image":
return ImageQuery(
modality="image",
image=fetch_image(
"https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg" # noqa: E501
),
)
if modality == "text+image":
return TextImageQuery(
modality="text+image",
text="A cat standing in the snow.",
image=fetch_image(
"https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg" # noqa: E501
),
)
msg = f"Modality {modality} is not supported."
raise ValueError(msg)
def run_encode(model: str, modality: QueryModality):
query = get_query(modality)
req_data = model_example_map[model](query)
mm_data = {}
if req_data.image is not None:
mm_data["image"] = req_data.image
outputs = req_data.llm.encode({
"prompt": req_data.prompt,
"multi_modal_data": mm_data,
})
for output in outputs:
print(output.outputs.embedding)
def main(args: Namespace):
run_encode(args.model_name, args.modality)
model_example_map = {
"e5_v": run_e5_v,
"vlm2vec": run_vlm2vec,
}
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with '
'vision language models for multimodal embedding')
parser.add_argument('--model-name',
'-m',
type=str,
default="vlm2vec",
choices=model_example_map.keys(),
help='The name of the embedding model.')
parser.add_argument('--modality',
type=str,
default="image",
choices=get_args(QueryModality),
help='Modality of the input.')
args = parser.parse_args()
main(args)
"""
This example shows how to use vLLM for running offline inference with
multi-image input on vision language models, using the chat template defined
by the model.
multi-image input on vision language models for text generation,
using the chat template defined by the model.
"""
from argparse import Namespace
from typing import List, NamedTuple, Optional
......@@ -334,7 +334,8 @@ def main(args: Namespace):
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with '
'vision language models that support multi-image input')
'vision language models that support multi-image input for text '
'generation')
parser.add_argument('--model-type',
'-m',
type=str,
......
from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory
# NOTE: This is just a running example. For benchmarking purpose,
# please see benchmarks/benchmark_prefix_caching.py
......@@ -28,12 +29,9 @@ generating_prompts = [prefix + prompt for prompt in prompts]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.0)
# Create an LLM.
# Create an LLM without prefix caching as a baseline.
regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
prefix_cached_llm = LLM(model="facebook/opt-125m",
enable_prefix_caching=True,
gpu_memory_utilization=0.4)
print("Results without `enable_prefix_caching`")
# Generate texts from the prompts. The output is a list of RequestOutput objects
......@@ -50,6 +48,15 @@ for output in outputs:
print("-" * 80)
# Destroy the LLM object and free up the GPU memory.
del regular_llm
cleanup_dist_env_and_memory()
# Create an LLM with prefix caching enabled.
prefix_cached_llm = LLM(model="facebook/opt-125m",
enable_prefix_caching=True,
gpu_memory_utilization=0.4)
# Warmup so that the shared prompt's KV cache is computed.
prefix_cached_llm.generate(generating_prompts[0], sampling_params)
......
......@@ -7,8 +7,8 @@ Launch the vLLM server with the following command:
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
--trust-remote-code --limit-mm-per-prompt image=2
vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096
......
......@@ -21,6 +21,20 @@ builtin cd "$(dirname "${BASH_SOURCE:-$0}")"
ROOT="$(git rev-parse --show-toplevel)"
builtin cd "$ROOT" || exit 1
check_command() {
if ! command -v "$1" &> /dev/null; then
echo "❓❓$1 is not installed, please run \`pip install -r requirements-lint.txt\`"
exit 1
fi
}
check_command yapf
check_command ruff
check_command mypy
check_command codespell
check_command isort
check_command clang-format
YAPF_VERSION=$(yapf --version | awk '{print $2}')
RUFF_VERSION=$(ruff --version | awk '{print $2}')
MYPY_VERSION=$(mypy --version | awk '{print $2}')
......@@ -31,7 +45,7 @@ CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}')
# # params: tool name, tool version, required version
tool_version_check() {
if [[ $2 != $3 ]]; then
echo "Wrong $1 version installed: $3 is required, not $2."
echo "❓❓Wrong $1 version installed: $3 is required, not $2."
exit 1
fi
}
......@@ -281,10 +295,12 @@ tools/actionlint.sh -color
echo 'vLLM actionlint: Done'
if ! git diff --quiet &>/dev/null; then
echo 'Reformatted files. Please review and stage the changes.'
echo 'Changes not staged for commit:'
echo
echo
echo "🔍🔍There are files changed by the format checker or by you that are not added and committed:"
git --no-pager diff --name-only
echo "🔍🔍Format checker passed, but please add, commit and push all the files above to include changes made by the format checker."
exit 1
else
echo "✨🎉 Format check passed! Congratulations! 🎉✨"
fi
......@@ -39,7 +39,6 @@ assert cwd != package_path, "should not import from the current directory"
files_to_copy = [
"vllm/_C.abi3.so",
"vllm/_core_C.abi3.so",
"vllm/_moe_C.abi3.so",
"vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so",
"vllm/vllm_flash_attn/flash_attn_interface.py",
......
......@@ -31,4 +31,4 @@ pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL.
compressed-tensors == 0.6.0 # required for compressed-tensors
compressed-tensors == 0.7.1 # required for compressed-tensors
......@@ -164,6 +164,14 @@ class cmake_build_ext(build_ext):
# on subsequent calls to python.
cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))]
# Override the base directory for FetchContent downloads to $ROOT/.deps
# This allows sharing dependencies between profiles,
# and plays more nicely with sccache.
# To override this, set the FETCHCONTENT_BASE_DIR environment variable.
fc_base_dir = os.path.join(ROOT_DIR, ".deps")
fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir)
cmake_args += ['-DFETCHCONTENT_BASE_DIR={}'.format(fc_base_dir)]
#
# Setup parallelism and build tool
#
......@@ -297,10 +305,6 @@ def _build_custom_ops() -> bool:
return _is_cuda() or _is_hip() or _is_cpu()
def _build_core_ext() -> bool:
return not (_is_neuron() or _is_tpu() or _is_openvino() or _is_xpu())
def get_hipcc_rocm_version():
# Run the hipcc --version command
result = subprocess.run(['hipcc', '--version'],
......@@ -530,9 +534,6 @@ def get_requirements() -> List[str]:
ext_modules = []
if _build_core_ext():
ext_modules.append(CMakeExtension(name="vllm._core_C"))
if _is_cuda() or _is_hip():
ext_modules.append(CMakeExtension(name="vllm._moe_C"))
......
......@@ -12,11 +12,11 @@ import torch
from vllm import SamplingParams
from vllm.config import ParallelConfig
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
from vllm.outputs import RequestOutput as RealRequestOutput
from vllm.sampling_params import RequestOutputKind
from ..conftest import cleanup
from ..utils import wait_for_gpu_memory_to_clear
......@@ -157,7 +157,7 @@ async def async_engine():
engine.shutdown_background_loop()
del engine
await asyncio.sleep(0.1)
cleanup()
cleanup_dist_env_and_memory()
@pytest.fixture()
......
......@@ -19,7 +19,7 @@ from ..utils import multi_gpu_test
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
"meta-llama/Llama-3.2-1B",
]
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
......
......@@ -16,7 +16,7 @@ from ..utils import multi_gpu_test
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
"meta-llama/Llama-3.2-1B",
]
......
......@@ -2,5 +2,5 @@ from ..utils import compare_two_settings
def test_cpu_offload():
compare_two_settings("meta-llama/Llama-2-7b-hf", [],
["--cpu-offload-gb", "4"])
compare_two_settings("meta-llama/Llama-3.2-1B", [],
["--cpu-offload-gb", "1"])
......@@ -13,8 +13,7 @@ from ..utils import compare_all_settings
@pytest.mark.parametrize(
"model, model_args, pp_size, tp_size, attn_backend, method, fullgraph",
[
("meta-llama/Meta-Llama-3-8B", [], 2, 2, "FLASH_ATTN", "generate",
True),
("meta-llama/Llama-3.2-1B", [], 2, 2, "FLASH_ATTN", "generate", True),
("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples",
["--quantization", "compressed-tensors"
], 1, 1, "FLASH_ATTN", "generate", True),
......
......@@ -69,11 +69,11 @@ def check_full_graph_support(model,
os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level)
os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
# Inductor doesn't support fp8/gptq_marlin_24 yet.
# Inductor doesn't support fp8 and the base meta llama uses too
# much memory.
quantization = model_kwargs.get("quantization")
if (quantization == "fp8" or quantization == "gptq_marlin"
or quantization == "gptq_marlin_24"
) and optimization_level >= CompilationLevel.INDUCTOR:
if ((quantization == "fp8" or model == "meta-llama/Meta-Llama-3-8B")
and optimization_level >= CompilationLevel.INDUCTOR):
return
prompts = [
......
import contextlib
import gc
import json
import os
import sys
......@@ -25,19 +23,19 @@ from tests.models.utils import (TokensTextLogprobs,
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import TokenizerPoolConfig
from vllm.config import TaskOption, TokenizerPoolConfig
from vllm.connections import global_http_connection
from vllm.distributed import (destroy_distributed_environment,
destroy_model_parallel,
from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel)
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
to_enc_dec_tuple_list, zip_enc_dec_prompts)
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.platforms import current_platform
from vllm.sampling_params import BeamSearchParams
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
identity, is_cpu)
identity)
logger = init_logger(__name__)
......@@ -45,10 +43,12 @@ _TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]]
PromptAudioInput = Union[List[Tuple[np.ndarray, int]],
List[List[Tuple[np.ndarray, int]]]]
PromptVideoInput = Union[List[np.ndarray], List[List[np.ndarray]]]
_M = TypeVar("_M")
_PromptMultiModalInput = Union[List[_M], List[List[_M]]]
PromptImageInput = _PromptMultiModalInput[Image.Image]
PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
PromptVideoInput = _PromptMultiModalInput[np.ndarray]
def _read_prompts(filename: str) -> List[str]:
......@@ -140,17 +140,7 @@ def dist_init():
)
initialize_model_parallel(1, 1)
yield
cleanup()
def cleanup():
destroy_model_parallel()
destroy_distributed_environment()
with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group()
gc.collect()
if not is_cpu():
torch.cuda.empty_cache()
cleanup_dist_env_and_memory()
@pytest.fixture()
......@@ -167,7 +157,7 @@ def should_do_global_cleanup_after_test(request) -> bool:
def cleanup_fixture(should_do_global_cleanup_after_test: bool):
yield
if should_do_global_cleanup_after_test:
cleanup()
cleanup_dist_env_and_memory()
@pytest.fixture(autouse=True)
......@@ -249,7 +239,8 @@ class HfRunner:
def wrap_device(self, input: _T, device: Optional[str] = None) -> _T:
if device is None:
return self.wrap_device(input, "cpu" if is_cpu() else "cuda")
return self.wrap_device(
input, "cpu" if current_platform.is_cpu() else "cuda")
if hasattr(input, "device") and input.device.type == device:
return input
......@@ -329,12 +320,12 @@ class HfRunner:
"text": prompt,
"return_tensors": "pt",
}
if images is not None and images[i] is not None:
processor_kwargs["images"] = images[i]
if videos is not None and videos[i] is not None:
processor_kwargs["videos"] = videos[i]
if audios is not None and audios[i] is not None:
audio, sr = audios[i]
if images is not None and (image := images[i]) is not None:
processor_kwargs["images"] = image
if videos is not None and (video := videos[i]) is not None:
processor_kwargs["videos"] = video
if audios is not None and (audio_tuple := audios[i]) is not None:
audio, sr = audio_tuple
processor_kwargs["audio"] = audio
processor_kwargs["sampling_rate"] = sr
......@@ -349,7 +340,7 @@ class HfRunner:
self,
prompts: List[str],
images: Optional[PromptImageInput] = None,
videos: Optional[List[np.ndarray]] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
**kwargs: Any,
) -> List[Tuple[List[List[int]], List[str]]]:
......@@ -379,7 +370,7 @@ class HfRunner:
prompts: List[str],
max_tokens: int,
images: Optional[PromptImageInput] = None,
videos: Optional[List[np.ndarray]] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
**kwargs: Any,
) -> List[Tuple[List[int], str]]:
......@@ -420,7 +411,7 @@ class HfRunner:
prompts: List[str],
max_tokens: int,
images: Optional[PromptImageInput] = None,
videos: Optional[List[np.ndarray]] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
**kwargs: Any,
) -> List[List[torch.Tensor]]:
......@@ -499,7 +490,7 @@ class HfRunner:
num_logprobs: int,
images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None,
videos: Optional[List[np.ndarray]] = None,
videos: Optional[PromptVideoInput] = None,
**kwargs: Any,
) -> List[TokensTextLogprobs]:
all_inputs = self.get_inputs(prompts,
......@@ -606,7 +597,7 @@ class HfRunner:
def __exit__(self, exc_type, exc_value, traceback):
del self.model
cleanup()
cleanup_dist_env_and_memory()
@pytest.fixture(scope="session")
......@@ -619,6 +610,7 @@ class VllmRunner:
def __init__(
self,
model_name: str,
task: TaskOption = "auto",
tokenizer_name: Optional[str] = None,
# Use smaller max model length, otherwise bigger model cannot run due
# to kv cache size limit.
......@@ -634,6 +626,7 @@ class VllmRunner:
) -> None:
self.model = LLM(
model=model_name,
task=task,
tokenizer=tokenizer_name,
trust_remote_code=True,
dtype=dtype,
......@@ -666,15 +659,18 @@ class VllmRunner:
inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
if images is not None:
for i, image in enumerate(images):
inputs[i]["multi_modal_data"] = {"image": image}
if image is not None:
inputs[i]["multi_modal_data"] = {"image": image}
if videos is not None:
for i, video in enumerate(videos):
inputs[i]["multi_modal_data"] = {"video": video}
if video is not None:
inputs[i]["multi_modal_data"] = {"video": video}
if audios is not None:
for i, audio in enumerate(audios):
inputs[i]["multi_modal_data"] = {"audio": audio}
if audio is not None:
inputs[i]["multi_modal_data"] = {"audio": audio}
return inputs
......@@ -846,20 +842,27 @@ class VllmRunner:
returned_outputs.append((token_ids, texts))
return returned_outputs
def encode(self, prompts: List[str]) -> List[List[float]]:
req_outputs = self.model.encode(prompts)
outputs = []
for req_output in req_outputs:
embedding = req_output.outputs.embedding
outputs.append(embedding)
return outputs
def encode(
self,
prompts: List[str],
images: Optional[PromptImageInput] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
) -> List[List[float]]:
inputs = self.get_inputs(prompts,
images=images,
videos=videos,
audios=audios)
req_outputs = self.model.encode(inputs)
return [req_output.outputs.embedding for req_output in req_outputs]
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
del self.model
cleanup()
cleanup_dist_env_and_memory()
@pytest.fixture(scope="session")
......
......@@ -3,10 +3,9 @@ from typing import Callable, Iterable, Optional
import pytest
from vllm import LLM
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.model_executor.utils import set_random_seed
from ....conftest import cleanup
@pytest.fixture
def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
......@@ -37,7 +36,7 @@ def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
yield llm
del llm
cleanup()
cleanup_dist_env_and_memory()
for llm in generator_inner():
yield llm
......
......@@ -33,7 +33,8 @@ def test_simple():
num_seq_group = 4
max_model_len = 16
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(max_num_batched_tokens,
scheduler_config = SchedulerConfig("generate",
max_num_batched_tokens,
num_seq_group,
max_model_len,
enable_chunked_prefill=True)
......@@ -78,6 +79,7 @@ def test_chunk():
max_model_len = 80
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
......@@ -126,6 +128,7 @@ def test_complex():
max_model_len = 80
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
......@@ -196,6 +199,7 @@ def test_maximal_decoding():
max_model_len = 8
max_num_batched_tokens = 2
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
......@@ -289,6 +293,7 @@ def test_prompt_limit():
max_model_len = 64
max_num_batched_tokens = 32
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
......@@ -321,7 +326,8 @@ def test_prompt_limit_exceed():
max_seqs = 64
max_model_len = 32
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(max_num_batched_tokens,
scheduler_config = SchedulerConfig("generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
enable_chunked_prefill=True)
......@@ -348,6 +354,7 @@ def test_swap():
max_model_len = 200
max_num_batched_tokens = 30
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
......@@ -404,6 +411,7 @@ def test_running_prefill_prioritized_over_swap():
max_model_len = 200
max_num_batched_tokens = 30
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
......@@ -498,6 +506,7 @@ def test_chunked_prefill_preempt():
max_model_len = 200
max_num_batched_tokens = 30
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
......@@ -563,6 +572,7 @@ def test_chunked_prefill_max_seqs():
max_model_len = 80
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
......@@ -617,6 +627,7 @@ def test_perfix_caching():
max_model_len = 80
max_num_batched_tokens = 64
scheduler_config = SchedulerConfig(
"generate",
max_num_batched_tokens,
max_seqs,
max_model_len,
......
......@@ -20,9 +20,10 @@ from .utils import (append_new_token, append_new_token_seq_group,
def test_scheduler_add_seq_group():
block_size = 4
scheduler_config = SchedulerConfig(
100,
64,
1,
"generate",
max_num_batched_tokens=100,
max_num_seqs=64,
max_model_len=1,
)
cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
cache_config.num_cpu_blocks = 4
......@@ -42,9 +43,10 @@ def test_scheduler_add_seq_group():
def test_scheduler_abort_seq_group():
block_size = 4
scheduler_config = SchedulerConfig(
100,
64,
1,
"generate",
max_num_batched_tokens=100,
max_num_seqs=64,
max_model_len=1,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 4
......@@ -70,9 +72,10 @@ def test_scheduler_schedule_simple():
num_seq_group = 4
max_model_len = 16
scheduler_config = SchedulerConfig(
64,
num_seq_group,
max_model_len,
"generate",
max_num_batched_tokens=64,
max_num_seqs=num_seq_group,
max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
......@@ -114,9 +117,10 @@ def test_scheduler_prefill_prioritized():
max_model_len = 30
max_batched_num_tokens = 30
scheduler_config = SchedulerConfig(
max_batched_num_tokens,
2,
max_model_len,
"generate",
max_num_batched_tokens=max_batched_num_tokens,
max_num_seqs=2,
max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16
......@@ -145,9 +149,10 @@ def test_scheduler_schedule_preempt_abort():
block_size = 4
max_model_len = 16
scheduler_config = SchedulerConfig(
64,
2,
max_model_len,
"generate",
max_num_batched_tokens=64,
max_num_seqs=2,
max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 2
......@@ -204,9 +209,10 @@ def test_scheduler_max_seqs():
max_seq_group = 2
max_model_len = 16
scheduler_config = SchedulerConfig(
64,
max_seq_group,
max_model_len,
"generate",
max_num_batched_tokens=64,
max_num_seqs=max_seq_group,
max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 8
......@@ -248,9 +254,10 @@ def test_scheduler_max_seqs():
def test_scheduler_delay_factor():
block_size = 4
scheduler_config = SchedulerConfig(
100,
64,
16,
"generate",
max_num_batched_tokens=100,
max_num_seqs=64,
max_model_len=16,
delay_factor=0.5,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
......@@ -350,9 +357,10 @@ def initialize_scheduler(
):
block_size = block_size
scheduler_config = SchedulerConfig(
max_token_budget,
max_num_seqs,
max_model_len,
"generate",
max_num_batched_tokens=max_token_budget,
max_num_seqs=max_num_seqs,
max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = num_cpu_blocks
......
......@@ -36,7 +36,12 @@ def test_scheduler_schedule_simple_encoder_decoder():
block_size = 4
num_seq_group = 4
max_model_len = 16
scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
scheduler_config = SchedulerConfig(
task="generate",
max_num_batched_tokens=64,
max_num_seqs=num_seq_group,
max_model_len=max_model_len,
)
cache_config = CacheConfig(block_size, 1.0, 1, "auto")
cache_config.num_cpu_blocks = 16 # enc and dec prompts per seq_group
cache_config.num_gpu_blocks = 16 # enc and dec prompts per seq_group
......
......@@ -11,6 +11,7 @@ from typing import List, Literal, NamedTuple, Optional
import pytest
from vllm.config import TaskOption
from vllm.logger import init_logger
from ..utils import compare_two_settings, fork_new_process_for_each_test
......@@ -27,18 +28,26 @@ class ParallelSetup(NamedTuple):
chunked_prefill: bool
class PPTestOptions(NamedTuple):
multi_node_only: bool
trust_remote_code: bool
tokenizer_mode: Optional[str]
@dataclass
class PPTestSettings:
parallel_setups: List[ParallelSetup]
distributed_backends: List[str]
trust_remote_code: bool
tokenizer_mode: Optional[str]
task: TaskOption
test_options: PPTestOptions
@staticmethod
def detailed(
*,
tp_base: int = 1,
pp_base: int = 2,
multi_node_only: bool = False,
task: TaskOption = "auto",
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
):
......@@ -66,8 +75,10 @@ class PPTestSettings:
chunked_prefill=False),
],
distributed_backends=["mp", "ray"],
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
task=task,
test_options=PPTestOptions(multi_node_only=multi_node_only,
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode),
)
@staticmethod
......@@ -75,6 +86,8 @@ class PPTestSettings:
*,
tp_base: int = 1,
pp_base: int = 2,
task: TaskOption = "auto",
multi_node_only: bool = False,
trust_remote_code: bool = False,
tokenizer_mode: Optional[str] = None,
):
......@@ -86,15 +99,19 @@ class PPTestSettings:
chunked_prefill=False),
],
distributed_backends=["mp"],
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode,
task=task,
test_options=PPTestOptions(multi_node_only=multi_node_only,
trust_remote_code=trust_remote_code,
tokenizer_mode=tokenizer_mode),
)
def iter_params(self, model_name: str):
opts = self.test_options
for parallel_setup in self.parallel_setups:
for distributed_backend in self.distributed_backends:
yield (model_name, parallel_setup, distributed_backend,
self.trust_remote_code, self.tokenizer_mode)
self.task, opts)
# NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
......@@ -104,6 +121,7 @@ class PPTestSettings:
GENERATION_MODEL_SETTINGS = {
# [DETAILED TESTS]
"meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
"microsoft/Phi-3-mini-4k-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True), # noqa: E501
# [FAST TESTS]
# Uses Llama
# "BAAI/AquilaChat-7B": PPTestSettings.fast(),
......@@ -145,10 +163,8 @@ GENERATION_MODEL_SETTINGS = {
"facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
"OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
"microsoft/phi-2": PPTestSettings.fast(),
"microsoft/Phi-3-mini-4k-instruct": PPTestSettings.fast(),
"microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
# FIXME: https://github.com/vllm-project/vllm/issues/8553
# "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501
"adept/persimmon-8b-chat": PPTestSettings.fast(),
"Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
"Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(),
......@@ -199,6 +215,7 @@ TEST_MODELS = [
# [LANGUAGE GENERATION]
"meta-llama/Meta-Llama-3-8B",
"ibm/PowerLM-3b",
"microsoft/Phi-3-mini-4k-instruct",
# [LANGUAGE EMBEDDING]
"intfloat/e5-mistral-7b-instruct",
"BAAI/bge-multilingual-gemma2",
......@@ -213,19 +230,22 @@ def _compare_tp(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
trust_remote_code: bool,
tokenizer_mode: Optional[str],
task: TaskOption,
test_options: PPTestOptions,
num_gpus_available: int,
*,
method: Literal["generate", "encode"] = "encode",
method: Literal["generate", "encode"],
):
tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup
multi_node_only, trust_remote_code, tokenizer_mode = test_options
if num_gpus_available < tp_size * pp_size:
pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
if VLLM_MULTI_NODE and distributed_backend == "mp":
pytest.skip("Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend")
if multi_node_only and not VLLM_MULTI_NODE:
pytest.skip("Not in multi-node setting")
common_args = [
# use half precision for speed and memory savings in CI environment
......@@ -240,6 +260,8 @@ def _compare_tp(
common_args.append("--enable-chunked-prefill")
if eager_mode:
common_args.append("--enforce-eager")
if task != "auto":
common_args.extend(["--task", task])
if trust_remote_code:
common_args.append("--trust-remote-code")
if tokenizer_mode:
......@@ -297,8 +319,8 @@ def _compare_tp(
@pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend",
"trust_remote_code", "tokenizer_mode"),
("model_name", "parallel_setup", "distributed_backend", "task",
"test_options"),
[
params for model_name, settings in GENERATION_MODEL_SETTINGS.items()
for params in settings.iter_params(model_name)
......@@ -310,22 +332,22 @@ def test_tp_language_generation(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
trust_remote_code: bool,
tokenizer_mode: Optional[str],
task: TaskOption,
test_options: PPTestOptions,
num_gpus_available,
):
_compare_tp(model_name,
parallel_setup,
distributed_backend,
trust_remote_code,
tokenizer_mode,
task,
test_options,
num_gpus_available,
method="generate")
@pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend",
"trust_remote_code", "tokenizer_mode"),
("model_name", "parallel_setup", "distributed_backend", "task",
"test_options"),
[
params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items()
for params in settings.iter_params(model_name)
......@@ -337,22 +359,22 @@ def test_tp_language_embedding(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
trust_remote_code: bool,
tokenizer_mode: Optional[str],
task: TaskOption,
test_options: PPTestOptions,
num_gpus_available,
):
_compare_tp(model_name,
parallel_setup,
distributed_backend,
trust_remote_code,
tokenizer_mode,
task,
test_options,
num_gpus_available,
method="encode")
@pytest.mark.parametrize(
("model_name", "parallel_setup", "distributed_backend",
"trust_remote_code", "tokenizer_mode"),
("model_name", "parallel_setup", "distributed_backend", "task",
"test_options"),
[
params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items()
for params in settings.iter_params(model_name)
......@@ -364,14 +386,14 @@ def test_tp_multimodal_generation(
model_name: str,
parallel_setup: ParallelSetup,
distributed_backend: str,
trust_remote_code: bool,
tokenizer_mode: Optional[str],
task: TaskOption,
test_options: PPTestOptions,
num_gpus_available,
):
_compare_tp(model_name,
parallel_setup,
distributed_backend,
trust_remote_code,
tokenizer_mode,
task,
test_options,
num_gpus_available,
method="generate")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment