Merge remote-tracking branch 'mirror/main'

2216a4e5 · zhuwenwen · ad385667 · 51c24c97 · 2216a4e5 · 2216a4e5
Commit 2216a4e5 authored Oct 23, 2024 by zhuwenwen
20 changed files
--- a/examples/offline_inference_vision_language_embedding.py
+++ b/examples/offline_inference_vision_language_embedding.py
+"""
+This example shows how to use vLLM for running offline inference with
+the correct prompt format on vision language models for multimodal embedding.
+For most models, the prompt format should follow the corresponding examples
+on the HuggingFace model repository.
+"""
+from argparse import Namespace
+from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
+from PIL.Image import Image
 from vllm import LLM
-from vllm.assets.image import ImageAsset
+from vllm.multimodal.utils import fetch_image
+from vllm.utils import FlexibleArgumentParser
-image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-prompt = "<|image_1|> Represent the given image with the following question: What is in the image"  # noqa: E501
+class TextQuery(TypedDict):
+    """Embedding query that carries only a text string."""
-# Create an LLM.
+    modality: Literal["text"]
-llm = LLM(
+    text: str
-    model="TIGER-Lab/VLM2Vec-Full",
-    trust_remote_code=True,
-    max_model_len=4096,
+class ImageQuery(TypedDict):
+    """Embedding query that carries only an image."""
-    max_num_seqs=2,
+    modality: Literal["image"]
-    mm_processor_kwargs={"num_crops": 16},
+    image: Image
-)
-# Generate embedding. The output is a list of EmbeddingRequestOutputs.
+class TextImageQuery(TypedDict):
+    """Embedding query that carries both a text string and an image."""
-outputs = llm.encode({"prompt": prompt, "multi_modal_data": {"image": image}})
+    modality: Literal["text+image"]
+    text: str
-# Print the outputs.
+    image: Image
-for output in outputs:
-    print(output.outputs.embedding)  # list of 3072 floats
+QueryModality = Literal["text", "image", "text+image"]
+Query = Union[TextQuery, ImageQuery, TextImageQuery]
+class ModelRequestData(NamedTuple):
+    """Bundle returned by each model-specific builder in this example."""
+    # Engine instance constructed for the selected model.
+    llm: LLM
+    # Fully formatted prompt string for the query.
+    prompt: str
+    # Image to attach to the request, or None when the query has no image.
+    image: Optional[Image]
+def run_e5_v(query: Query):
+    """Build the prompt, image and engine for the `royokong/e5-v` model.
+
+    The query is formatted into `llama3_template`; only the "text" and
+    "image" modalities are handled here.
+
+    Args:
+        query: The query dict; its "modality" key selects the prompt format.
+
+    Returns:
+        A `ModelRequestData` holding the created `LLM`, the formatted
+        prompt, and the image (None for a text query).
+
+    Raises:
+        ValueError: If the modality is neither "text" nor "image".
+    """
+    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501
+    if query["modality"] == "text":
+        text = query["text"]
+        prompt = llama3_template.format(
+            f"{text}\nSummary above sentence in one word: ")
+        image = None
+    elif query["modality"] == "image":
+        prompt = llama3_template.format(
+            "<image>\nSummary above image in one word: ")
+        image = query["image"]
+    else:
+        modality = query['modality']
+        raise ValueError(f"Unsupported query modality: '{modality}'")
+    # The engine is created only after the modality has been validated.
+    llm = LLM(
+        model="royokong/e5-v",
+        task="embedding",
+        max_model_len=4096,
+    )
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        image=image,
+    )
+def run_vlm2vec(query: Query):
+    """Build the prompt, image and engine for `TIGER-Lab/VLM2Vec-Full`.
+
+    Handles the "text", "image" and "text+image" modalities; prompts that
+    include an image start with the `<|image_1|>` placeholder.
+
+    Args:
+        query: The query dict; its "modality" key selects the prompt format.
+
+    Returns:
+        A `ModelRequestData` holding the created `LLM`, the formatted
+        prompt, and the image (None for a text query).
+
+    Raises:
+        ValueError: If the modality is not one of the three handled above.
+    """
+    if query["modality"] == "text":
+        text = query["text"]
+        prompt = f"Find me an everyday image that matches the given caption: {text}"  # noqa: E501
+        image = None
+    elif query["modality"] == "image":
+        prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image."  # noqa: E501
+        image = query["image"]
+    elif query["modality"] == "text+image":
+        text = query["text"]
+        prompt = f"<|image_1|> Represent the given image with the following question: {text}"  # noqa: E501
+        image = query["image"]
+    else:
+        modality = query['modality']
+        raise ValueError(f"Unsupported query modality: '{modality}'")
+    # The engine is created only after the modality has been validated.
+    llm = LLM(
+        model="TIGER-Lab/VLM2Vec-Full",
+        task="embedding",
+        trust_remote_code=True,
+        mm_processor_kwargs={"num_crops": 4},
+    )
+    return ModelRequestData(
+        llm=llm,
+        prompt=prompt,
+        image=image,
+    )
+def get_query(modality: QueryModality):
+    """Return a hard-coded sample query for the requested modality.
+
+    Queries that include an image obtain it through `fetch_image` from the
+    Wikimedia URLs below.
+
+    Args:
+        modality: One of "text", "image" or "text+image".
+
+    Returns:
+        A `TextQuery`, `ImageQuery` or `TextImageQuery` matching `modality`.
+
+    Raises:
+        ValueError: If `modality` is none of the three supported values.
+    """
+    if modality == "text":
+        return TextQuery(modality="text", text="A dog sitting in the grass")
+    if modality == "image":
+        return ImageQuery(
+            modality="image",
+            image=fetch_image(
+                "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg"  # noqa: E501
+            ),
+        )
+    if modality == "text+image":
+        return TextImageQuery(
+            modality="text+image",
+            text="A cat standing in the snow.",
+            image=fetch_image(
+                "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg"  # noqa: E501
+            ),
+        )
+    msg = f"Modality {modality} is not supported."
+    raise ValueError(msg)
+def run_encode(model: str, modality: QueryModality):
+    """Encode one sample query with the chosen model and print the result.
+
+    Args:
+        model: Key into `model_example_map` selecting the request builder.
+        modality: Modality of the sample query produced by `get_query`.
+    """
+    query = get_query(modality)
+    req_data = model_example_map[model](query)
+    # Only attach an "image" entry when the builder returned an image;
+    # otherwise an empty dict is passed as the multi-modal data.
+    mm_data = {}
+    if req_data.image is not None:
+        mm_data["image"] = req_data.image
+    outputs = req_data.llm.encode({
+        "prompt": req_data.prompt,
+        "multi_modal_data": mm_data,
+    })
+    for output in outputs:
+        print(output.outputs.embedding)
+def main(args: Namespace):
+    """Run the encode demo using the parsed `model_name` and `modality`."""
+    run_encode(args.model_name, args.modality)
+# Maps each `--model-name` choice to the function that builds its request
+# data; `run_encode` looks the builder up by this key.
+model_example_map = {
+    "e5_v": run_e5_v,
+    "vlm2vec": run_vlm2vec,
+}
+# Script entry point: parse the command-line options and run the demo.
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description='Demo on using vLLM for offline inference with '
+        'vision language models for multimodal embedding')
+    # Valid model names are exactly the keys of `model_example_map`.
+    parser.add_argument('--model-name',
+                        '-m',
+                        type=str,
+                        default="vlm2vec",
+                        choices=model_example_map.keys(),
+                        help='The name of the embedding model.')
+    # Valid modalities are the literal values declared in `QueryModality`.
+    parser.add_argument('--modality',
+                        type=str,
+                        default="image",
+                        choices=get_args(QueryModality),
+                        help='Modality of the input.')
+    args = parser.parse_args()
+    main(args)
--- a/examples/offline_inference_vision_language_multi_image.py
+++ b/examples/offline_inference_vision_language_multi_image.py
 """
 This example shows how to use vLLM for running offline inference with
-multi-image input on vision language models, using the chat template defined
+multi-image input on vision language models for text generation,
-by the model.
+using the chat template defined by the model.
 """
 from argparse import Namespace
 from typing import List, NamedTuple, Optional
@@ -334,7 +334,8 @@ def main(args: Namespace):
 if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
-        'vision language models that support multi-image input')
+        'vision language models that support multi-image input for text '
+        'generation')
    parser.add_argument('--model-type',
                        '-m',
                        type=str,

--- a/examples/offline_inference_with_prefix.py
+++ b/examples/offline_inference_with_prefix.py
 from vllm import LLM, SamplingParams
+from vllm.distributed import cleanup_dist_env_and_memory
 # NOTE: This is just a running example. For benchmarking purpose,
 # please see benchmarks/benchmark_prefix_caching.py
@@ -28,12 +29,9 @@ generating_prompts = [prefix + prompt for prompt in prompts]
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.0)
-# Create an LLM.
+# Create an LLM without prefix caching as a baseline.
 regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
-prefix_cached_llm = LLM(model="facebook/opt-125m",
-                        enable_prefix_caching=True,
-                        gpu_memory_utilization=0.4)
 print("Results without `enable_prefix_caching`")
 # Generate texts from the prompts. The output is a list of RequestOutput objects
@@ -50,6 +48,15 @@ for output in outputs:
 print("-" * 80)
+# Destroy the LLM object and free up the GPU memory.
+del regular_llm
+cleanup_dist_env_and_memory()
+# Create an LLM with prefix caching enabled.
+prefix_cached_llm = LLM(model="facebook/opt-125m",
+                        enable_prefix_caching=True,
+                        gpu_memory_utilization=0.4)
 # Warmup so that the shared prompt's KV cache is computed.
 prefix_cached_llm.generate(generating_prompts[0], sampling_params)

--- a/examples/openai_api_client_for_multimodal.py
+++ b/examples/openai_api_client_for_multimodal.py
@@ -7,8 +7,8 @@ Launch the vLLM server with the following command:
 vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
 (multi-image inference with Phi-3.5-vision-instruct)
-vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
+vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
-    --trust-remote-code --limit-mm-per-prompt image=2
+    --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
 (audio inference with Ultravox)
 vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096

--- a/format.sh
+++ b/format.sh
@@ -21,6 +21,20 @@ builtin cd "$(dirname "${BASH_SOURCE:-$0}")"
 ROOT="$(git rev-parse --show-toplevel)"
 builtin cd "$ROOT" || exit 1
+# Exit with status 1 and an install hint when `command -v` cannot
+# resolve the tool named in $1.
+check_command() {
+    if ! command -v "$1" &> /dev/null; then
+        echo "❓❓$1 is not installed, please run \`pip install -r requirements-lint.txt\`"
+        exit 1
+    fi
+}
+check_command yapf
+check_command ruff
+check_command mypy
+check_command codespell
+check_command isort
+check_command clang-format
 YAPF_VERSION=$(yapf --version | awk '{print $2}')
 RUFF_VERSION=$(ruff --version | awk '{print $2}')
 MYPY_VERSION=$(mypy --version | awk '{print $2}')
@@ -31,7 +45,7 @@ CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}')
 # # params: tool name, tool version, required version
 tool_version_check() {
    if [[ $2 != $3 ]]; then
-        echo "Wrong $1 version installed: $3 is required, not $2."
+        echo "❓❓Wrong $1 version installed: $3 is required, not $2."
        exit 1
    fi
 }
@@ -281,10 +295,12 @@ tools/actionlint.sh -color
 echo 'vLLM actionlint: Done'
 if ! git diff --quiet &>/dev/null; then
-    echo 'Reformatted files. Please review and stage the changes.'
+    echo 
-    echo 'Changes not staged for commit:'
+    echo "🔍🔍There are files changed by the format checker or by you that are not added and committed:"
-    echo
    git --no-pager diff --name-only
+    echo "🔍🔍Format checker passed, but please add, commit and push all the files above to include changes made by the format checker."
    exit 1
+else
+    echo "✨🎉 Format check passed! Congratulations! 🎉✨"
 fi
--- a/python_only_dev.py
+++ b/python_only_dev.py
@@ -39,7 +39,6 @@ assert cwd != package_path, "should not import from the current directory"
 files_to_copy = [
    "vllm/_C.abi3.so",
-    "vllm/_core_C.abi3.so",
    "vllm/_moe_C.abi3.so",
    "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so",
    "vllm/vllm_flash_attn/flash_attn_interface.py",

--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -31,4 +31,4 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.6.0 # required for compressed-tensors
+compressed-tensors == 0.7.1 # required for compressed-tensors
--- a/setup.py
+++ b/setup.py
@@ -164,6 +164,14 @@ class cmake_build_ext(build_ext):
        # on subsequent calls to python.
        cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))]
+        # Override the base directory for FetchContent downloads to $ROOT/.deps
+        # This allows sharing dependencies between profiles,
+        # and plays more nicely with sccache.
+        # To override this, set the FETCHCONTENT_BASE_DIR environment variable.
+        fc_base_dir = os.path.join(ROOT_DIR, ".deps")
+        fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir)
+        cmake_args += ['-DFETCHCONTENT_BASE_DIR={}'.format(fc_base_dir)]
        #
        # Setup parallelism and build tool
        #
@@ -297,10 +305,6 @@ def _build_custom_ops() -> bool:
    return _is_cuda() or _is_hip() or _is_cpu()
-def _build_core_ext() -> bool:
-    return not (_is_neuron() or _is_tpu() or _is_openvino() or _is_xpu())
 def get_hipcc_rocm_version():
    # Run the hipcc --version command
    result = subprocess.run(['hipcc', '--version'],
@@ -530,9 +534,6 @@ def get_requirements() -> List[str]:
 ext_modules = []
-if _build_core_ext():
-    ext_modules.append(CMakeExtension(name="vllm._core_C"))
 if _is_cuda() or _is_hip():
    ext_modules.append(CMakeExtension(name="vllm._moe_C"))

--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -12,11 +12,11 @@ import torch
 from vllm import SamplingParams
 from vllm.config import ParallelConfig
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.engine.async_llm_engine import AsyncEngineArgs, AsyncLLMEngine
 from vllm.outputs import RequestOutput as RealRequestOutput
 from vllm.sampling_params import RequestOutputKind
-from ..conftest import cleanup
 from ..utils import wait_for_gpu_memory_to_clear
@@ -157,7 +157,7 @@ async def async_engine():
        engine.shutdown_background_loop()
        del engine
        await asyncio.sleep(0.1)
-        cleanup()
+        cleanup_dist_env_and_memory()
 @pytest.fixture()

--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -19,7 +19,7 @@ from ..utils import multi_gpu_test
 MODELS = [
    "facebook/opt-125m",
-    "meta-llama/Llama-2-7b-hf",
+    "meta-llama/Llama-3.2-1B",
 ]
 TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")

--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -16,7 +16,7 @@ from ..utils import multi_gpu_test
 MODELS = [
    "facebook/opt-125m",
-    "meta-llama/Llama-2-7b-hf",
+    "meta-llama/Llama-3.2-1B",
 ]

--- a/tests/basic_correctness/test_cpu_offload.py
+++ b/tests/basic_correctness/test_cpu_offload.py
@@ -2,5 +2,5 @@ from ..utils import compare_two_settings
 def test_cpu_offload():
-    compare_two_settings("meta-llama/Llama-2-7b-hf", [],
+    compare_two_settings("meta-llama/Llama-3.2-1B", [],
-                         ["--cpu-offload-gb", "4"])
+                         ["--cpu-offload-gb", "1"])
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -13,8 +13,7 @@ from ..utils import compare_all_settings
 @pytest.mark.parametrize(
    "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph",
    [
-        ("meta-llama/Meta-Llama-3-8B", [], 2, 2, "FLASH_ATTN", "generate",
+        ("meta-llama/Llama-3.2-1B", [], 2, 2, "FLASH_ATTN", "generate", True),
-         True),
        ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples",
         ["--quantization", "compressed-tensors"
          ], 1, 1, "FLASH_ATTN", "generate", True),

--- a/tests/compile/utils.py
+++ b/tests/compile/utils.py
@@ -69,11 +69,11 @@ def check_full_graph_support(model,
    os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level)
    os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1"
-    # Inductor doesn't support fp8/gptq_marlin_24 yet.
+    # Inductor doesn't support fp8, and the base Meta-Llama-3-8B model uses
+    # too much memory, so skip both at INDUCTOR level and above.
    quantization = model_kwargs.get("quantization")
-    if (quantization == "fp8" or quantization == "gptq_marlin"
+    if ((quantization == "fp8" or model == "meta-llama/Meta-Llama-3-8B")
-            or quantization == "gptq_marlin_24"
+            and optimization_level >= CompilationLevel.INDUCTOR):
-        ) and optimization_level >= CompilationLevel.INDUCTOR:
        return
    prompts = [

--- a/tests/conftest.py
+++ b/tests/conftest.py
-import contextlib
-import gc
 import json
 import os
 import sys
@@ -25,19 +23,19 @@ from tests.models.utils import (TokensTextLogprobs,
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
-from vllm.config import TokenizerPoolConfig
+from vllm.config import TaskOption, TokenizerPoolConfig
 from vllm.connections import global_http_connection
-from vllm.distributed import (destroy_distributed_environment,
+from vllm.distributed import (cleanup_dist_env_and_memory,
-                              destroy_model_parallel,
                              init_distributed_environment,
                              initialize_model_parallel)
 from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
                         to_enc_dec_tuple_list, zip_enc_dec_prompts)
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
+from vllm.platforms import current_platform
 from vllm.sampling_params import BeamSearchParams
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
-                        identity, is_cpu)
+                        identity)
 logger = init_logger(__name__)
@@ -45,10 +43,12 @@ _TEST_DIR = os.path.dirname(__file__)
 _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
-PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]]
+_M = TypeVar("_M")
-PromptAudioInput = Union[List[Tuple[np.ndarray, int]],
+_PromptMultiModalInput = Union[List[_M], List[List[_M]]]
-                         List[List[Tuple[np.ndarray, int]]]]
-PromptVideoInput = Union[List[np.ndarray], List[List[np.ndarray]]]
+PromptImageInput = _PromptMultiModalInput[Image.Image]
+PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
+PromptVideoInput = _PromptMultiModalInput[np.ndarray]
 def _read_prompts(filename: str) -> List[str]:
@@ -140,17 +140,7 @@ def dist_init():
    )
    initialize_model_parallel(1, 1)
    yield
-    cleanup()
+    cleanup_dist_env_and_memory()
-def cleanup():
-    destroy_model_parallel()
-    destroy_distributed_environment()
-    with contextlib.suppress(AssertionError):
-        torch.distributed.destroy_process_group()
-    gc.collect()
-    if not is_cpu():
-        torch.cuda.empty_cache()
 @pytest.fixture()
@@ -167,7 +157,7 @@ def should_do_global_cleanup_after_test(request) -> bool:
 def cleanup_fixture(should_do_global_cleanup_after_test: bool):
    yield
    if should_do_global_cleanup_after_test:
-        cleanup()
+        cleanup_dist_env_and_memory()
 @pytest.fixture(autouse=True)
@@ -249,7 +239,8 @@ class HfRunner:
    def wrap_device(self, input: _T, device: Optional[str] = None) -> _T:
        if device is None:
-            return self.wrap_device(input, "cpu" if is_cpu() else "cuda")
+            return self.wrap_device(
+                input, "cpu" if current_platform.is_cpu() else "cuda")
        if hasattr(input, "device") and input.device.type == device:
            return input
@@ -329,12 +320,12 @@ class HfRunner:
                "text": prompt,
                "return_tensors": "pt",
            }
-            if images is not None and images[i] is not None:
+            if images is not None and (image := images[i]) is not None:
-                processor_kwargs["images"] = images[i]
+                processor_kwargs["images"] = image
-            if videos is not None and videos[i] is not None:
+            if videos is not None and (video := videos[i]) is not None:
-                processor_kwargs["videos"] = videos[i]
+                processor_kwargs["videos"] = video
-            if audios is not None and audios[i] is not None:
+            if audios is not None and (audio_tuple := audios[i]) is not None:
-                audio, sr = audios[i]
+                audio, sr = audio_tuple
                processor_kwargs["audio"] = audio
                processor_kwargs["sampling_rate"] = sr
@@ -349,7 +340,7 @@ class HfRunner:
        self,
        prompts: List[str],
        images: Optional[PromptImageInput] = None,
-        videos: Optional[List[np.ndarray]] = None,
+        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> List[Tuple[List[List[int]], List[str]]]:
@@ -379,7 +370,7 @@ class HfRunner:
        prompts: List[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
-        videos: Optional[List[np.ndarray]] = None,
+        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> List[Tuple[List[int], str]]:
@@ -420,7 +411,7 @@ class HfRunner:
        prompts: List[str],
        max_tokens: int,
        images: Optional[PromptImageInput] = None,
-        videos: Optional[List[np.ndarray]] = None,
+        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
        **kwargs: Any,
    ) -> List[List[torch.Tensor]]:
@@ -499,7 +490,7 @@ class HfRunner:
        num_logprobs: int,
        images: Optional[PromptImageInput] = None,
        audios: Optional[PromptAudioInput] = None,
-        videos: Optional[List[np.ndarray]] = None,
+        videos: Optional[PromptVideoInput] = None,
        **kwargs: Any,
    ) -> List[TokensTextLogprobs]:
        all_inputs = self.get_inputs(prompts,
@@ -606,7 +597,7 @@ class HfRunner:
    def __exit__(self, exc_type, exc_value, traceback):
        del self.model
-        cleanup()
+        cleanup_dist_env_and_memory()
 @pytest.fixture(scope="session")
@@ -619,6 +610,7 @@ class VllmRunner:
    def __init__(
        self,
        model_name: str,
+        task: TaskOption = "auto",
        tokenizer_name: Optional[str] = None,
        # Use smaller max model length, otherwise bigger model cannot run due
        # to kv cache size limit.
@@ -634,6 +626,7 @@ class VllmRunner:
    ) -> None:
        self.model = LLM(
            model=model_name,
+            task=task,
            tokenizer=tokenizer_name,
            trust_remote_code=True,
            dtype=dtype,
@@ -666,15 +659,18 @@ class VllmRunner:
        inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
        if images is not None:
            for i, image in enumerate(images):
-                inputs[i]["multi_modal_data"] = {"image": image}
+                if image is not None:
+                    inputs[i]["multi_modal_data"] = {"image": image}
        if videos is not None:
            for i, video in enumerate(videos):
-                inputs[i]["multi_modal_data"] = {"video": video}
+                if video is not None:
+                    inputs[i]["multi_modal_data"] = {"video": video}
        if audios is not None:
            for i, audio in enumerate(audios):
-                inputs[i]["multi_modal_data"] = {"audio": audio}
+                if audio is not None:
+                    inputs[i]["multi_modal_data"] = {"audio": audio}
        return inputs
@@ -846,20 +842,27 @@ class VllmRunner:
            returned_outputs.append((token_ids, texts))
        return returned_outputs
-    def encode(self, prompts: List[str]) -> List[List[float]]:
+    def encode(
-        req_outputs = self.model.encode(prompts)
+        self,
-        outputs = []
+        prompts: List[str],
-        for req_output in req_outputs:
+        images: Optional[PromptImageInput] = None,
-            embedding = req_output.outputs.embedding
+        videos: Optional[PromptVideoInput] = None,
-            outputs.append(embedding)
+        audios: Optional[PromptAudioInput] = None,
-        return outputs
+    ) -> List[List[float]]:
+        """Return one embedding per prompt, optionally with multimodal data.
+
+        The prompts and any images, videos or audios are combined by
+        `self.get_inputs`, then passed to `self.model.encode`; the embedding
+        of each request output is collected into the returned list.
+        """
+        inputs = self.get_inputs(prompts,
+                                 images=images,
+                                 videos=videos,
+                                 audios=audios)
+        req_outputs = self.model.encode(inputs)
+        return [req_output.outputs.embedding for req_output in req_outputs]
    def __enter__(self):
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        del self.model
-        cleanup()
+        cleanup_dist_env_and_memory()
 @pytest.fixture(scope="session")

--- a/tests/core/block/e2e/conftest.py
+++ b/tests/core/block/e2e/conftest.py
@@ -3,10 +3,9 @@ from typing import Callable, Iterable, Optional
 import pytest
 from vllm import LLM
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.model_executor.utils import set_random_seed
-from ....conftest import cleanup
 @pytest.fixture
 def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
@@ -37,7 +36,7 @@ def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
        yield llm
        del llm
-        cleanup()
+        cleanup_dist_env_and_memory()
    for llm in generator_inner():
        yield llm

--- a/tests/core/test_chunked_prefill_scheduler.py
+++ b/tests/core/test_chunked_prefill_scheduler.py
@@ -33,7 +33,8 @@ def test_simple():
    num_seq_group = 4
    max_model_len = 16
    max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+    scheduler_config = SchedulerConfig("generate",
+                                       max_num_batched_tokens,
                                       num_seq_group,
                                       max_model_len,
                                       enable_chunked_prefill=True)
@@ -78,6 +79,7 @@ def test_chunk():
    max_model_len = 80
    max_num_batched_tokens = 64
    scheduler_config = SchedulerConfig(
+        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
@@ -126,6 +128,7 @@ def test_complex():
    max_model_len = 80
    max_num_batched_tokens = 64
    scheduler_config = SchedulerConfig(
+        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
@@ -196,6 +199,7 @@ def test_maximal_decoding():
    max_model_len = 8
    max_num_batched_tokens = 2
    scheduler_config = SchedulerConfig(
+        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
@@ -289,6 +293,7 @@ def test_prompt_limit():
    max_model_len = 64
    max_num_batched_tokens = 32
    scheduler_config = SchedulerConfig(
+        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
@@ -321,7 +326,8 @@ def test_prompt_limit_exceed():
    max_seqs = 64
    max_model_len = 32
    max_num_batched_tokens = 64
-    scheduler_config = SchedulerConfig(max_num_batched_tokens,
+    scheduler_config = SchedulerConfig("generate",
+                                       max_num_batched_tokens,
                                       max_seqs,
                                       max_model_len,
                                       enable_chunked_prefill=True)
@@ -348,6 +354,7 @@ def test_swap():
    max_model_len = 200
    max_num_batched_tokens = 30
    scheduler_config = SchedulerConfig(
+        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
@@ -404,6 +411,7 @@ def test_running_prefill_prioritized_over_swap():
    max_model_len = 200
    max_num_batched_tokens = 30
    scheduler_config = SchedulerConfig(
+        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
@@ -498,6 +506,7 @@ def test_chunked_prefill_preempt():
    max_model_len = 200
    max_num_batched_tokens = 30
    scheduler_config = SchedulerConfig(
+        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
@@ -563,6 +572,7 @@ def test_chunked_prefill_max_seqs():
    max_model_len = 80
    max_num_batched_tokens = 64
    scheduler_config = SchedulerConfig(
+        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,
@@ -617,6 +627,7 @@ def test_perfix_caching():
    max_model_len = 80
    max_num_batched_tokens = 64
    scheduler_config = SchedulerConfig(
+        "generate",
        max_num_batched_tokens,
        max_seqs,
        max_model_len,

--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -20,9 +20,10 @@ from .utils import (append_new_token, append_new_token_seq_group,
 def test_scheduler_add_seq_group():
    block_size = 4
    scheduler_config = SchedulerConfig(
-        100,
+        "generate",
-        64,
+        max_num_batched_tokens=100,
-        1,
+        max_num_seqs=64,
+        max_model_len=1,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, cache_dtype="auto")
    cache_config.num_cpu_blocks = 4
@@ -42,9 +43,10 @@ def test_scheduler_add_seq_group():
 def test_scheduler_abort_seq_group():
    block_size = 4
    scheduler_config = SchedulerConfig(
-        100,
+        "generate",
-        64,
+        max_num_batched_tokens=100,
-        1,
+        max_num_seqs=64,
+        max_model_len=1,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 4
@@ -70,9 +72,10 @@ def test_scheduler_schedule_simple():
    num_seq_group = 4
    max_model_len = 16
    scheduler_config = SchedulerConfig(
-        64,
+        "generate",
-        num_seq_group,
+        max_num_batched_tokens=64,
-        max_model_len,
+        max_num_seqs=num_seq_group,
+        max_model_len=max_model_len,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 8
@@ -114,9 +117,10 @@ def test_scheduler_prefill_prioritized():
    max_model_len = 30
    max_batched_num_tokens = 30
    scheduler_config = SchedulerConfig(
-        max_batched_num_tokens,
+        "generate",
-        2,
+        max_num_batched_tokens=max_batched_num_tokens,
-        max_model_len,
+        max_num_seqs=2,
+        max_model_len=max_model_len,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 16
@@ -145,9 +149,10 @@ def test_scheduler_schedule_preempt_abort():
    block_size = 4
    max_model_len = 16
    scheduler_config = SchedulerConfig(
-        64,
+        "generate",
-        2,
+        max_num_batched_tokens=64,
-        max_model_len,
+        max_num_seqs=2,
+        max_model_len=max_model_len,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 2
@@ -204,9 +209,10 @@ def test_scheduler_max_seqs():
    max_seq_group = 2
    max_model_len = 16
    scheduler_config = SchedulerConfig(
-        64,
+        "generate",
-        max_seq_group,
+        max_num_batched_tokens=64,
-        max_model_len,
+        max_num_seqs=max_seq_group,
+        max_model_len=max_model_len,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 8
@@ -248,9 +254,10 @@ def test_scheduler_max_seqs():
 def test_scheduler_delay_factor():
    block_size = 4
    scheduler_config = SchedulerConfig(
-        100,
+        "generate",
-        64,
+        max_num_batched_tokens=100,
-        16,
+        max_num_seqs=64,
+        max_model_len=16,
        delay_factor=0.5,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
@@ -350,9 +357,10 @@ def initialize_scheduler(
 ):
    block_size = block_size
    scheduler_config = SchedulerConfig(
-        max_token_budget,
+        "generate",
-        max_num_seqs,
+        max_num_batched_tokens=max_token_budget,
-        max_model_len,
+        max_num_seqs=max_num_seqs,
+        max_model_len=max_model_len,
    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = num_cpu_blocks

--- a/tests/core/test_scheduler_encoder_decoder.py
+++ b/tests/core/test_scheduler_encoder_decoder.py
@@ -36,7 +36,12 @@ def test_scheduler_schedule_simple_encoder_decoder():
    block_size = 4
    num_seq_group = 4
    max_model_len = 16
-    scheduler_config = SchedulerConfig(64, num_seq_group, max_model_len)
+    scheduler_config = SchedulerConfig(
+        task="generate",
+        max_num_batched_tokens=64,
+        max_num_seqs=num_seq_group,
+        max_model_len=max_model_len,
+    )
    cache_config = CacheConfig(block_size, 1.0, 1, "auto")
    cache_config.num_cpu_blocks = 16  # enc and dec prompts per seq_group
    cache_config.num_gpu_blocks = 16  # enc and dec prompts per seq_group

--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -11,6 +11,7 @@ from typing import List, Literal, NamedTuple, Optional
 import pytest
+from vllm.config import TaskOption
 from vllm.logger import init_logger
 from ..utils import compare_two_settings, fork_new_process_for_each_test
@@ -27,18 +28,26 @@ class ParallelSetup(NamedTuple):
    chunked_prefill: bool
+class PPTestOptions(NamedTuple):  # field order matters: _compare_tp unpacks this tuple positionally
+    multi_node_only: bool  # when true, the test is skipped unless VLLM_MULTI_NODE is set
+    trust_remote_code: bool  # when true, "--trust-remote-code" is appended to the common args
+    tokenizer_mode: Optional[str]  # only acted on when truthy; presumably forwarded as a CLI flag (confirm)
 @dataclass
 class PPTestSettings:
    parallel_setups: List[ParallelSetup]
    distributed_backends: List[str]
-    trust_remote_code: bool
+    task: TaskOption
-    tokenizer_mode: Optional[str]
+    test_options: PPTestOptions
    @staticmethod
    def detailed(
        *,
        tp_base: int = 1,
        pp_base: int = 2,
+        multi_node_only: bool = False,
+        task: TaskOption = "auto",
        trust_remote_code: bool = False,
        tokenizer_mode: Optional[str] = None,
    ):
@@ -66,8 +75,10 @@ class PPTestSettings:
                              chunked_prefill=False),
            ],
            distributed_backends=["mp", "ray"],
-            trust_remote_code=trust_remote_code,
+            task=task,
-            tokenizer_mode=tokenizer_mode,
+            test_options=PPTestOptions(multi_node_only=multi_node_only,
+                                       trust_remote_code=trust_remote_code,
+                                       tokenizer_mode=tokenizer_mode),
        )
    @staticmethod
@@ -75,6 +86,8 @@ class PPTestSettings:
        *,
        tp_base: int = 1,
        pp_base: int = 2,
+        task: TaskOption = "auto",
+        multi_node_only: bool = False,
        trust_remote_code: bool = False,
        tokenizer_mode: Optional[str] = None,
    ):
@@ -86,15 +99,19 @@ class PPTestSettings:
                              chunked_prefill=False),
            ],
            distributed_backends=["mp"],
-            trust_remote_code=trust_remote_code,
+            task=task,
-            tokenizer_mode=tokenizer_mode,
+            test_options=PPTestOptions(multi_node_only=multi_node_only,
+                                       trust_remote_code=trust_remote_code,
+                                       tokenizer_mode=tokenizer_mode),
        )
    def iter_params(self, model_name: str):
+        opts = self.test_options  # one PPTestOptions value shared by every yielded case
        for parallel_setup in self.parallel_setups:
            for distributed_backend in self.distributed_backends:
                yield (model_name, parallel_setup, distributed_backend,
-                       self.trust_remote_code, self.tokenizer_mode)
+                       self.task, opts)  # last two items fill the "task" and "test_options" parametrize columns
 # NOTE: You can adjust tp_base and/or pp_base locally to fit the model in GPU
@@ -104,6 +121,7 @@ class PPTestSettings:
 GENERATION_MODEL_SETTINGS = {
    # [DETAILED TESTS]
    "meta-llama/Meta-Llama-3-8B": PPTestSettings.detailed(),
+    "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True),  # noqa: E501
    # [FAST TESTS]
    # Uses Llama
    # "BAAI/AquilaChat-7B": PPTestSettings.fast(),
@@ -145,10 +163,8 @@ GENERATION_MODEL_SETTINGS = {
    "facebook/opt-iml-max-1.3b": PPTestSettings.fast(),
    "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True),
    "microsoft/phi-2": PPTestSettings.fast(),
-    "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.fast(),
    "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
-    # FIXME: https://github.com/vllm-project/vllm/issues/8553
+    "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
-    # "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True),  # noqa: E501
    "adept/persimmon-8b-chat": PPTestSettings.fast(),
    "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True),
    "Qwen/Qwen2-beta-7B-Chat": PPTestSettings.fast(),
@@ -199,6 +215,7 @@ TEST_MODELS = [
    # [LANGUAGE GENERATION]
    "meta-llama/Meta-Llama-3-8B",
    "ibm/PowerLM-3b",
+    "microsoft/Phi-3-mini-4k-instruct",
    # [LANGUAGE EMBEDDING]
    "intfloat/e5-mistral-7b-instruct",
    "BAAI/bge-multilingual-gemma2",
@@ -213,19 +230,22 @@ def _compare_tp(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
-    trust_remote_code: bool,
+    task: TaskOption,
-    tokenizer_mode: Optional[str],
+    test_options: PPTestOptions,
    num_gpus_available: int,
    *,
-    method: Literal["generate", "encode"] = "encode",
+    method: Literal["generate", "encode"],
 ):
    tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup
+    multi_node_only, trust_remote_code, tokenizer_mode = test_options
    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")
+    if multi_node_only and not VLLM_MULTI_NODE:
+        pytest.skip("Not in multi-node setting")
    common_args = [
        # use half precision for speed and memory savings in CI environment
@@ -240,6 +260,8 @@ def _compare_tp(
        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
+    if task != "auto":
+        common_args.extend(["--task", task])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
@@ -297,8 +319,8 @@ def _compare_tp(
 @pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend",
+    ("model_name", "parallel_setup", "distributed_backend", "task",
-     "trust_remote_code", "tokenizer_mode"),
+     "test_options"),
    [
        params for model_name, settings in GENERATION_MODEL_SETTINGS.items()
        for params in settings.iter_params(model_name)
@@ -310,22 +332,22 @@ def test_tp_language_generation(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
-    trust_remote_code: bool,
+    task: TaskOption,
-    tokenizer_mode: Optional[str],
+    test_options: PPTestOptions,
    num_gpus_available,
 ):
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
-                trust_remote_code,
+                task,
-                tokenizer_mode,
+                test_options,
                num_gpus_available,
                method="generate")
 @pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend",
+    ("model_name", "parallel_setup", "distributed_backend", "task",
-     "trust_remote_code", "tokenizer_mode"),
+     "test_options"),
    [
        params for model_name, settings in EMBEDDING_MODEL_SETTINGS.items()
        for params in settings.iter_params(model_name)
@@ -337,22 +359,22 @@ def test_tp_language_embedding(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
-    trust_remote_code: bool,
+    task: TaskOption,
-    tokenizer_mode: Optional[str],
+    test_options: PPTestOptions,
    num_gpus_available,
 ):
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
-                trust_remote_code,
+                task,
-                tokenizer_mode,
+                test_options,
                num_gpus_available,
                method="encode")
 @pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend",
+    ("model_name", "parallel_setup", "distributed_backend", "task",
-     "trust_remote_code", "tokenizer_mode"),
+     "test_options"),
    [
        params for model_name, settings in MULTIMODAL_MODEL_SETTINGS.items()
        for params in settings.iter_params(model_name)
@@ -364,14 +386,14 @@ def test_tp_multimodal_generation(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
-    trust_remote_code: bool,
+    task: TaskOption,
-    tokenizer_mode: Optional[str],
+    test_options: PPTestOptions,
    num_gpus_available,
 ):
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
-                trust_remote_code,
+                task,
-                tokenizer_mode,
+                test_options,
                num_gpus_available,
                method="generate")