Merge tag 'v0.6.6.post1' into v0.6.6.post1-dev

96ae75ad · zhuwenwen · f9f4a735 · 2339d59f · 96ae75ad · 96ae75ad
Commit 96ae75ad authored Jan 04, 2025 by zhuwenwen
20 changed files
--- a/tests/weight_loading/run_model_weight_loading_test.sh
+++ b/tests/weight_loading/run_model_weight_loading_test.sh
@@ -26,6 +26,10 @@ do
    export QUANTIZATION=${array[0]}
    export MODEL_NAME=${array[1]}
    export REVISION=${array[2]}
+    # If array length is larger than 3, then MIN_CAPABILITY is provided
+    if [ ${#array[@]} -gt 3 ]; then
+        export MIN_CAPABILITY=${array[3]}
+    fi
    pytest -s weight_loading/test_weight_loading.py || LOCAL_SUCCESS=$?
    if [[ $LOCAL_SUCCESS == 0 ]]; then

--- a/tests/weight_loading/test_weight_loading.py
+++ b/tests/weight_loading/test_weight_loading.py
 import os
+import pytest
 import torch
 from ..utils import models_path_prefix
+from vllm.platforms import current_platform
 MAX_MODEL_LEN = 1024
 MODEL_NAME = os.environ.get("MODEL_NAME",
                            os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq"))
 REVISION = os.environ.get("REVISION", "main")
 QUANTIZATION = os.environ.get("QUANTIZATION", "gptq_marlin")
+MIN_CAPABILITY = os.environ.get("MIN_CAPABILITY", "89")
+@pytest.mark.skipif(
+    not current_platform.has_device_capability(int(MIN_CAPABILITY)),
+    reason="Current system does not have minimum capability.")
 def test_weight_loading(vllm_runner):
    """
    Test parameter weight loading with tp>1.

--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
 import contextlib
-import functools
 import importlib
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union, Type
@@ -44,34 +42,6 @@ else:
        from torch.library import impl_abstract as register_fake
-def hint_on_error(fn):
-    @functools.wraps(fn)
-    def wrapper(*args, **kwargs):
-        try:
-            return fn(*args, **kwargs)
-        except NotImplementedError as e:
-            msg = (
-                "Error in calling custom op %s: %s\n"
-                "Not implemented or built, mostly likely because the current current device "
-                "does not support this kernel (less likely TORCH_CUDA_ARCH_LIST was set "
-                "incorrectly while building)")
-            logger.error(msg, fn.__name__, e)
-            raise NotImplementedError(msg % (fn.__name__, e)) from e
-        except AttributeError as e:
-            msg = (
-                "Error in calling custom op %s: %s\n"
-                "Possibly you have built or installed an obsolete version of vllm.\n"
-                "Please try a clean build and install of vllm,"
-                "or remove old built files such as vllm/*cpython*.so and build/ ."
-            )
-            logger.error(msg, fn.__name__, e)
-            raise e
-    return wrapper
 # activation ops
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
    torch.ops._C.silu_and_mul(out, x)
@@ -984,6 +954,114 @@ def cutlass_scaled_mm_azp(a: torch.Tensor,
    return out
+def cutlass_sparse_scaled_mm_supported(cuda_device_capability: int) -> bool:
+    return torch.ops._C.cutlass_sparse_scaled_mm_supported(
+        cuda_device_capability)
+def cutlass_sparse_compress(a: torch.Tensor) \
+    -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Compresses a sparse matrix for use with Cutlass sparse operations.
+    This function takes a dense tensor and compresses it into two components:
+    non-zero elements and metadata. The compressed representation is compatible
+    with Cutlass sparse kernels.
+    Args:
+        a (torch.Tensor): 
+            The input tensor to be compressed. Must have one of the following data types:
+            - `torch.int8`
+            - `torch.float8_e4m3fn`
+            - `torch.bfloat16`
+            - `torch.float16`
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: 
+            A tuple containing:
+            - `a_nzs` (torch.Tensor): A tensor containing non-zero elements of `a`.
+            - `a_meta` (torch.Tensor): A tensor containing metadata for the sparse representation.
+    Raises:
+        ValueError: If the compression operation fails.
+    Notes:
+        - The `a_meta` tensor has a data type of `torch.uint8`.
+        - Each metadata element encodes the sparsity of 4 non-zero elements (i.e., `elemsPerMetaElem = 4`).
+        - The shape of `a_nzs` is `(m, k // 2)`, where `m` and `k` are the dimensions of the input tensor.
+        - The shape of `a_meta` is `(m, k // 2 // elemsPerMetaElem)`.
+    """
+    assert (a.dtype in [
+        torch.int8, torch.float8_e4m3fn, torch.bfloat16, torch.float16
+    ])
+    assert (a.is_contiguous())
+    # a_meta.dtype: torch.uint8 so elemsPerMetaElem = 8b / 2b_per_nz = 4
+    elemsPerMetaElem = 4
+    m = a.shape[0]
+    k = a.shape[1]
+    assert (k % 2 == 0)
+    a_nzs = torch.empty((m, k // 2), dtype=a.dtype, device=a.device)
+    a_meta = torch.empty((m, k // 2 // elemsPerMetaElem),
+                         dtype=torch.uint8,
+                         device=a.device)
+    if not (torch.ops._C.cutlass_sparse_compress_entry(a_nzs, a_meta, a)):
+        raise ValueError
+    assert (a_nzs.is_contiguous())
+    assert (a_meta.is_contiguous())
+    return a_nzs, a_meta
+def cutlass_scaled_sparse_mm(
+        a: torch.Tensor,
+        bt_nzs: torch.Tensor,
+        bt_meta: torch.Tensor,
+        scale_a: torch.Tensor,
+        scale_b: torch.Tensor,
+        out_dtype: torch.dtype,
+        bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    """
+    Performs a scaled sparse matrix multiplication using Cutlass.
+    Steps:
+    1. Create a dense matrix `a` of shape (m, k) on the CUDA device:
+    `a = torch.randn((m, k), device='cuda')`.
+    2. Create a dense matrix `b` of shape (k, n) on the CUDA device:
+    `b = torch.randn((k, n), device='cuda')`.
+    3. Prune matrix `b` to 2:4 sparsity along the specified dimension:
+    `b = prune_to_2_4(b, dim=0)`.
+    4. Compress the transposed sparse matrix `b.t()`:
+    `bt_nzs, bt_meta = cutlass_sparse_compress(b.t())`.
+    5. Perform sparse matrix multiplication using the compressed matrix,
+    applying scaling factors for `a` and `b`, and the output data type:
+    `out = cutlass_scaled_sparse_mm(a, bt_nzs, bt_meta, scale_a, scale_b, out_dtype)`.
+    Returns:
+    - The result of the scaled sparse matrix multiplication.
+    """
+    assert (bt_nzs.shape[0] % 16 == 0 and bt_nzs.shape[1] % 16 == 0)
+    assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
+    assert bias is None or bias.shape[0] == bt_nzs.shape[0] \
+        and bias.dtype == out_dtype
+    m = a.shape[0]
+    n = bt_nzs.shape[0]
+    out = torch.empty((m, n), dtype=out_dtype, device=a.device)
+    torch.ops._C.cutlass_scaled_sparse_mm(out, a, bt_nzs, bt_meta, scale_a,
+                                          scale_b, bias)
+    return out
 # aqlm
 def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor,
              codebooks: torch.Tensor, scales: torch.Tensor,
@@ -1426,6 +1504,7 @@ def register_graph_buffers(fa: int, handles: List[List[int]],
                           offsets: List[List[int]]) -> None:
    torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets)
 def read_cache(
        keys: torch.Tensor,
        values: torch.Tensor,
@@ -1449,26 +1528,3 @@ def write_cache_multi_layers(
    torch.ops._C_cache_ops.write_cache_multi_layers(keys, values, key_caches,
                                                    value_caches, slot_mapping,
                                                    kv_cache_dtype)
-# temporary fix for https://github.com/vllm-project/vllm/issues/5456
-# TODO: remove this in v0.6.0
-names_and_values = globals()
-names_and_values_to_update = {}
-# prepare variables to avoid dict size change during iteration
-k, v, arg = None, None, None
-fn_type = type(lambda x: x)
-for k, v in names_and_values.items():
-    # find functions that are defined in this file and have torch.Tensor
-    # in their annotations. `arg == "torch.Tensor"` is used to handle
-    # the case when users use `import __annotations__` to turn type
-    # hints into strings.
-    if isinstance(v, fn_type) \
-        and v.__code__.co_filename == __file__ \
-        and any(arg is torch.Tensor or arg == "torch.Tensor"
-                for arg in v.__annotations__.values()):
-        names_and_values_to_update[k] = hint_on_error(v)
-names_and_values.update(names_and_values_to_update)
-del names_and_values_to_update, names_and_values, v, k, fn_type
--- a/vllm/adapter_commons/models.py
+++ b/vllm/adapter_commons/models.py
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Dict, Hashable, Optional, TypeVar
+from typing import Any, Callable, Dict, Optional, TypeVar
 from torch import nn
@@ -24,14 +24,13 @@ class AdapterModel(ABC):
 T = TypeVar('T')
-class AdapterLRUCache(LRUCache[T]):
+class AdapterLRUCache(LRUCache[int, T]):
-    def __init__(self, capacity: int, deactivate_fn: Callable[[Hashable],
+    def __init__(self, capacity: int, deactivate_fn: Callable[[int], object]):
-                                                              None]):
        super().__init__(capacity)
        self.deactivate_fn = deactivate_fn
-    def _on_remove(self, key: Hashable, value: Optional[T]):
+    def _on_remove(self, key: int, value: Optional[T]):
        logger.debug("Removing adapter int id: %d", key)
        self.deactivate_fn(key)
        return super()._on_remove(key, value)

--- a/vllm/assets/audio.py
+++ b/vllm/assets/audio.py
 from dataclasses import dataclass
-from typing import Literal, Tuple
+from typing import Literal
 from urllib.parse import urljoin
-import librosa
+import numpy.typing as npt
-import numpy as np
-from vllm.assets.base import get_vllm_public_assets, vLLM_S3_BUCKET_URL
+from vllm.utils import PlaceholderModule
+from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets
+try:
+    import librosa
+except ImportError:
+    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
 ASSET_DIR = "multimodal_asset"
@@ -15,8 +21,7 @@ class AudioAsset:
    name: Literal["winning_call", "mary_had_lamb"]
    @property
-    def audio_and_sample_rate(self) -> Tuple[np.ndarray, int]:
+    def audio_and_sample_rate(self) -> tuple[npt.NDArray, int]:
        audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg",
                                            s3_prefix=ASSET_DIR)
        y, sr = librosa.load(audio_path, sr=None)
@@ -25,4 +30,4 @@ class AudioAsset:
    @property
    def url(self) -> str:
-        return urljoin(vLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg")
+        return urljoin(VLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg")
--- a/vllm/assets/base.py
+++ b/vllm/assets/base.py
@@ -4,9 +4,8 @@ from typing import Optional
 import vllm.envs as envs
 from vllm.connections import global_http_connection
-from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT
-vLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"
+VLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"
 def get_cache_dir() -> Path:
@@ -32,8 +31,8 @@ def get_vllm_public_assets(filename: str,
        if s3_prefix is not None:
            filename = s3_prefix + "/" + filename
        global_http_connection.download_file(
-            f"{vLLM_S3_BUCKET_URL}/{filename}",
+            f"{VLLM_S3_BUCKET_URL}/{filename}",
            asset_path,
-            timeout=VLLM_IMAGE_FETCH_TIMEOUT)
+            timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT)
    return asset_path
--- a/vllm/assets/image.py
+++ b/vllm/assets/image.py
@@ -4,7 +4,7 @@ from typing import Literal
 import torch
 from PIL import Image
-from vllm.assets.base import get_vllm_public_assets
+from .base import get_vllm_public_assets
 VLM_IMAGES_DIR = "vision_model_images"
@@ -15,7 +15,6 @@ class ImageAsset:
    @property
    def pil_image(self) -> Image.Image:
        image_path = get_vllm_public_assets(filename=f"{self.name}.jpg",
                                            s3_prefix=VLM_IMAGES_DIR)
        return Image.open(image_path)

--- a/vllm/assets/video.py
+++ b/vllm/assets/video.py
@@ -2,13 +2,13 @@ from dataclasses import dataclass
 from functools import lru_cache
 from typing import List, Literal
+import cv2
 import numpy as np
 import numpy.typing as npt
 from huggingface_hub import hf_hub_download
 from PIL import Image
-from vllm.multimodal.utils import (sample_frames_from_video,
+from vllm.multimodal.video import sample_frames_from_video
-                                   try_import_video_packages)
 from .base import get_cache_dir
@@ -19,7 +19,7 @@ def download_video_asset(filename: str) -> str:
    Download and open an image from huggingface
    repo: raushan-testing-hf/videos-test
    """
-    video_directory = get_cache_dir() / "video-eample-data"
+    video_directory = get_cache_dir() / "video-example-data"
    video_directory.mkdir(parents=True, exist_ok=True)
    video_path = video_directory / filename
@@ -35,8 +35,6 @@ def download_video_asset(filename: str) -> str:
 def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
-    cv2, _ = try_import_video_packages()
    cap = cv2.VideoCapture(path)
    if not cap.isOpened():
        raise ValueError(f"Could not open video file {path}")
@@ -59,7 +57,6 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
 def video_to_pil_images_list(path: str,
                             num_frames: int = -1) -> List[Image.Image]:
-    cv2, _ = try_import_video_packages()
    frames = video_to_ndarrays(path, num_frames)
    return [
        Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -447,7 +447,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
        Returns:
            shape = [num_tokens, num_heads * head_size]
        """
-        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.md
        # If the feature combo become valid
        if attn_type != AttentionType.DECODER:
            raise NotImplementedError("Encoder self-attention and "

--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -191,6 +191,7 @@ class MultiHeadAttention(nn.Module):
                                        kv_cache_dtype=None,
                                        block_size=16,
                                        is_attention_free=False)
+        attn_backend = backend_name_to_enum(attn_backend.get_name())
        if attn_backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
            attn_backend = _Backend.XFORMERS

--- a/vllm/benchmarks/benchmark_throughput.py
+++ b/vllm/benchmarks/benchmark_throughput.py
@@ -4,7 +4,8 @@ import dataclasses
 import json
 import random
 import time
-from typing import List, Optional
+from functools import cache
+from typing import Dict, List, Optional, Tuple
 import numpy as np
 import torch
@@ -20,8 +21,11 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)
 from vllm.inputs import TextPrompt
+from vllm.lora.request import LoRARequest
+from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
 from vllm.sampling_params import BeamSearchParams
+from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators
@@ -31,15 +35,17 @@ class SampleRequest:
    Attributes:
        prompt: The input text prompt for the model.
-        multi_modal_data: Optional dictionary containing multi-modal data (e.g.
-            images).
        prompt_len: The length of the prompt in tokens.
        expected_output_len: The expected length of the output in tokens.
+        multi_modal_data: Optional dictionary containing multi-modal data (e.g.
+            images).
+        lora_request: Optional LoRARequest specifying the LoRA to use. 
    """
    prompt: str
    prompt_len: int
    expected_output_len: int
    multi_modal_data: Optional[MultiModalDataDict] = None
+    lora_request: Optional[LoRARequest] = None
 def _get_prompt_for_image_model(question: str, *, model: str) -> str:
@@ -63,8 +69,30 @@ def _get_prompt_for_image_model(question: str, *, model: str) -> str:
    raise ValueError(f"Unsupported model {model}")
+@cache
+def lora_path_on_disk(lora_path: str) -> str:
+    return get_adapter_absolute_path(lora_path)
+lora_tokenizer_cache: Dict[int, AnyTokenizer] = {}
+def get_random_lora_request(
+        args: argparse.Namespace
+) -> Tuple[LoRARequest, Optional[AnyTokenizer]]:
+    global lora_tokenizer_cache
+    lora_id = random.randint(1, args.max_loras)
+    lora_request = LoRARequest(lora_name=str(lora_id),
+                               lora_int_id=lora_id,
+                               lora_path=lora_path_on_disk(args.lora_path))
+    if lora_id not in lora_tokenizer_cache:
+        lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
+    return lora_request, lora_tokenizer_cache[lora_id]
 def sample_requests(tokenizer: PreTrainedTokenizerBase,
                    args: argparse.Namespace) -> List[SampleRequest]:
    dataset_path: str = args.dataset
    num_requests: int = args.num_prompts
    fixed_output_len: Optional[int] = args.output_len
@@ -82,7 +110,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
    # Filter out sequences that are too long or too short
    filtered_dataset: List[SampleRequest] = []
-    for data in dataset:
+    for data in tqdm(dataset,
+                     total=len(filtered_dataset),
+                     desc="sampling requests"):
        if len(filtered_dataset) == num_requests:
            break
@@ -105,9 +135,16 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
                continue
            prompt = _get_prompt_for_image_model(question=prompt, model=model)
+        request_tokenizer = tokenizer
+        lora_request: Optional[LoRARequest] = None
+        if args.enable_lora:
+            lora_request, lora_tokenizer = get_random_lora_request(args)
+            if lora_tokenizer:
+                request_tokenizer = lora_tokenizer
        # Tokenize the prompts and completions.
-        prompt_token_ids = tokenizer(prompt).input_ids
+        prompt_token_ids = request_tokenizer(prompt).input_ids
-        completion_token_ids = tokenizer(completion).input_ids
+        completion_token_ids = request_tokenizer(completion).input_ids
        prompt_len = len(prompt_token_ids)
        output_len = len(completion_token_ids
                         ) if fixed_output_len is None else fixed_output_len
@@ -121,7 +158,8 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
            SampleRequest(prompt=prompt,
                          prompt_len=prompt_len,
                          expected_output_len=output_len,
-                          multi_modal_data=multi_modal_data))
+                          multi_modal_data=multi_modal_data,
+                          lora_request=lora_request))
    return filtered_dataset
@@ -150,11 +188,14 @@ def run_vllm(
                ignore_eos=True,
                max_tokens=request.expected_output_len,
            ))
+    lora_requests: Optional[List[LoRARequest]] = None
+    if engine_args.enable_lora:
+        lora_requests = [request.lora_request for request in requests]
    # warmup
    warmup_prompts: List[TextPrompt] = []
    warmup_sampling_params: List[SamplingParams] = []
-    for request in warmup_prompts:
+    for request in warmup_requests:
        warmup_prompts.append(
            TextPrompt(prompt=request.prompt,
                       multi_modal_data=request.multi_modal_data))
@@ -191,9 +232,13 @@ def run_vllm(
    if not use_beam_search:
        start = time.perf_counter()
-        llm.generate(prompts, sampling_params, use_tqdm=True)
+        llm.generate(prompts,
+                     sampling_params,
+                     lora_request=lora_requests,
+                     use_tqdm=True)
        end = time.perf_counter()
    else:
+        assert lora_requests is None, "BeamSearch API does not support LoRA"
        prompts = [request.prompt for request in requests]
        # output_len should be the same for all requests.
        output_len = requests[0][2]
@@ -225,6 +270,7 @@ async def run_vllm_async(
        # Add the requests to the engine.
        prompts: List[TextPrompt] = []
        sampling_params: List[SamplingParams] = []
+        lora_requests: List[Optional[LoRARequest]] = []
        for request in requests:
            prompts.append(
                TextPrompt(prompt=request.prompt,
@@ -237,11 +283,16 @@ async def run_vllm_async(
                    ignore_eos=True,
                    max_tokens=request.expected_output_len,
                ))
+            lora_requests.append(request.lora_request)
        generators = []
        start = time.perf_counter()
-        for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
+        for i, (prompt, sp,
-            generator = llm.generate(prompt, sp, request_id=f"test{i}")
+                lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
+            generator = llm.generate(prompt,
+                                     sp,
+                                     lora_request=lr,
+                                     request_id=f"test{i}")
            generators.append(generator)
        all_gens = merge_async_iterators(*generators)
        async for i, res in all_gens:
@@ -340,6 +391,14 @@ def main(args: argparse.Namespace):
        vocab_size = tokenizer.vocab_size
        requests = []
        for _ in range(args.num_prompts):
+            request_tokenizer = tokenizer
+            lora_request: Optional[LoRARequest] = None
+            if args.enable_lora:
+                lora_request, lora_tokenizer = get_random_lora_request(args)
+                if lora_tokenizer:
+                    request_tokenizer = lora_tokenizer
            # Synthesize a prompt with the given input length.
            candidate_ids = [
                random.randint(0, vocab_size - 1)
@@ -348,8 +407,8 @@ def main(args: argparse.Namespace):
            # As tokenizer may add additional tokens like BOS, we need to try
            # different lengths to get the desired input length.
            for _ in range(5):  # Max attempts to correct
-                candidate_prompt = tokenizer.decode(candidate_ids)
+                candidate_prompt = request_tokenizer.decode(candidate_ids)
-                tokenized_len = len(tokenizer.encode(candidate_prompt))
+                tokenized_len = len(request_tokenizer.encode(candidate_prompt))
                if tokenized_len == args.input_len:
                    break
@@ -366,40 +425,14 @@ def main(args: argparse.Namespace):
            requests.append(
                SampleRequest(prompt=candidate_prompt,
                              prompt_len=args.input_len,
-                              expected_output_len=args.output_len))
+                              expected_output_len=args.output_len,
+                              lora_request=lora_request))
    else:
        requests = sample_requests(tokenizer, args)
    is_multi_modal = any(request.multi_modal_data is not None
                         for request in requests)
    if args.backend == "vllm":
-        # if args.async_engine:
-        #     run_args = [
-        #         requests, args.model, args.tokenizer, args.quantization,
-        #         args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
-        #         args.trust_remote_code, args.dtype, args.max_model_len,
-        #         args.enforce_eager, args.kv_cache_dtype,
-        #         args.quantization_param_path, args.device,
-        #         args.enable_prefix_caching, args.enable_chunked_prefill,
-        #         args.max_num_batched_tokens, args.distributed_executor_backend,
-        #         args.gpu_memory_utilization, args.num_scheduler_steps,
-        #         args.use_v2_block_manager, args.download_dir, args.load_format,
-        #         args.disable_async_output_proc
-        #     ]
-        # else:
-        #     run_args = [
-        #         warmup_requests, requests, args.model, args.tokenizer, args.quantization,
-        #         args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
-        #         args.trust_remote_code, args.dtype, args.max_model_len,
-        #         args.enforce_eager, args.kv_cache_dtype,
-        #         args.quantization_param_path, args.device,
-        #         args.enable_prefix_caching, args.enable_chunked_prefill,
-        #         args.max_num_batched_tokens, args.distributed_executor_backend,
-        #         args.gpu_memory_utilization, args.num_scheduler_steps,
-        #         args.use_v2_block_manager, args.download_dir, args.load_format,
-        #         args.disable_async_output_proc
-        #     ]
        if args.async_engine:
            elapsed_time = uvloop.run(
                run_vllm_async(
@@ -409,7 +442,7 @@ def main(args: argparse.Namespace):
                    args.disable_frontend_multiprocessing,
                ))
        else:
-            elapsed_time = run_vllm(requests, args.n,
+            elapsed_time = run_vllm(warmup_requests, requests, args.n,
                                    EngineArgs.from_cli_args(args))
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
@@ -496,6 +529,14 @@ if __name__ == "__main__":
                        action='store_true',
                        default=False,
                        help="Disable decoupled async engine frontend.")
+    # LoRA
+    parser.add_argument(
+        "--lora-path",
+        type=str,
+        default=None,
+        help="Path to the lora adapters to use. This can be an absolute path, "
+        "a relative path, or a Hugging Face model identifier.")
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    if args.tokenizer is None:
@@ -505,6 +546,8 @@ if __name__ == "__main__":
        assert args.output_len is not None
    else:
        assert args.input_len is None
+    if args.enable_lora:
+        assert args.lora_path is not None
    if args.backend == "vllm":
        if args.hf_max_batch_size is not None:
@@ -514,6 +557,9 @@ if __name__ == "__main__":
            raise ValueError("HF max batch size is required for HF backend.")
        if args.quantization is not None:
            raise ValueError("Quantization is only for vLLM backend.")
+        if args.enable_lora is not None:
+            raise ValueError("LoRA benchmarking is only supported for vLLM"
+                             " backend")
    elif args.backend == "mii":
        if args.dtype != "auto":
            raise ValueError("dtype must be auto for MII backend.")
@@ -526,4 +572,7 @@ if __name__ == "__main__":
        if args.tokenizer != args.model:
            raise ValueError("Tokenizer must be the same as the model for MII "
                             "backend.")
+        if args.enable_lora is not None:
+            raise ValueError("LoRA benchmarking is only supported for vLLM"
+                             " backend")
    main(args)
\ No newline at end of file
--- a/vllm/block.py
+++ b/vllm/block.py
-"""Token blocks."""
-from typing import TYPE_CHECKING, Iterator, List, Optional
-from vllm.utils import Device
-DEFAULT_LAST_ACCESSED_TIME: float = -1
-class PhysicalTokenBlock:
-    """Represents the state of a block in the KV cache."""
-    def __init__(
-        self,
-        device: Device,
-        block_number: int,
-        block_size: int,
-        block_hash: int,
-        num_hashed_tokens: int,
-    ) -> None:
-        self.device = device
-        self.block_number = block_number
-        self.block_size = block_size
-        self.block_hash = block_hash
-        self.num_hashed_tokens = num_hashed_tokens
-        self.ref_count = 0
-        self.last_accessed = DEFAULT_LAST_ACCESSED_TIME
-        self.computed = False
-    def __repr__(self) -> str:
-        return (f'PhysicalTokenBlock(device={self.device}, '
-                f'block_number={self.block_number}, '
-                f'num_hashed_tokens={self.num_hashed_tokens}, '
-                f'ref_count={self.ref_count}, '
-                f'last_accessed={self.last_accessed}, '
-                f'computed={self.computed})')
-class BlockTable:
-    """Holds a list of blocks with caching of their associated block_ids 
-    """
-    def __init__(self, blocks: Optional[List[PhysicalTokenBlock]] = None):
-        self._blocks: List[PhysicalTokenBlock] = []
-        self._block_ids: List[int] = []
-        if blocks is not None:
-            for block in blocks:
-                self.append(block)
-    def append(self, block: PhysicalTokenBlock):
-        self._blocks.append(block)
-        self._block_ids.append(block.block_number)
-    def __len__(self) -> int:
-        return len(self._blocks)
-    def __getitem__(self, key):
-        return self._blocks[key]
-    if TYPE_CHECKING:
-        def __iter__(self) -> Iterator[PhysicalTokenBlock]:
-            raise RuntimeError("Method should be automatically generated")
-    def __setitem__(self, key, value):
-        if isinstance(key, slice):
-            blocks = value
-            self._blocks[key] = blocks
-            self._block_ids[key] = [b.block_number for b in blocks]
-        else:
-            block = value
-            self._blocks[key] = block
-            self._block_ids[key] = block.block_number
-    def reset(self):
-        self._blocks = []
-        self._block_ids = []
-    def copy(self) -> "BlockTable":
-        return BlockTable(self._blocks)
-    def list(self) -> List[PhysicalTokenBlock]:
-        return self._blocks
-    def ids(self) -> List[int]:
-        return self._block_ids
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -141,14 +141,14 @@ class AlwaysHitShapeEnv:
        return ""
-def wrap_inductor(graph,
+def wrap_inductor(graph: fx.GraphModule,
                  example_inputs,
                  additional_inductor_config,
                  compilation_config: CompilationConfig,
                  graph_index: int = 0,
                  num_graphs: int = 1,
                  runtime_shape: Optional[int] = None,
-                  use_inductor: bool = True):
+                  use_inductor: bool = True) -> Any:
    if graph_index == 0:
        # before compiling the first graph, record the start time
        global compilation_start_time
@@ -208,7 +208,7 @@ def wrap_inductor(graph,
        from torch._inductor.compile_fx import graph_returns_tuple
        returns_tuple = graph_returns_tuple(graph)
-        # this is the graph we return to Dynamo to run
+        # this is the callable we return to Dynamo to run
        def compiled_graph(*args):
            # convert args to list
            list_args = list(args)
@@ -247,7 +247,7 @@ def wrap_inductor(graph,
            # see https://github.com/pytorch/pytorch/blob/9f5ebf3fc609105a74eab4ccc24932d6353ff566/torch/_inductor/codecache.py#L1221 # noqa
            return
-        def _get_shape_env():
+        def _get_shape_env() -> AlwaysHitShapeEnv:
            return AlwaysHitShapeEnv()
        with patch(# for hijacking the hash of the compiled graph
@@ -537,6 +537,7 @@ class VllmBackend:
            example_inputs[x].clone() for x in self.sym_tensor_indices
        ]
+        # this is the callable we return to Dynamo to run
        def copy_and_call(*args):
            list_args = list(args)
            for i, index in enumerate(self.sym_tensor_indices):

--- a/vllm/compilation/multi_output_match.py
+++ b/vllm/compilation/multi_output_match.py
@@ -7,6 +7,7 @@ from torch import fx
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
 from torch._inductor import pattern_matcher as pm
 from torch._ops import OpOverload
+from torch.fx import Node
 from vllm.compilation.fx_utils import find_auto_fn
@@ -97,7 +98,7 @@ class MultiOutputMatch(abc.ABC):
                self.graph.call_function(operator.getitem, (tuple_node, idx))
                for idx in indices)
-    def insert_auto_fn(self, op: OpOverload, kwargs):
+    def insert_auto_fn(self, op: OpOverload, kwargs) -> Node:
        """
        Insert an auto_functionalized node with the given op and kwargs.
        """

--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
-from typing import List
+from typing import Any, Dict, List
 from torch import fx as fx
@@ -53,7 +53,7 @@ class PostGradPassManager:
        assert isinstance(pass_, InductorPass)
        self.passes.append(pass_)
-    def __getstate__(self):
+    def __getstate__(self) -> Dict[str, List[Any]]:
        """
        Custom pickling for the pass manager, as some passes cannot be pickled.
        Pickling occurs because the pass manager is set as the value of

--- a/vllm/config.py
+++ b/vllm/config.py
@@ -22,12 +22,15 @@ from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
                                                     get_quantization_config)
 from vllm.model_executor.models import ModelRegistry
-from vllm.platforms import current_platform
+from vllm.platforms import current_platform, interface
 from vllm.tracing import is_otel_available, otel_import_error_traceback
 from vllm.transformers_utils.config import (
    ConfigFormat, get_config, get_hf_image_processor_config,
    get_hf_text_config, get_pooling_config,
-    get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope)
+    get_sentence_transformer_tokenizer_config, is_encoder_decoder,
+    try_get_generation_config, uses_mrope)
+from vllm.transformers_utils.s3_utils import S3Model
+from vllm.transformers_utils.utils import is_s3
 from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
                        get_cpu_memory, print_warning_once, random_uuid,
                        resolve_obj_by_qualname)
@@ -148,9 +151,8 @@ class ModelConfig:
            HuggingFace config.
        mm_processor_kwargs: Arguments to be forwarded to the model's processor
            for multi-modal data, e.g., image processor.
-        mm_cache_preprocessor: If true, then enables caching of the multi-modal 
+        disable_mm_preprocessor_cache: If true, then disables caching of the
-            preprocessor/mapper. Otherwise, the mapper executes each time, and 
+            multi-modal preprocessor/mapper. (not recommended)
-            for better performance consider enabling frontend process.
        override_neuron_config: Initialize non default neuron config or
            override default neuron config that are specific to Neuron devices,
            this argument will be used to configure the neuron config that
@@ -159,8 +161,9 @@ class ModelConfig:
            override default pooling config for the pooling model.
        logits_processor_pattern: Optional regex pattern specifying valid
            logits processor qualified names that can be passed with the
-            `logits_processors` extra completion argument. Defaults to None, 
+            `logits_processors` extra completion argument. Defaults to None,
            which allows no processors.
+        generation_config: Configuration parameter file for generation.
    """
    def compute_hash(self) -> str:
@@ -216,10 +219,11 @@ class ModelConfig:
                 config_format: ConfigFormat = ConfigFormat.AUTO,
                 hf_overrides: Optional[HfOverrides] = None,
                 mm_processor_kwargs: Optional[Dict[str, Any]] = None,
-                 mm_cache_preprocessor: bool = False,
+                 disable_mm_preprocessor_cache: bool = False,
                 override_neuron_config: Optional[Dict[str, Any]] = None,
                 override_pooler_config: Optional["PoolerConfig"] = None,
-                 logits_processor_pattern: Optional[str] = None) -> None:
+                 logits_processor_pattern: Optional[str] = None,
+                 generation_config: Optional[str] = None) -> None:
        self.model = model
        self.tokenizer = tokenizer
        self.tokenizer_mode = tokenizer_mode
@@ -254,6 +258,8 @@ class ModelConfig:
                   f"'Please instead use `--hf-overrides '{hf_override!r}'`")
            warnings.warn(DeprecationWarning(msg), stacklevel=2)
+        self.maybe_pull_model_tokenizer_for_s3(model, tokenizer)
        # The tokenizer version is consistent with the model version by default.
        if tokenizer_revision is None:
            self.tokenizer_revision = revision
@@ -286,7 +292,7 @@ class ModelConfig:
        self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
        self.use_async_output_proc = use_async_output_proc
        self.mm_processor_kwargs = mm_processor_kwargs
-        self.mm_cache_preprocessor = mm_cache_preprocessor
+        self.disable_mm_preprocessor_cache = disable_mm_preprocessor_cache
        # Set enforce_eager to False if the value is unset.
        if self.enforce_eager is None:
@@ -349,10 +355,36 @@ class ModelConfig:
        self.pooler_config = self._init_pooler_config(override_pooler_config)
        self.logits_processor_pattern = logits_processor_pattern
+        self.generation_config = generation_config
        self._verify_quantization()
        self._verify_cuda_graph()
        self._verify_bnb_config()
+    def maybe_pull_model_tokenizer_for_s3(self, model: str,
+                                          tokenizer: str) -> None:
+        """
+        Pull the model config or tokenizer to a temporary
+        directory in case of S3.
+        Args:
+            model: The model name or path.
+            tokenizer: The tokenizer name or path.
+        """
+        if is_s3(model) or is_s3(tokenizer):
+            if is_s3(model):
+                self.s3_model = S3Model()
+                self.s3_model.pull_files(model, allow_pattern=["*config.json"])
+                self.model_weights = self.model
+                self.model = self.s3_model.dir
+            if is_s3(tokenizer):
+                self.s3_tokenizer = S3Model()
+                self.s3_tokenizer.pull_files(
+                    model, ignore_pattern=["*.pt", "*.safetensors", "*.bin"])
+                self.tokenizer = self.s3_tokenizer.dir
    def _init_multimodal_config(
        self, limit_mm_per_prompt: Optional[Mapping[str, int]]
    ) -> Optional["MultiModalConfig"]:
@@ -564,6 +596,12 @@ class ModelConfig:
        self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                          self.max_model_len)
+        if (self.hf_config.model_type == 'deepseek_v3'
+                and not self.enforce_eager):
+            logger.warning("CUDA graph is not supported for Deepseek V3 yet, "
+                           "fallback to the eager mode.")
+            self.enforce_eager = True
    def _verify_bnb_config(self) -> None:
        """
        The current version of bitsandbytes (0.44.0) with 8-bit models does not
@@ -598,7 +636,7 @@ class ModelConfig:
            self.use_async_output_proc = False
            return
-        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.md
        # If the feature combo become valid
        if not current_platform.is_async_output_supported(self.enforce_eager):
            logger.warning(
@@ -618,7 +656,7 @@ class ModelConfig:
        if self.runner_type == "pooling":
            self.use_async_output_proc = False
-        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.md
        # If the feature combo become valid
        if speculative_config:
            logger.warning("Async output processing is not supported with"
@@ -680,8 +718,9 @@ class ModelConfig:
    def get_head_size(self) -> int:
        # TODO remove hard code
-        if hasattr(self.hf_text_config, "model_type"
+        if hasattr(self.hf_text_config,
-                   ) and self.hf_text_config.model_type == 'deepseek_v2':
+                   "model_type") and (self.hf_text_config.model_type
+                                      in ('deepseek_v2', 'deepseek_v3')):
            # FlashAttention supports only head_size 32, 64, 128, 256,
            # we need to pad head_size 192 to 256
            return 256
@@ -814,6 +853,56 @@ class ModelConfig:
        return self.multimodal_config
+    def try_get_generation_config(self) -> Dict[str, Any]:
+        if self.generation_config is None or self.generation_config == "auto":
+            config = try_get_generation_config(
+                self.model,
+                trust_remote_code=self.trust_remote_code,
+                revision=self.revision,
+            )
+        else:
+            config = try_get_generation_config(
+                self.generation_config,
+                trust_remote_code=self.trust_remote_code,
+            )
+        if config is None:
+            return {}
+        return config.to_diff_dict()
+    def get_diff_sampling_param(self) -> Dict[str, Any]:
+        """
+        This method returns a dictionary containing the parameters
+        that differ from the default sampling parameters, but only
+        if `generation_config` is set. If `generation_config` is not
+        set, an empty dictionary is returned.
+        Returns:
+            Dict[str, Any]: A dictionary with the differing sampling
+            parameters if `generation_config` is set, otherwise an
+            empty dictionary.
+        """
+        if self.generation_config is None:
+            # When generation_config is not set
+            return {}
+        config = self.try_get_generation_config()
+        available_params = [
+            "repetition_penalty",
+            "temperature",
+            "top_k",
+            "top_p",
+            "min_p",
+        ]
+        if any(p in config for p in available_params):
+            diff_sampling_param = {
+                p: config.get(p)
+                for p in available_params if config.get(p) is not None
+            }
+        else:
+            diff_sampling_param = {}
+        return diff_sampling_param
    @property
    def is_encoder_decoder(self) -> bool:
        """Extract the HF encoder/decoder model flag."""
@@ -917,6 +1006,10 @@ class CacheConfig:
            raise ValueError(
                "GPU memory utilization must be less than 1.0. Got "
                f"{self.gpu_memory_utilization}.")
+        if (current_platform.is_cuda() and self.block_size is not None
+                and self.block_size > 32):
+            raise ValueError("CUDA Paged Attention kernel only supports "
+                             f"block sizes up to 32. Got {self.block_size}.")
    def _verify_cache_dtype(self) -> None:
        if self.cache_dtype == "auto":
@@ -1041,6 +1134,7 @@ class LoadFormat(str, enum.Enum):
    GGUF = "gguf"
    BITSANDBYTES = "bitsandbytes"
    MISTRAL = "mistral"
+    RUNAI_STREAMER = "runai_streamer"
 @dataclass
@@ -1977,7 +2071,7 @@ class LoRAConfig:
                           model_config.quantization)
    def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
-        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.md
        # If the feature combo become valid
        if scheduler_config.chunked_prefill_enabled:
            logger.warning("LoRA with chunked prefill is still experimental "
@@ -2155,6 +2249,17 @@ def _get_and_verify_dtype(
            else:
                torch_dtype = config_dtype
+            if (current_platform.is_cpu()
+                    and current_platform.get_cpu_architecture()
+                    == interface.CpuArchEnum.POWERPC
+                    and (config_dtype == torch.float16
+                         or config_dtype == torch.float32)):
+                logger.info(
+                    "For POWERPC, we cast models to bfloat16 instead of "
+                    "using float16 by default. Float16 is not currently "
+                    "supported for POWERPC.")
+                torch_dtype = torch.bfloat16
            if current_platform.is_hpu() and config_dtype == torch.float16:
                logger.info(
                    "For HPU, we cast models to bfloat16 instead of"
@@ -3165,7 +3270,7 @@ class VllmConfig:
            f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
            f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, "  # noqa
            f"use_async_output_proc={self.model_config.use_async_output_proc}, "
-            f"mm_cache_preprocessor={self.model_config.mm_cache_preprocessor!r}, "  # noqa
+            f"disable_mm_preprocessor_cache={self.model_config.disable_mm_preprocessor_cache!r}, "  # noqa
            f"mm_processor_kwargs={self.model_config.mm_processor_kwargs}, "
            f"pooler_config={self.model_config.pooler_config!r}, "
            f"compilation_config={self.compilation_config!r}")

--- a/vllm/core/evictor.py
+++ b/vllm/core/evictor.py
@@ -13,7 +13,7 @@ class EvictionPolicy(enum.Enum):
 class Evictor(ABC):
    """The Evictor subclasses should be used by the BlockAllocator class to
-    handle eviction of freed PhysicalTokenBlocks.
+    handle eviction of freed Blocks.
    """
    @abstractmethod
@@ -70,7 +70,7 @@ class BlockMetaData:
 class LRUEvictor(Evictor):
    """Evicts in a least-recently-used order using the last_accessed timestamp
-    that's recorded in the PhysicalTokenBlock. If there are multiple blocks with
+    that's recorded in the Block. If there are multiple blocks with
    the same last_accessed time, then the one with the largest num_hashed_tokens
    will be evicted. If two blocks each have the lowest last_accessed time and
    highest num_hashed_tokens value, then one will be chose arbitrarily

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -141,7 +141,7 @@ class EngineArgs:
    tokenizer_pool_extra_config: Optional[Dict[str, Any]] = None
    limit_mm_per_prompt: Optional[Mapping[str, int]] = None
    mm_processor_kwargs: Optional[Dict[str, Any]] = None
-    mm_cache_preprocessor: bool = False
+    disable_mm_preprocessor_cache: bool = False
    enable_lora: bool = False
    enable_lora_bias: bool = False
    max_loras: int = 1
@@ -200,6 +200,8 @@ class EngineArgs:
    kv_transfer_config: Optional[KVTransferConfig] = None
+    generation_config: Optional[str] = None
    def __post_init__(self):
        if not self.tokenizer:
            self.tokenizer = self.model
@@ -208,6 +210,7 @@ class EngineArgs:
        # by user.
        if self.enable_prefix_caching is None:
            self.enable_prefix_caching = bool(envs.VLLM_USE_V1)
        # Override max_num_seqs if it's not set by user.
        if self.max_num_seqs is None:
            self.max_num_seqs = 256 if not envs.VLLM_USE_V1 else 1024
@@ -316,6 +319,8 @@ class EngineArgs:
            '* "tensorizer" will load the weights using tensorizer from '
            'CoreWeave. See the Tensorize vLLM Model script in the Examples '
            'section for more information.\n'
+            '* "runai_streamer" will load the Safetensors weights using Run:ai'
+            'Model Streamer \n'
            '* "bitsandbytes" will load the weights using bitsandbytes '
            'quantization.\n')
        parser.add_argument(
@@ -371,7 +376,7 @@ class EngineArgs:
            choices=['outlines', 'lm-format-enforcer', 'xgrammar'],
            help='Which engine will be used for guided decoding'
            ' (JSON schema / regex etc) by default. Currently support '
-            'https://github.com/outlines-dev/outlines,'
+            'https://github.com/outlines-dev/outlines, '
            'https://github.com/mlc-ai/xgrammar, and '
            'https://github.com/noamgat/lm-format-enforcer.'
            ' Can be overridden per request via guided_decoding_backend'
@@ -426,10 +431,12 @@ class EngineArgs:
        parser.add_argument('--block-size',
                            type=int,
                            default=EngineArgs.block_size,
-                            choices=[8, 16, 32],
+                            choices=[8, 16, 32, 64, 128],
                            help='Token block size for contiguous chunks of '
                            'tokens. This is ignored on neuron devices and '
-                            'set to max-model-len')
+                            'set to max-model-len. On CUDA devices, '
+                            'only block sizes up to 32 are supported. '
+                            'On HPU devices, block size defaults to 128.')
        parser.add_argument(
            "--enable-prefix-caching",
@@ -606,11 +613,10 @@ class EngineArgs:
            help=('Overrides for the multimodal input mapping/processing, '
                  'e.g., image processor. For example: {"num_crops": 4}.'))
        parser.add_argument(
-            '--mm-cache-preprocessor',
+            '--disable-mm-preprocessor-cache',
            action='store_true',
-            help='If true, then enables caching of the multi-modal '
+            help='If true, then disables caching of the multi-modal '
-            'preprocessor/mapper. Otherwise, the mapper executes each time'
+            'preprocessor/mapper. (not recommended)')
-            ', and for better performance consider enabling frontend process.')
        # LoRA related configs
        parser.add_argument('--enable-lora',
@@ -957,6 +963,16 @@ class EngineArgs:
            default="auto",
            help='The worker class to use for distributed execution.')
+        parser.add_argument(
+            "--generation-config",
+            type=nullable_str,
+            default=None,
+            help="The folder path to the generation config. "
+            "Defaults to None, will use the default generation config in vLLM. "
+            "If set to 'auto', the generation config will be automatically "
+            "loaded from model. If set to a folder path, the generation config "
+            "will be loaded from the specified folder path.")
        return parser
    @classmethod
@@ -997,10 +1013,11 @@ class EngineArgs:
            use_async_output_proc=not self.disable_async_output_proc,
            config_format=self.config_format,
            mm_processor_kwargs=self.mm_processor_kwargs,
-            mm_cache_preprocessor=self.mm_cache_preprocessor,
+            disable_mm_preprocessor_cache=self.disable_mm_preprocessor_cache,
            override_neuron_config=self.override_neuron_config,
            override_pooler_config=self.override_pooler_config,
-            logits_processor_pattern=self.logits_processor_pattern)
+            logits_processor_pattern=self.logits_processor_pattern,
+            generation_config=self.generation_config)
    def create_load_config(self) -> LoadConfig:
        return LoadConfig(
@@ -1043,11 +1060,11 @@ class EngineArgs:
        device_config = DeviceConfig(device=self.device)
        model_config = self.create_model_config()
-        if model_config.is_multimodal_model:
+        if (model_config.is_multimodal_model and not envs.VLLM_USE_V1
-            if self.enable_prefix_caching:
+                and self.enable_prefix_caching):
-                logger.warning(
+            logger.warning("--enable-prefix-caching is currently not "
-                    "--enable-prefix-caching is currently not "
+                           "supported for multimodal models in v0 and "
-                    "supported for multimodal models and has been disabled.")
+                           "has been disabled.")
            self.enable_prefix_caching = False
        cache_config = CacheConfig(
@@ -1149,7 +1166,7 @@ class EngineArgs:
            num_speculative_heads=self.num_speculative_heads
        )
-        # Reminder: Please update docs/source/usage/compatibility_matrix.rst
+        # Reminder: Please update docs/source/usage/compatibility_matrix.md
        # If the feature combo become valid
        if self.num_scheduler_steps > 1:
            if speculative_config is not None:
@@ -1269,11 +1286,14 @@ class EngineArgs:
        # When no user override, set the default values based on the usage
        # context.
        # TODO(woosuk): Tune the default values for different hardware.
-        if self.max_num_batched_tokens is None:
+        default_max_num_batched_tokens = {
-            if usage_context == UsageContext.LLM_CLASS:
+            UsageContext.LLM_CLASS: 8192,
-                self.max_num_batched_tokens = 8192
+            UsageContext.OPENAI_API_SERVER: 2048,
-            elif usage_context == UsageContext.OPENAI_API_SERVER:
+        }
-                self.max_num_batched_tokens = 2048
+        if (self.max_num_batched_tokens is None
+                and usage_context in default_max_num_batched_tokens):
+            self.max_num_batched_tokens = default_max_num_batched_tokens[
+                usage_context]
            logger.warning(
                "Setting max_num_batched_tokens to %d for %s usage context.",
                self.max_num_batched_tokens, usage_context.value)
@@ -1283,9 +1303,6 @@ class EngineArgs:
        Override the EngineConfig's configs based on the usage context for V1.
        """
        assert envs.VLLM_USE_V1, "V1 is not enabled"
-        if engine_config.model_config.is_multimodal_model:
-            # TODO (ywang96): Enable APC by default when VLM supports it.
-            assert not engine_config.cache_config.enable_prefix_caching
 @dataclass

--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -1256,3 +1256,10 @@ class AsyncLLMEngine(EngineClient):
            self.engine.model_executor.stop_profile()
        else:
            self.engine.model_executor._run_workers("stop_profile")
+# TODO(v1): Remove this class proxy when V1 goes default.
+if envs.VLLM_USE_V1:
+    from vllm.v1.engine.async_llm import AsyncLLM
+    AsyncLLMEngine = AsyncLLM  # type: ignore
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -6,8 +6,8 @@ from collections import deque
 from contextlib import contextmanager
 from dataclasses import dataclass
 from functools import partial
-from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict,
+from typing import (TYPE_CHECKING, Callable, ClassVar, Deque, Dict, Iterable,
-                    Iterable, List, Mapping, NamedTuple, Optional)
+                    List, Mapping, NamedTuple, Optional)
 from typing import Sequence as GenericSequence
 from typing import Set, Type, Union, cast, overload
@@ -53,7 +53,6 @@ from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup,
                           SequenceGroupOutput, SequenceStatus, CompletionSequenceGroupOutput, VLLM_INVALID_TOKEN_ID)
 from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context,
                          init_tracer)
-from vllm.transformers_utils.config import try_get_generation_config
 from vllm.transformers_utils.detokenizer import Detokenizer
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizer_group import (
@@ -66,20 +65,6 @@ from vllm.version import __version__ as VLLM_VERSION
 logger = init_logger(__name__)
 _LOCAL_LOGGING_INTERVAL_SEC = 5
-def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]:
-    config = try_get_generation_config(
-        model_config.model,
-        trust_remote_code=model_config.trust_remote_code,
-        revision=model_config.revision,
-    )
-    if config is None:
-        return {}
-    return config.to_diff_dict()
 _G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup)
 _O = TypeVar("_O", RequestOutput, PoolingRequestOutput)
@@ -149,7 +134,7 @@ class LLMEngine:
    and the :class:`AsyncLLMEngine` class wraps this class for online serving.
    The config arguments are derived from :class:`~vllm.EngineArgs`. (See
-    :ref:`engine_args`)
+    :ref:`engine-args`)
    Args:
        model_config: The configuration related to the LLM model.
@@ -275,8 +260,8 @@ class LLMEngine:
            return tokenizer_group.get_lora_tokenizer(sequence.lora_request)
        self.seq_counter = Counter()
-        self.generation_config_fields = _load_generation_config_dict(
+        self.generation_config_fields = (
-            self.model_config)
+            self.model_config.try_get_generation_config())
        self.input_preprocessor = InputPreprocessor(self.model_config,
                                                    self.tokenizer,