Commit 96ae75ad authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.6.post1' into v0.6.6.post1-dev

parents f9f4a735 2339d59f
...@@ -26,6 +26,10 @@ do ...@@ -26,6 +26,10 @@ do
export QUANTIZATION=${array[0]} export QUANTIZATION=${array[0]}
export MODEL_NAME=${array[1]} export MODEL_NAME=${array[1]}
export REVISION=${array[2]} export REVISION=${array[2]}
# If array length is larger than 3, then MIN_CAPABILITY is provided
if [ ${#array[@]} -gt 3 ]; then
export MIN_CAPABILITY=${array[3]}
fi
pytest -s weight_loading/test_weight_loading.py || LOCAL_SUCCESS=$? pytest -s weight_loading/test_weight_loading.py || LOCAL_SUCCESS=$?
if [[ $LOCAL_SUCCESS == 0 ]]; then if [[ $LOCAL_SUCCESS == 0 ]]; then
......
import os import os
import pytest
import torch import torch
from ..utils import models_path_prefix from ..utils import models_path_prefix
from vllm.platforms import current_platform
MAX_MODEL_LEN = 1024 MAX_MODEL_LEN = 1024
MODEL_NAME = os.environ.get("MODEL_NAME", MODEL_NAME = os.environ.get("MODEL_NAME",
os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq")) os.path.join(models_path_prefix, "robertgshaw2/zephyr-7b-beta-channelwise-gptq"))
REVISION = os.environ.get("REVISION", "main") REVISION = os.environ.get("REVISION", "main")
QUANTIZATION = os.environ.get("QUANTIZATION", "gptq_marlin") QUANTIZATION = os.environ.get("QUANTIZATION", "gptq_marlin")
MIN_CAPABILITY = os.environ.get("MIN_CAPABILITY", "89")
@pytest.mark.skipif(
not current_platform.has_device_capability(int(MIN_CAPABILITY)),
reason="Current system does not have minimum capability.")
def test_weight_loading(vllm_runner): def test_weight_loading(vllm_runner):
""" """
Test parameter weight loading with tp>1. Test parameter weight loading with tp>1.
......
import contextlib import contextlib
import functools
import importlib import importlib
from typing import TYPE_CHECKING, List, Optional, Tuple, Union, Type from typing import TYPE_CHECKING, List, Optional, Tuple, Union, Type
...@@ -44,34 +42,6 @@ else: ...@@ -44,34 +42,6 @@ else:
from torch.library import impl_abstract as register_fake from torch.library import impl_abstract as register_fake
def hint_on_error(fn):
    """Wrap a custom-op caller so common failures come with a build hint.

    The returned wrapper forwards all arguments to ``fn`` and returns its
    result unchanged. Two exception types get extra treatment:

    * ``NotImplementedError``: the hint is logged and a new
      ``NotImplementedError`` carrying the formatted hint is raised,
      chained from the original exception.
    * ``AttributeError``: the hint is logged and the original exception is
      re-raised as-is.

    Any other exception propagates untouched.
    """

    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        try:
            return fn(*args, **kwargs)
        except NotImplementedError as e:
            msg = (
                "Error in calling custom op %s: %s\n"
                "Not implemented or built, most likely because the current "
                "device does not support this kernel (less likely "
                "TORCH_CUDA_ARCH_LIST was set incorrectly while building)")
            # Lazy %-formatting for the log record; eager formatting for the
            # exception text so callers see the full hint.
            logger.error(msg, fn.__name__, e)
            raise NotImplementedError(msg % (fn.__name__, e)) from e
        except AttributeError as e:
            msg = (
                "Error in calling custom op %s: %s\n"
                "Possibly you have built or installed an obsolete version of "
                "vllm.\n"
                "Please try a clean build and install of vllm, "
                "or remove old built files such as vllm/*cpython*.so and "
                "build/.")
            logger.error(msg, fn.__name__, e)
            # Bare raise keeps the original exception and traceback intact.
            raise

    return wrapper
# activation ops # activation ops
def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None: def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
torch.ops._C.silu_and_mul(out, x) torch.ops._C.silu_and_mul(out, x)
...@@ -984,6 +954,114 @@ def cutlass_scaled_mm_azp(a: torch.Tensor, ...@@ -984,6 +954,114 @@ def cutlass_scaled_mm_azp(a: torch.Tensor,
return out return out
def cutlass_sparse_scaled_mm_supported(cuda_device_capability: int) -> bool:
    """Forward the capability query to the compiled ``_C`` extension.

    Args:
        cuda_device_capability: Capability value passed unchanged to
            ``torch.ops._C.cutlass_sparse_scaled_mm_supported``.

    Returns:
        Whatever the underlying op reports for that capability.
    """
    is_supported = torch.ops._C.cutlass_sparse_scaled_mm_supported(
        cuda_device_capability)
    return is_supported
def cutlass_sparse_compress(a: torch.Tensor) \
-> Tuple[torch.Tensor, torch.Tensor]:
"""
Compresses a sparse matrix for use with Cutlass sparse operations.
This function takes a dense tensor and compresses it into two components:
non-zero elements and metadata. The compressed representation is compatible
with Cutlass sparse kernels.
Args:
a (torch.Tensor):
The input tensor to be compressed. Must have one of the following data types:
- `torch.int8`
- `torch.float8_e4m3fn`
- `torch.bfloat16`
- `torch.float16`
Returns:
Tuple[torch.Tensor, torch.Tensor]:
A tuple containing:
- `a_nzs` (torch.Tensor): A tensor containing non-zero elements of `a`.
- `a_meta` (torch.Tensor): A tensor containing metadata for the sparse representation.
Raises:
ValueError: If the compression operation fails.
Notes:
- The `a_meta` tensor has a data type of `torch.uint8`.
- Each metadata element encodes the sparsity of 4 non-zero elements (i.e., `elemsPerMetaElem = 4`).
- The shape of `a_nzs` is `(m, k // 2)`, where `m` and `k` are the dimensions of the input tensor.
- The shape of `a_meta` is `(m, k // 2 // elemsPerMetaElem)`.
"""
assert (a.dtype in [
torch.int8, torch.float8_e4m3fn, torch.bfloat16, torch.float16
])
assert (a.is_contiguous())
# a_meta.dtype: torch.uint8 so elemsPerMetaElem = 8b / 2b_per_nz = 4
elemsPerMetaElem = 4
m = a.shape[0]
k = a.shape[1]
assert (k % 2 == 0)
a_nzs = torch.empty((m, k // 2), dtype=a.dtype, device=a.device)
a_meta = torch.empty((m, k // 2 // elemsPerMetaElem),
dtype=torch.uint8,
device=a.device)
if not (torch.ops._C.cutlass_sparse_compress_entry(a_nzs, a_meta, a)):
raise ValueError
assert (a_nzs.is_contiguous())
assert (a_meta.is_contiguous())
return a_nzs, a_meta
def cutlass_scaled_sparse_mm(
        a: torch.Tensor,
        bt_nzs: torch.Tensor,
        bt_meta: torch.Tensor,
        scale_a: torch.Tensor,
        scale_b: torch.Tensor,
        out_dtype: torch.dtype,
        bias: Optional[torch.Tensor] = None) -> torch.Tensor:
    """
    Run the Cutlass scaled sparse matrix multiplication.

    Typical usage:
    1. Build a dense `a` of shape (m, k) on the CUDA device:
        `a = torch.randn((m, k), device='cuda')`.
    2. Build a dense `b` of shape (k, n) on the CUDA device:
        `b = torch.randn((k, n), device='cuda')`.
    3. Prune `b` to 2:4 sparsity along the chosen dimension:
        `b = prune_to_2_4(b, dim=0)`.
    4. Compress the transposed sparse matrix `b.t()`:
        `bt_nzs, bt_meta = cutlass_sparse_compress(b.t())`.
    5. Multiply with the compressed operand, passing the scaling factors
        for `a` and `b` and the desired output dtype:
        `out = cutlass_scaled_sparse_mm(a, bt_nzs, bt_meta, scale_a, scale_b, out_dtype)`.

    Preconditions (checked with `assert`):
    - Both leading dimensions of `bt_nzs` are multiples of 16.
    - `out_dtype` is `torch.bfloat16` or `torch.float16`.
    - When `bias` is given, `bias.shape[0] == bt_nzs.shape[0]` and
      `bias.dtype == out_dtype`.

    Returns:
    - A new `(a.shape[0], bt_nzs.shape[0])` tensor of `out_dtype` on
      `a.device`, filled by the compiled op.
    """
    rows_aligned = bt_nzs.shape[0] % 16 == 0
    assert rows_aligned and bt_nzs.shape[1] % 16 == 0
    assert any(out_dtype is allowed
               for allowed in (torch.bfloat16, torch.float16))
    if bias is not None:
        assert bias.shape[0] == bt_nzs.shape[0] and bias.dtype == out_dtype

    # The op writes its result into a caller-provided buffer.
    result = torch.empty((a.shape[0], bt_nzs.shape[0]),
                         dtype=out_dtype,
                         device=a.device)
    torch.ops._C.cutlass_scaled_sparse_mm(result, a, bt_nzs, bt_meta, scale_a,
                                          scale_b, bias)
    return result
# aqlm # aqlm
def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor, def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor,
codebooks: torch.Tensor, scales: torch.Tensor, codebooks: torch.Tensor, scales: torch.Tensor,
...@@ -1426,6 +1504,7 @@ def register_graph_buffers(fa: int, handles: List[List[int]], ...@@ -1426,6 +1504,7 @@ def register_graph_buffers(fa: int, handles: List[List[int]],
offsets: List[List[int]]) -> None: offsets: List[List[int]]) -> None:
torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets) torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets)
def read_cache( def read_cache(
keys: torch.Tensor, keys: torch.Tensor,
values: torch.Tensor, values: torch.Tensor,
...@@ -1449,26 +1528,3 @@ def write_cache_multi_layers( ...@@ -1449,26 +1528,3 @@ def write_cache_multi_layers(
torch.ops._C_cache_ops.write_cache_multi_layers(keys, values, key_caches, torch.ops._C_cache_ops.write_cache_multi_layers(keys, values, key_caches,
value_caches, slot_mapping, value_caches, slot_mapping,
kv_cache_dtype) kv_cache_dtype)
# temporary fix for https://github.com/vllm-project/vllm/issues/5456
# TODO: remove this in v0.6.0
# Rebinds matching module-level functions to their `hint_on_error`-wrapped
# versions by editing this module's globals in place.
names_and_values = globals()
# Replacements are staged here and applied after the loop, so the globals
# dict is not rebound key-by-key while it is being iterated.
names_and_values_to_update = {}
# prepare variables to avoid dict size change during iteration
# (binding `k`, `v` and `arg` up front means the loop below does not add new
# keys to the globals dict it is walking)
k, v, arg = None, None, None
# The type of a plain Python function, obtained from a throwaway lambda.
fn_type = type(lambda x: x)
for k, v in names_and_values.items():
    # find functions that are defined in this file and have torch.Tensor
    # in their annotations. `arg == "torch.Tensor"` is used to handle
    # the case when users use `from __future__ import annotations` to turn
    # type hints into strings.
    if isinstance(v, fn_type) \
            and v.__code__.co_filename == __file__ \
            and any(arg is torch.Tensor or arg == "torch.Tensor"
                    for arg in v.__annotations__.values()):
        names_and_values_to_update[k] = hint_on_error(v)

names_and_values.update(names_and_values_to_update)
# Drop the helper names so they do not remain as module attributes.
del names_and_values_to_update, names_and_values, v, k, fn_type
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Any, Callable, Dict, Hashable, Optional, TypeVar from typing import Any, Callable, Dict, Optional, TypeVar
from torch import nn from torch import nn
...@@ -24,14 +24,13 @@ class AdapterModel(ABC): ...@@ -24,14 +24,13 @@ class AdapterModel(ABC):
T = TypeVar('T') T = TypeVar('T')
class AdapterLRUCache(LRUCache[T]): class AdapterLRUCache(LRUCache[int, T]):
def __init__(self, capacity: int, deactivate_fn: Callable[[Hashable], def __init__(self, capacity: int, deactivate_fn: Callable[[int], object]):
None]):
super().__init__(capacity) super().__init__(capacity)
self.deactivate_fn = deactivate_fn self.deactivate_fn = deactivate_fn
def _on_remove(self, key: Hashable, value: Optional[T]): def _on_remove(self, key: int, value: Optional[T]):
logger.debug("Removing adapter int id: %d", key) logger.debug("Removing adapter int id: %d", key)
self.deactivate_fn(key) self.deactivate_fn(key)
return super()._on_remove(key, value) return super()._on_remove(key, value)
......
from dataclasses import dataclass from dataclasses import dataclass
from typing import Literal, Tuple from typing import Literal
from urllib.parse import urljoin from urllib.parse import urljoin
import librosa import numpy.typing as npt
import numpy as np
from vllm.assets.base import get_vllm_public_assets, vLLM_S3_BUCKET_URL from vllm.utils import PlaceholderModule
from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets
try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
ASSET_DIR = "multimodal_asset" ASSET_DIR = "multimodal_asset"
...@@ -15,8 +21,7 @@ class AudioAsset: ...@@ -15,8 +21,7 @@ class AudioAsset:
name: Literal["winning_call", "mary_had_lamb"] name: Literal["winning_call", "mary_had_lamb"]
@property @property
def audio_and_sample_rate(self) -> Tuple[np.ndarray, int]: def audio_and_sample_rate(self) -> tuple[npt.NDArray, int]:
audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg", audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg",
s3_prefix=ASSET_DIR) s3_prefix=ASSET_DIR)
y, sr = librosa.load(audio_path, sr=None) y, sr = librosa.load(audio_path, sr=None)
...@@ -25,4 +30,4 @@ class AudioAsset: ...@@ -25,4 +30,4 @@ class AudioAsset:
@property @property
def url(self) -> str: def url(self) -> str:
return urljoin(vLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg") return urljoin(VLLM_S3_BUCKET_URL, f"{ASSET_DIR}/{self.name}.ogg")
...@@ -4,9 +4,8 @@ from typing import Optional ...@@ -4,9 +4,8 @@ from typing import Optional
import vllm.envs as envs import vllm.envs as envs
from vllm.connections import global_http_connection from vllm.connections import global_http_connection
from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT
vLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com" VLLM_S3_BUCKET_URL = "https://vllm-public-assets.s3.us-west-2.amazonaws.com"
def get_cache_dir() -> Path: def get_cache_dir() -> Path:
...@@ -32,8 +31,8 @@ def get_vllm_public_assets(filename: str, ...@@ -32,8 +31,8 @@ def get_vllm_public_assets(filename: str,
if s3_prefix is not None: if s3_prefix is not None:
filename = s3_prefix + "/" + filename filename = s3_prefix + "/" + filename
global_http_connection.download_file( global_http_connection.download_file(
f"{vLLM_S3_BUCKET_URL}/{filename}", f"{VLLM_S3_BUCKET_URL}/{filename}",
asset_path, asset_path,
timeout=VLLM_IMAGE_FETCH_TIMEOUT) timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT)
return asset_path return asset_path
...@@ -4,7 +4,7 @@ from typing import Literal ...@@ -4,7 +4,7 @@ from typing import Literal
import torch import torch
from PIL import Image from PIL import Image
from vllm.assets.base import get_vllm_public_assets from .base import get_vllm_public_assets
VLM_IMAGES_DIR = "vision_model_images" VLM_IMAGES_DIR = "vision_model_images"
...@@ -15,7 +15,6 @@ class ImageAsset: ...@@ -15,7 +15,6 @@ class ImageAsset:
@property @property
def pil_image(self) -> Image.Image: def pil_image(self) -> Image.Image:
image_path = get_vllm_public_assets(filename=f"{self.name}.jpg", image_path = get_vllm_public_assets(filename=f"{self.name}.jpg",
s3_prefix=VLM_IMAGES_DIR) s3_prefix=VLM_IMAGES_DIR)
return Image.open(image_path) return Image.open(image_path)
......
...@@ -2,13 +2,13 @@ from dataclasses import dataclass ...@@ -2,13 +2,13 @@ from dataclasses import dataclass
from functools import lru_cache from functools import lru_cache
from typing import List, Literal from typing import List, Literal
import cv2
import numpy as np import numpy as np
import numpy.typing as npt import numpy.typing as npt
from huggingface_hub import hf_hub_download from huggingface_hub import hf_hub_download
from PIL import Image from PIL import Image
from vllm.multimodal.utils import (sample_frames_from_video, from vllm.multimodal.video import sample_frames_from_video
try_import_video_packages)
from .base import get_cache_dir from .base import get_cache_dir
...@@ -19,7 +19,7 @@ def download_video_asset(filename: str) -> str: ...@@ -19,7 +19,7 @@ def download_video_asset(filename: str) -> str:
Download and open an image from huggingface Download and open an image from huggingface
repo: raushan-testing-hf/videos-test repo: raushan-testing-hf/videos-test
""" """
video_directory = get_cache_dir() / "video-eample-data" video_directory = get_cache_dir() / "video-example-data"
video_directory.mkdir(parents=True, exist_ok=True) video_directory.mkdir(parents=True, exist_ok=True)
video_path = video_directory / filename video_path = video_directory / filename
...@@ -35,8 +35,6 @@ def download_video_asset(filename: str) -> str: ...@@ -35,8 +35,6 @@ def download_video_asset(filename: str) -> str:
def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray: def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
cv2, _ = try_import_video_packages()
cap = cv2.VideoCapture(path) cap = cv2.VideoCapture(path)
if not cap.isOpened(): if not cap.isOpened():
raise ValueError(f"Could not open video file {path}") raise ValueError(f"Could not open video file {path}")
...@@ -59,7 +57,6 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray: ...@@ -59,7 +57,6 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray:
def video_to_pil_images_list(path: str, def video_to_pil_images_list(path: str,
num_frames: int = -1) -> List[Image.Image]: num_frames: int = -1) -> List[Image.Image]:
cv2, _ = try_import_video_packages()
frames = video_to_ndarrays(path, num_frames) frames = video_to_ndarrays(path, num_frames)
return [ return [
Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
......
...@@ -447,7 +447,7 @@ class ROCmFlashAttentionImpl(AttentionImpl): ...@@ -447,7 +447,7 @@ class ROCmFlashAttentionImpl(AttentionImpl):
Returns: Returns:
shape = [num_tokens, num_heads * head_size] shape = [num_tokens, num_heads * head_size]
""" """
# Reminder: Please update docs/source/usage/compatibility_matrix.rst # Reminder: Please update docs/source/usage/compatibility_matrix.md
# If the feature combo become valid # If the feature combo become valid
if attn_type != AttentionType.DECODER: if attn_type != AttentionType.DECODER:
raise NotImplementedError("Encoder self-attention and " raise NotImplementedError("Encoder self-attention and "
......
...@@ -191,6 +191,7 @@ class MultiHeadAttention(nn.Module): ...@@ -191,6 +191,7 @@ class MultiHeadAttention(nn.Module):
kv_cache_dtype=None, kv_cache_dtype=None,
block_size=16, block_size=16,
is_attention_free=False) is_attention_free=False)
attn_backend = backend_name_to_enum(attn_backend.get_name())
if attn_backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}: if attn_backend in {_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1}:
attn_backend = _Backend.XFORMERS attn_backend = _Backend.XFORMERS
......
...@@ -4,7 +4,8 @@ import dataclasses ...@@ -4,7 +4,8 @@ import dataclasses
import json import json
import random import random
import time import time
from typing import List, Optional from functools import cache
from typing import Dict, List, Optional, Tuple
import numpy as np import numpy as np
import torch import torch
...@@ -20,8 +21,11 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs ...@@ -20,8 +21,11 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import ( from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args) build_async_engine_client_from_engine_args)
from vllm.inputs import TextPrompt from vllm.inputs import TextPrompt
from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict from vllm.multimodal import MultiModalDataDict
from vllm.sampling_params import BeamSearchParams from vllm.sampling_params import BeamSearchParams
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
from vllm.utils import FlexibleArgumentParser, merge_async_iterators from vllm.utils import FlexibleArgumentParser, merge_async_iterators
...@@ -31,15 +35,17 @@ class SampleRequest: ...@@ -31,15 +35,17 @@ class SampleRequest:
Attributes: Attributes:
prompt: The input text prompt for the model. prompt: The input text prompt for the model.
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
images).
prompt_len: The length of the prompt in tokens. prompt_len: The length of the prompt in tokens.
expected_output_len: The expected length of the output in tokens. expected_output_len: The expected length of the output in tokens.
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
images).
lora_request: Optional LoRARequest specifying the LoRA to use.
""" """
prompt: str prompt: str
prompt_len: int prompt_len: int
expected_output_len: int expected_output_len: int
multi_modal_data: Optional[MultiModalDataDict] = None multi_modal_data: Optional[MultiModalDataDict] = None
lora_request: Optional[LoRARequest] = None
def _get_prompt_for_image_model(question: str, *, model: str) -> str: def _get_prompt_for_image_model(question: str, *, model: str) -> str:
...@@ -63,8 +69,30 @@ def _get_prompt_for_image_model(question: str, *, model: str) -> str: ...@@ -63,8 +69,30 @@ def _get_prompt_for_image_model(question: str, *, model: str) -> str:
raise ValueError(f"Unsupported model {model}") raise ValueError(f"Unsupported model {model}")
@cache
def lora_path_on_disk(lora_path: str) -> str:
    """Resolve ``lora_path`` via ``get_adapter_absolute_path``.

    The result is memoised per distinct ``lora_path`` string by
    ``functools.cache``, so repeated calls with the same argument reuse the
    first answer.
    """
    resolved_path = get_adapter_absolute_path(lora_path)
    return resolved_path
# Tokenizers already fetched through `get_lora_tokenizer`, keyed by LoRA id.
lora_tokenizer_cache: Dict[int, AnyTokenizer] = {}


def get_random_lora_request(
        args: argparse.Namespace
) -> Tuple[LoRARequest, Optional[AnyTokenizer]]:
    """Build a LoRA request for a randomly chosen adapter id.

    The id is drawn uniformly from ``1..args.max_loras`` (inclusive) and is
    used both as the request's name (stringified) and its integer id. Every
    request points at the same adapter location, ``args.lora_path`` resolved
    through ``lora_path_on_disk``.

    Returns:
        The request together with the tokenizer stored for that id in
        ``lora_tokenizer_cache``; the tokenizer is obtained from
        ``get_lora_tokenizer`` the first time an id is seen.
    """
    adapter_id = random.randint(1, args.max_loras)
    request = LoRARequest(lora_name=str(adapter_id),
                          lora_int_id=adapter_id,
                          lora_path=lora_path_on_disk(args.lora_path))

    # The cache dict is only mutated, never rebound, so no `global` is needed.
    if adapter_id not in lora_tokenizer_cache:
        lora_tokenizer_cache[adapter_id] = get_lora_tokenizer(request)
    cached_tokenizer = lora_tokenizer_cache[adapter_id]
    return request, cached_tokenizer
def sample_requests(tokenizer: PreTrainedTokenizerBase, def sample_requests(tokenizer: PreTrainedTokenizerBase,
args: argparse.Namespace) -> List[SampleRequest]: args: argparse.Namespace) -> List[SampleRequest]:
dataset_path: str = args.dataset dataset_path: str = args.dataset
num_requests: int = args.num_prompts num_requests: int = args.num_prompts
fixed_output_len: Optional[int] = args.output_len fixed_output_len: Optional[int] = args.output_len
...@@ -82,7 +110,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, ...@@ -82,7 +110,9 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
# Filter out sequences that are too long or too short # Filter out sequences that are too long or too short
filtered_dataset: List[SampleRequest] = [] filtered_dataset: List[SampleRequest] = []
for data in dataset: for data in tqdm(dataset,
total=len(filtered_dataset),
desc="sampling requests"):
if len(filtered_dataset) == num_requests: if len(filtered_dataset) == num_requests:
break break
...@@ -105,9 +135,16 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, ...@@ -105,9 +135,16 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
continue continue
prompt = _get_prompt_for_image_model(question=prompt, model=model) prompt = _get_prompt_for_image_model(question=prompt, model=model)
request_tokenizer = tokenizer
lora_request: Optional[LoRARequest] = None
if args.enable_lora:
lora_request, lora_tokenizer = get_random_lora_request(args)
if lora_tokenizer:
request_tokenizer = lora_tokenizer
# Tokenize the prompts and completions. # Tokenize the prompts and completions.
prompt_token_ids = tokenizer(prompt).input_ids prompt_token_ids = request_tokenizer(prompt).input_ids
completion_token_ids = tokenizer(completion).input_ids completion_token_ids = request_tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids) prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len ) if fixed_output_len is None else fixed_output_len
...@@ -121,7 +158,8 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase, ...@@ -121,7 +158,8 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
SampleRequest(prompt=prompt, SampleRequest(prompt=prompt,
prompt_len=prompt_len, prompt_len=prompt_len,
expected_output_len=output_len, expected_output_len=output_len,
multi_modal_data=multi_modal_data)) multi_modal_data=multi_modal_data,
lora_request=lora_request))
return filtered_dataset return filtered_dataset
...@@ -150,11 +188,14 @@ def run_vllm( ...@@ -150,11 +188,14 @@ def run_vllm(
ignore_eos=True, ignore_eos=True,
max_tokens=request.expected_output_len, max_tokens=request.expected_output_len,
)) ))
lora_requests: Optional[List[LoRARequest]] = None
if engine_args.enable_lora:
lora_requests = [request.lora_request for request in requests]
# warmup # warmup
warmup_prompts: List[TextPrompt] = [] warmup_prompts: List[TextPrompt] = []
warmup_sampling_params: List[SamplingParams] = [] warmup_sampling_params: List[SamplingParams] = []
for request in warmup_prompts: for request in warmup_requests:
warmup_prompts.append( warmup_prompts.append(
TextPrompt(prompt=request.prompt, TextPrompt(prompt=request.prompt,
multi_modal_data=request.multi_modal_data)) multi_modal_data=request.multi_modal_data))
...@@ -191,9 +232,13 @@ def run_vllm( ...@@ -191,9 +232,13 @@ def run_vllm(
if not use_beam_search: if not use_beam_search:
start = time.perf_counter() start = time.perf_counter()
llm.generate(prompts, sampling_params, use_tqdm=True) llm.generate(prompts,
sampling_params,
lora_request=lora_requests,
use_tqdm=True)
end = time.perf_counter() end = time.perf_counter()
else: else:
assert lora_requests is None, "BeamSearch API does not support LoRA"
prompts = [request.prompt for request in requests] prompts = [request.prompt for request in requests]
# output_len should be the same for all requests. # output_len should be the same for all requests.
output_len = requests[0][2] output_len = requests[0][2]
...@@ -225,6 +270,7 @@ async def run_vllm_async( ...@@ -225,6 +270,7 @@ async def run_vllm_async(
# Add the requests to the engine. # Add the requests to the engine.
prompts: List[TextPrompt] = [] prompts: List[TextPrompt] = []
sampling_params: List[SamplingParams] = [] sampling_params: List[SamplingParams] = []
lora_requests: List[Optional[LoRARequest]] = []
for request in requests: for request in requests:
prompts.append( prompts.append(
TextPrompt(prompt=request.prompt, TextPrompt(prompt=request.prompt,
...@@ -237,11 +283,16 @@ async def run_vllm_async( ...@@ -237,11 +283,16 @@ async def run_vllm_async(
ignore_eos=True, ignore_eos=True,
max_tokens=request.expected_output_len, max_tokens=request.expected_output_len,
)) ))
lora_requests.append(request.lora_request)
generators = [] generators = []
start = time.perf_counter() start = time.perf_counter()
for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): for i, (prompt, sp,
generator = llm.generate(prompt, sp, request_id=f"test{i}") lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
generator = llm.generate(prompt,
sp,
lora_request=lr,
request_id=f"test{i}")
generators.append(generator) generators.append(generator)
all_gens = merge_async_iterators(*generators) all_gens = merge_async_iterators(*generators)
async for i, res in all_gens: async for i, res in all_gens:
...@@ -340,6 +391,14 @@ def main(args: argparse.Namespace): ...@@ -340,6 +391,14 @@ def main(args: argparse.Namespace):
vocab_size = tokenizer.vocab_size vocab_size = tokenizer.vocab_size
requests = [] requests = []
for _ in range(args.num_prompts): for _ in range(args.num_prompts):
request_tokenizer = tokenizer
lora_request: Optional[LoRARequest] = None
if args.enable_lora:
lora_request, lora_tokenizer = get_random_lora_request(args)
if lora_tokenizer:
request_tokenizer = lora_tokenizer
# Synthesize a prompt with the given input length. # Synthesize a prompt with the given input length.
candidate_ids = [ candidate_ids = [
random.randint(0, vocab_size - 1) random.randint(0, vocab_size - 1)
...@@ -348,8 +407,8 @@ def main(args: argparse.Namespace): ...@@ -348,8 +407,8 @@ def main(args: argparse.Namespace):
# As tokenizer may add additional tokens like BOS, we need to try # As tokenizer may add additional tokens like BOS, we need to try
# different lengths to get the desired input length. # different lengths to get the desired input length.
for _ in range(5): # Max attempts to correct for _ in range(5): # Max attempts to correct
candidate_prompt = tokenizer.decode(candidate_ids) candidate_prompt = request_tokenizer.decode(candidate_ids)
tokenized_len = len(tokenizer.encode(candidate_prompt)) tokenized_len = len(request_tokenizer.encode(candidate_prompt))
if tokenized_len == args.input_len: if tokenized_len == args.input_len:
break break
...@@ -366,40 +425,14 @@ def main(args: argparse.Namespace): ...@@ -366,40 +425,14 @@ def main(args: argparse.Namespace):
requests.append( requests.append(
SampleRequest(prompt=candidate_prompt, SampleRequest(prompt=candidate_prompt,
prompt_len=args.input_len, prompt_len=args.input_len,
expected_output_len=args.output_len)) expected_output_len=args.output_len,
lora_request=lora_request))
else: else:
requests = sample_requests(tokenizer, args) requests = sample_requests(tokenizer, args)
is_multi_modal = any(request.multi_modal_data is not None is_multi_modal = any(request.multi_modal_data is not None
for request in requests) for request in requests)
if args.backend == "vllm": if args.backend == "vllm":
# if args.async_engine:
# run_args = [
# requests, args.model, args.tokenizer, args.quantization,
# args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
# args.trust_remote_code, args.dtype, args.max_model_len,
# args.enforce_eager, args.kv_cache_dtype,
# args.quantization_param_path, args.device,
# args.enable_prefix_caching, args.enable_chunked_prefill,
# args.max_num_batched_tokens, args.distributed_executor_backend,
# args.gpu_memory_utilization, args.num_scheduler_steps,
# args.use_v2_block_manager, args.download_dir, args.load_format,
# args.disable_async_output_proc
# ]
# else:
# run_args = [
# warmup_requests, requests, args.model, args.tokenizer, args.quantization,
# args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
# args.trust_remote_code, args.dtype, args.max_model_len,
# args.enforce_eager, args.kv_cache_dtype,
# args.quantization_param_path, args.device,
# args.enable_prefix_caching, args.enable_chunked_prefill,
# args.max_num_batched_tokens, args.distributed_executor_backend,
# args.gpu_memory_utilization, args.num_scheduler_steps,
# args.use_v2_block_manager, args.download_dir, args.load_format,
# args.disable_async_output_proc
# ]
if args.async_engine: if args.async_engine:
elapsed_time = uvloop.run( elapsed_time = uvloop.run(
run_vllm_async( run_vllm_async(
...@@ -409,7 +442,7 @@ def main(args: argparse.Namespace): ...@@ -409,7 +442,7 @@ def main(args: argparse.Namespace):
args.disable_frontend_multiprocessing, args.disable_frontend_multiprocessing,
)) ))
else: else:
elapsed_time = run_vllm(requests, args.n, elapsed_time = run_vllm(warmup_requests, requests, args.n,
EngineArgs.from_cli_args(args)) EngineArgs.from_cli_args(args))
elif args.backend == "hf": elif args.backend == "hf":
assert args.tensor_parallel_size == 1 assert args.tensor_parallel_size == 1
...@@ -496,6 +529,14 @@ if __name__ == "__main__": ...@@ -496,6 +529,14 @@ if __name__ == "__main__":
action='store_true', action='store_true',
default=False, default=False,
help="Disable decoupled async engine frontend.") help="Disable decoupled async engine frontend.")
# LoRA
parser.add_argument(
"--lora-path",
type=str,
default=None,
help="Path to the lora adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier.")
parser = AsyncEngineArgs.add_cli_args(parser) parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args() args = parser.parse_args()
if args.tokenizer is None: if args.tokenizer is None:
...@@ -505,6 +546,8 @@ if __name__ == "__main__": ...@@ -505,6 +546,8 @@ if __name__ == "__main__":
assert args.output_len is not None assert args.output_len is not None
else: else:
assert args.input_len is None assert args.input_len is None
if args.enable_lora:
assert args.lora_path is not None
if args.backend == "vllm": if args.backend == "vllm":
if args.hf_max_batch_size is not None: if args.hf_max_batch_size is not None:
...@@ -514,6 +557,9 @@ if __name__ == "__main__": ...@@ -514,6 +557,9 @@ if __name__ == "__main__":
raise ValueError("HF max batch size is required for HF backend.") raise ValueError("HF max batch size is required for HF backend.")
if args.quantization is not None: if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.") raise ValueError("Quantization is only for vLLM backend.")
if args.enable_lora is not None:
raise ValueError("LoRA benchmarking is only supported for vLLM"
" backend")
elif args.backend == "mii": elif args.backend == "mii":
if args.dtype != "auto": if args.dtype != "auto":
raise ValueError("dtype must be auto for MII backend.") raise ValueError("dtype must be auto for MII backend.")
...@@ -526,4 +572,7 @@ if __name__ == "__main__": ...@@ -526,4 +572,7 @@ if __name__ == "__main__":
if args.tokenizer != args.model: if args.tokenizer != args.model:
raise ValueError("Tokenizer must be the same as the model for MII " raise ValueError("Tokenizer must be the same as the model for MII "
"backend.") "backend.")
if args.enable_lora is not None:
raise ValueError("LoRA benchmarking is only supported for vLLM"
" backend")
main(args) main(args)
\ No newline at end of file
"""Token blocks."""
from typing import TYPE_CHECKING, Iterator, List, Optional
from vllm.utils import Device
# Initial value of `PhysicalTokenBlock.last_accessed` for a new block.
DEFAULT_LAST_ACCESSED_TIME: float = -1


class PhysicalTokenBlock:
    """Represents the state of a block in the KV cache."""

    def __init__(
        self,
        device: Device,
        block_number: int,
        block_size: int,
        block_hash: int,
        num_hashed_tokens: int,
    ) -> None:
        # Values supplied by the caller, stored as given.
        self.device = device
        self.block_number = block_number
        self.block_size = block_size
        self.block_hash = block_hash
        self.num_hashed_tokens = num_hashed_tokens

        # Bookkeeping fields every new block starts with.
        self.ref_count = 0
        self.last_accessed = DEFAULT_LAST_ACCESSED_TIME
        self.computed = False

    def __repr__(self) -> str:
        # `block_size` and `block_hash` are deliberately not part of the
        # rendered text; only the fields below are shown, in this order.
        shown_fields = ("device", "block_number", "num_hashed_tokens",
                        "ref_count", "last_accessed", "computed")
        rendered = ", ".join(f"{field}={getattr(self, field)}"
                             for field in shown_fields)
        return f"PhysicalTokenBlock({rendered})"
class BlockTable:
    """Ordered collection of physical blocks with a cached list of their ids.

    ``_block_ids`` mirrors ``_blocks`` position by position (it stores each
    block's ``block_number``), so the id list can be handed out by ``ids()``
    without being rebuilt on every call.
    """

    def __init__(self, blocks: Optional[List[PhysicalTokenBlock]] = None):
        """Create a table, optionally pre-filled from ``blocks``."""
        self._blocks: List[PhysicalTokenBlock] = []
        self._block_ids: List[int] = []

        if blocks is None:
            return
        # Route every element through append() so both lists grow together.
        for blk in blocks:
            self.append(blk)

    def append(self, block: PhysicalTokenBlock):
        """Add ``block`` at the end and record its block number."""
        self._blocks.append(block)
        self._block_ids.append(block.block_number)

    def __len__(self) -> int:
        """Return the number of blocks held."""
        return len(self._blocks)

    def __getitem__(self, key):
        """Return the block(s) stored at ``key`` (index or slice)."""
        return self._blocks[key]

    if TYPE_CHECKING:
        # Declared for static type checkers only; this body is never defined
        # at runtime because TYPE_CHECKING is False there.
        def __iter__(self) -> Iterator[PhysicalTokenBlock]:
            raise RuntimeError("Method should be automatically generated")

    def __setitem__(self, key, value):
        """Replace the block(s) at ``key`` and refresh the matching ids.

        For a slice ``key``, ``value`` is treated as an iterable of blocks;
        otherwise ``value`` is a single block.
        """
        if not isinstance(key, slice):
            self._blocks[key] = value
            self._block_ids[key] = value.block_number
            return

        # Slice assignment: store the blocks first, then derive their ids.
        self._blocks[key] = value
        self._block_ids[key] = [blk.block_number for blk in value]

    def reset(self):
        """Drop all blocks by rebinding both internal lists to new empties."""
        self._blocks, self._block_ids = [], []

    def copy(self) -> "BlockTable":
        """Return a new table with fresh lists holding the same blocks."""
        return BlockTable(self._blocks)

    def list(self) -> List[PhysicalTokenBlock]:
        """Return the internal block list itself (not a copy)."""
        return self._blocks

    def ids(self) -> List[int]:
        """Return the internal list of block numbers itself (not a copy)."""
        return self._block_ids
...@@ -141,14 +141,14 @@ class AlwaysHitShapeEnv: ...@@ -141,14 +141,14 @@ class AlwaysHitShapeEnv:
return "" return ""
def wrap_inductor(graph, def wrap_inductor(graph: fx.GraphModule,
example_inputs, example_inputs,
additional_inductor_config, additional_inductor_config,
compilation_config: CompilationConfig, compilation_config: CompilationConfig,
graph_index: int = 0, graph_index: int = 0,
num_graphs: int = 1, num_graphs: int = 1,
runtime_shape: Optional[int] = None, runtime_shape: Optional[int] = None,
use_inductor: bool = True): use_inductor: bool = True) -> Any:
if graph_index == 0: if graph_index == 0:
# before compiling the first graph, record the start time # before compiling the first graph, record the start time
global compilation_start_time global compilation_start_time
...@@ -208,7 +208,7 @@ def wrap_inductor(graph, ...@@ -208,7 +208,7 @@ def wrap_inductor(graph,
from torch._inductor.compile_fx import graph_returns_tuple from torch._inductor.compile_fx import graph_returns_tuple
returns_tuple = graph_returns_tuple(graph) returns_tuple = graph_returns_tuple(graph)
# this is the graph we return to Dynamo to run # this is the callable we return to Dynamo to run
def compiled_graph(*args): def compiled_graph(*args):
# convert args to list # convert args to list
list_args = list(args) list_args = list(args)
...@@ -247,7 +247,7 @@ def wrap_inductor(graph, ...@@ -247,7 +247,7 @@ def wrap_inductor(graph,
# see https://github.com/pytorch/pytorch/blob/9f5ebf3fc609105a74eab4ccc24932d6353ff566/torch/_inductor/codecache.py#L1221 # noqa # see https://github.com/pytorch/pytorch/blob/9f5ebf3fc609105a74eab4ccc24932d6353ff566/torch/_inductor/codecache.py#L1221 # noqa
return return
def _get_shape_env(): def _get_shape_env() -> AlwaysHitShapeEnv:
return AlwaysHitShapeEnv() return AlwaysHitShapeEnv()
with patch(# for hijacking the hash of the compiled graph with patch(# for hijacking the hash of the compiled graph
...@@ -537,6 +537,7 @@ class VllmBackend: ...@@ -537,6 +537,7 @@ class VllmBackend:
example_inputs[x].clone() for x in self.sym_tensor_indices example_inputs[x].clone() for x in self.sym_tensor_indices
] ]
# this is the callable we return to Dynamo to run
def copy_and_call(*args): def copy_and_call(*args):
list_args = list(args) list_args = list(args)
for i, index in enumerate(self.sym_tensor_indices): for i, index in enumerate(self.sym_tensor_indices):
......
...@@ -7,6 +7,7 @@ from torch import fx ...@@ -7,6 +7,7 @@ from torch import fx
from torch._higher_order_ops.auto_functionalize import auto_functionalized from torch._higher_order_ops.auto_functionalize import auto_functionalized
from torch._inductor import pattern_matcher as pm from torch._inductor import pattern_matcher as pm
from torch._ops import OpOverload from torch._ops import OpOverload
from torch.fx import Node
from vllm.compilation.fx_utils import find_auto_fn from vllm.compilation.fx_utils import find_auto_fn
...@@ -97,7 +98,7 @@ class MultiOutputMatch(abc.ABC): ...@@ -97,7 +98,7 @@ class MultiOutputMatch(abc.ABC):
self.graph.call_function(operator.getitem, (tuple_node, idx)) self.graph.call_function(operator.getitem, (tuple_node, idx))
for idx in indices) for idx in indices)
def insert_auto_fn(self, op: OpOverload, kwargs): def insert_auto_fn(self, op: OpOverload, kwargs) -> Node:
""" """
Insert an auto_functionalized node with the given op and kwargs. Insert an auto_functionalized node with the given op and kwargs.
""" """
......
from typing import List from typing import Any, Dict, List
from torch import fx as fx from torch import fx as fx
...@@ -53,7 +53,7 @@ class PostGradPassManager: ...@@ -53,7 +53,7 @@ class PostGradPassManager:
assert isinstance(pass_, InductorPass) assert isinstance(pass_, InductorPass)
self.passes.append(pass_) self.passes.append(pass_)
def __getstate__(self): def __getstate__(self) -> Dict[str, List[Any]]:
""" """
Custom pickling for the pass manager, as some passes cannot be pickled. Custom pickling for the pass manager, as some passes cannot be pickled.
Pickling occurs because the pass manager is set as the value of Pickling occurs because the pass manager is set as the value of
......
...@@ -22,12 +22,15 @@ from vllm.logger import init_logger ...@@ -22,12 +22,15 @@ from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS, from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
get_quantization_config) get_quantization_config)
from vllm.model_executor.models import ModelRegistry from vllm.model_executor.models import ModelRegistry
from vllm.platforms import current_platform from vllm.platforms import current_platform, interface
from vllm.tracing import is_otel_available, otel_import_error_traceback from vllm.tracing import is_otel_available, otel_import_error_traceback
from vllm.transformers_utils.config import ( from vllm.transformers_utils.config import (
ConfigFormat, get_config, get_hf_image_processor_config, ConfigFormat, get_config, get_hf_image_processor_config,
get_hf_text_config, get_pooling_config, get_hf_text_config, get_pooling_config,
get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope) get_sentence_transformer_tokenizer_config, is_encoder_decoder,
try_get_generation_config, uses_mrope)
from vllm.transformers_utils.s3_utils import S3Model
from vllm.transformers_utils.utils import is_s3
from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless, from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
get_cpu_memory, print_warning_once, random_uuid, get_cpu_memory, print_warning_once, random_uuid,
resolve_obj_by_qualname) resolve_obj_by_qualname)
...@@ -148,9 +151,8 @@ class ModelConfig: ...@@ -148,9 +151,8 @@ class ModelConfig:
HuggingFace config. HuggingFace config.
mm_processor_kwargs: Arguments to be forwarded to the model's processor mm_processor_kwargs: Arguments to be forwarded to the model's processor
for multi-modal data, e.g., image processor. for multi-modal data, e.g., image processor.
mm_cache_preprocessor: If true, then enables caching of the multi-modal disable_mm_preprocessor_cache: If true, then disables caching of the
preprocessor/mapper. Otherwise, the mapper executes each time, and multi-modal preprocessor/mapper. (not recommended)
for better performance consider enabling frontend process.
override_neuron_config: Initialize non default neuron config or override_neuron_config: Initialize non default neuron config or
override default neuron config that are specific to Neuron devices, override default neuron config that are specific to Neuron devices,
this argument will be used to configure the neuron config that this argument will be used to configure the neuron config that
...@@ -159,8 +161,9 @@ class ModelConfig: ...@@ -159,8 +161,9 @@ class ModelConfig:
override default pooling config for the pooling model. override default pooling config for the pooling model.
logits_processor_pattern: Optional regex pattern specifying valid logits_processor_pattern: Optional regex pattern specifying valid
logits processor qualified names that can be passed with the logits processor qualified names that can be passed with the
`logits_processors` extra completion argument. Defaults to None, `logits_processors` extra completion argument. Defaults to None,
which allows no processors. which allows no processors.
generation_config: Configuration parameter file for generation.
""" """
def compute_hash(self) -> str: def compute_hash(self) -> str:
...@@ -216,10 +219,11 @@ class ModelConfig: ...@@ -216,10 +219,11 @@ class ModelConfig:
config_format: ConfigFormat = ConfigFormat.AUTO, config_format: ConfigFormat = ConfigFormat.AUTO,
hf_overrides: Optional[HfOverrides] = None, hf_overrides: Optional[HfOverrides] = None,
mm_processor_kwargs: Optional[Dict[str, Any]] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None,
mm_cache_preprocessor: bool = False, disable_mm_preprocessor_cache: bool = False,
override_neuron_config: Optional[Dict[str, Any]] = None, override_neuron_config: Optional[Dict[str, Any]] = None,
override_pooler_config: Optional["PoolerConfig"] = None, override_pooler_config: Optional["PoolerConfig"] = None,
logits_processor_pattern: Optional[str] = None) -> None: logits_processor_pattern: Optional[str] = None,
generation_config: Optional[str] = None) -> None:
self.model = model self.model = model
self.tokenizer = tokenizer self.tokenizer = tokenizer
self.tokenizer_mode = tokenizer_mode self.tokenizer_mode = tokenizer_mode
...@@ -254,6 +258,8 @@ class ModelConfig: ...@@ -254,6 +258,8 @@ class ModelConfig:
f"'Please instead use `--hf-overrides '{hf_override!r}'`") f"'Please instead use `--hf-overrides '{hf_override!r}'`")
warnings.warn(DeprecationWarning(msg), stacklevel=2) warnings.warn(DeprecationWarning(msg), stacklevel=2)
self.maybe_pull_model_tokenizer_for_s3(model, tokenizer)
# The tokenizer version is consistent with the model version by default. # The tokenizer version is consistent with the model version by default.
if tokenizer_revision is None: if tokenizer_revision is None:
self.tokenizer_revision = revision self.tokenizer_revision = revision
...@@ -286,7 +292,7 @@ class ModelConfig: ...@@ -286,7 +292,7 @@ class ModelConfig:
self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype) self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
self.use_async_output_proc = use_async_output_proc self.use_async_output_proc = use_async_output_proc
self.mm_processor_kwargs = mm_processor_kwargs self.mm_processor_kwargs = mm_processor_kwargs
self.mm_cache_preprocessor = mm_cache_preprocessor self.disable_mm_preprocessor_cache = disable_mm_preprocessor_cache
# Set enforce_eager to False if the value is unset. # Set enforce_eager to False if the value is unset.
if self.enforce_eager is None: if self.enforce_eager is None:
...@@ -349,10 +355,36 @@ class ModelConfig: ...@@ -349,10 +355,36 @@ class ModelConfig:
self.pooler_config = self._init_pooler_config(override_pooler_config) self.pooler_config = self._init_pooler_config(override_pooler_config)
self.logits_processor_pattern = logits_processor_pattern self.logits_processor_pattern = logits_processor_pattern
self.generation_config = generation_config
self._verify_quantization() self._verify_quantization()
self._verify_cuda_graph() self._verify_cuda_graph()
self._verify_bnb_config() self._verify_bnb_config()
def maybe_pull_model_tokenizer_for_s3(self, model: str,
                                      tokenizer: str) -> None:
    """
    Pull the model config or tokenizer to a temporary
    directory in case of S3.

    When ``model`` is an S3 location, only files matching
    ``*config.json`` are pulled and ``self.model`` is redirected to the
    local directory; the previous value of ``self.model`` is kept in
    ``self.model_weights``. When ``tokenizer`` is an S3 location, its
    files (excluding ``*.pt``, ``*.safetensors`` and ``*.bin``) are
    pulled and ``self.tokenizer`` is redirected to the local directory.
    Nothing happens when neither argument is an S3 location.

    Args:
        model: The model name or path.
        tokenizer: The tokenizer name or path.
    """
    if is_s3(model):
        self.s3_model = S3Model()
        self.s3_model.pull_files(model, allow_pattern=["*config.json"])
        # Keep the pre-redirect value of self.model before pointing it at
        # the local directory (presumably the S3 URI, for loading the
        # weights later -- confirm against the caller).
        self.model_weights = self.model
        self.model = self.s3_model.dir

    if is_s3(tokenizer):
        self.s3_tokenizer = S3Model()
        # Pull from the tokenizer location, not the model location: the
        # two can differ, and the model may not be on S3 at all.
        self.s3_tokenizer.pull_files(
            tokenizer, ignore_pattern=["*.pt", "*.safetensors", "*.bin"])
        self.tokenizer = self.s3_tokenizer.dir
def _init_multimodal_config( def _init_multimodal_config(
self, limit_mm_per_prompt: Optional[Mapping[str, int]] self, limit_mm_per_prompt: Optional[Mapping[str, int]]
) -> Optional["MultiModalConfig"]: ) -> Optional["MultiModalConfig"]:
...@@ -564,6 +596,12 @@ class ModelConfig: ...@@ -564,6 +596,12 @@ class ModelConfig:
self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
self.max_model_len) self.max_model_len)
if (self.hf_config.model_type == 'deepseek_v3'
and not self.enforce_eager):
logger.warning("CUDA graph is not supported for Deepseek V3 yet, "
"fallback to the eager mode.")
self.enforce_eager = True
def _verify_bnb_config(self) -> None: def _verify_bnb_config(self) -> None:
""" """
The current version of bitsandbytes (0.44.0) with 8-bit models does not The current version of bitsandbytes (0.44.0) with 8-bit models does not
...@@ -598,7 +636,7 @@ class ModelConfig: ...@@ -598,7 +636,7 @@ class ModelConfig:
self.use_async_output_proc = False self.use_async_output_proc = False
return return
# Reminder: Please update docs/source/usage/compatibility_matrix.rst # Reminder: Please update docs/source/usage/compatibility_matrix.md
# If the feature combo become valid # If the feature combo become valid
if not current_platform.is_async_output_supported(self.enforce_eager): if not current_platform.is_async_output_supported(self.enforce_eager):
logger.warning( logger.warning(
...@@ -618,7 +656,7 @@ class ModelConfig: ...@@ -618,7 +656,7 @@ class ModelConfig:
if self.runner_type == "pooling": if self.runner_type == "pooling":
self.use_async_output_proc = False self.use_async_output_proc = False
# Reminder: Please update docs/source/usage/compatibility_matrix.rst # Reminder: Please update docs/source/usage/compatibility_matrix.md
# If the feature combo become valid # If the feature combo become valid
if speculative_config: if speculative_config:
logger.warning("Async output processing is not supported with" logger.warning("Async output processing is not supported with"
...@@ -680,8 +718,9 @@ class ModelConfig: ...@@ -680,8 +718,9 @@ class ModelConfig:
def get_head_size(self) -> int: def get_head_size(self) -> int:
# TODO remove hard code # TODO remove hard code
if hasattr(self.hf_text_config, "model_type" if hasattr(self.hf_text_config,
) and self.hf_text_config.model_type == 'deepseek_v2': "model_type") and (self.hf_text_config.model_type
in ('deepseek_v2', 'deepseek_v3')):
# FlashAttention supports only head_size 32, 64, 128, 256, # FlashAttention supports only head_size 32, 64, 128, 256,
# we need to pad head_size 192 to 256 # we need to pad head_size 192 to 256
return 256 return 256
...@@ -814,6 +853,56 @@ class ModelConfig: ...@@ -814,6 +853,56 @@ class ModelConfig:
return self.multimodal_config return self.multimodal_config
def try_get_generation_config(self) -> Dict[str, Any]:
    """
    Load the generation config and return it as a plain dictionary.

    If ``self.generation_config`` is ``None`` or ``"auto"``, the config is
    looked up from ``self.model`` at ``self.revision``; otherwise
    ``self.generation_config`` itself is used as the lookup source and no
    revision is passed.

    Returns:
        Dict[str, Any]: ``to_diff_dict()`` of the loaded config, or an
            empty dictionary when the loader returns ``None``.
    """
    source = self.generation_config

    # NOTE: inside this method the bare name below refers to the
    # module-level helper, not to this method (methods are only reachable
    # through ``self``).
    if source is None or source == "auto":
        loaded = try_get_generation_config(
            self.model,
            trust_remote_code=self.trust_remote_code,
            revision=self.revision,
        )
    else:
        loaded = try_get_generation_config(
            source,
            trust_remote_code=self.trust_remote_code,
        )

    return {} if loaded is None else loaded.to_diff_dict()
def get_diff_sampling_param(self) -> Dict[str, Any]:
    """
    This method returns a dictionary containing the parameters
    that differ from the default sampling parameters, but only
    if `generation_config` is set. If `generation_config` is not
    set, an empty dictionary is returned.

    Only the keys `repetition_penalty`, `temperature`, `top_k`,
    `top_p` and `min_p` are considered, and a key is included only
    when its value in the generation config is not None.

    Returns:
        Dict[str, Any]: A dictionary with the differing sampling
            parameters if `generation_config` is set, otherwise an
            empty dictionary.
    """
    if self.generation_config is None:
        # When generation_config is not set
        return {}

    config = self.try_get_generation_config()
    available_params = (
        "repetition_penalty",
        "temperature",
        "top_k",
        "top_p",
        "min_p",
    )

    # A single pass is enough: keys that are absent (or explicitly None)
    # are skipped, so the result is empty when no key is present.
    diff_sampling_param: Dict[str, Any] = {}
    for name in available_params:
        value = config.get(name)
        if value is not None:
            diff_sampling_param[name] = value
    return diff_sampling_param
@property @property
def is_encoder_decoder(self) -> bool: def is_encoder_decoder(self) -> bool:
"""Extract the HF encoder/decoder model flag.""" """Extract the HF encoder/decoder model flag."""
...@@ -917,6 +1006,10 @@ class CacheConfig: ...@@ -917,6 +1006,10 @@ class CacheConfig:
raise ValueError( raise ValueError(
"GPU memory utilization must be less than 1.0. Got " "GPU memory utilization must be less than 1.0. Got "
f"{self.gpu_memory_utilization}.") f"{self.gpu_memory_utilization}.")
if (current_platform.is_cuda() and self.block_size is not None
and self.block_size > 32):
raise ValueError("CUDA Paged Attention kernel only supports "
f"block sizes up to 32. Got {self.block_size}.")
def _verify_cache_dtype(self) -> None: def _verify_cache_dtype(self) -> None:
if self.cache_dtype == "auto": if self.cache_dtype == "auto":
...@@ -1041,6 +1134,7 @@ class LoadFormat(str, enum.Enum): ...@@ -1041,6 +1134,7 @@ class LoadFormat(str, enum.Enum):
GGUF = "gguf" GGUF = "gguf"
BITSANDBYTES = "bitsandbytes" BITSANDBYTES = "bitsandbytes"
MISTRAL = "mistral" MISTRAL = "mistral"
RUNAI_STREAMER = "runai_streamer"
@dataclass @dataclass
...@@ -1977,7 +2071,7 @@ class LoRAConfig: ...@@ -1977,7 +2071,7 @@ class LoRAConfig:
model_config.quantization) model_config.quantization)
def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
# Reminder: Please update docs/source/usage/compatibility_matrix.rst # Reminder: Please update docs/source/usage/compatibility_matrix.md
# If the feature combo become valid # If the feature combo become valid
if scheduler_config.chunked_prefill_enabled: if scheduler_config.chunked_prefill_enabled:
logger.warning("LoRA with chunked prefill is still experimental " logger.warning("LoRA with chunked prefill is still experimental "
...@@ -2155,6 +2249,17 @@ def _get_and_verify_dtype( ...@@ -2155,6 +2249,17 @@ def _get_and_verify_dtype(
else: else:
torch_dtype = config_dtype torch_dtype = config_dtype
if (current_platform.is_cpu()
and current_platform.get_cpu_architecture()
== interface.CpuArchEnum.POWERPC
and (config_dtype == torch.float16
or config_dtype == torch.float32)):
logger.info(
"For POWERPC, we cast models to bfloat16 instead of "
"using float16 by default. Float16 is not currently "
"supported for POWERPC.")
torch_dtype = torch.bfloat16
if current_platform.is_hpu() and config_dtype == torch.float16: if current_platform.is_hpu() and config_dtype == torch.float16:
logger.info( logger.info(
"For HPU, we cast models to bfloat16 instead of" "For HPU, we cast models to bfloat16 instead of"
...@@ -3165,7 +3270,7 @@ class VllmConfig: ...@@ -3165,7 +3270,7 @@ class VllmConfig:
f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, " f"enable_prefix_caching={self.cache_config.enable_prefix_caching}, "
f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa f"chunked_prefill_enabled={self.scheduler_config.chunked_prefill_enabled}, " # noqa
f"use_async_output_proc={self.model_config.use_async_output_proc}, " f"use_async_output_proc={self.model_config.use_async_output_proc}, "
f"mm_cache_preprocessor={self.model_config.mm_cache_preprocessor!r}, " # noqa f"disable_mm_preprocessor_cache={self.model_config.disable_mm_preprocessor_cache!r}, " # noqa
f"mm_processor_kwargs={self.model_config.mm_processor_kwargs}, " f"mm_processor_kwargs={self.model_config.mm_processor_kwargs}, "
f"pooler_config={self.model_config.pooler_config!r}, " f"pooler_config={self.model_config.pooler_config!r}, "
f"compilation_config={self.compilation_config!r}") f"compilation_config={self.compilation_config!r}")
......
...@@ -13,7 +13,7 @@ class EvictionPolicy(enum.Enum): ...@@ -13,7 +13,7 @@ class EvictionPolicy(enum.Enum):
class Evictor(ABC): class Evictor(ABC):
"""The Evictor subclasses should be used by the BlockAllocator class to """The Evictor subclasses should be used by the BlockAllocator class to
handle eviction of freed PhysicalTokenBlocks. handle eviction of freed Blocks.
""" """
@abstractmethod @abstractmethod
...@@ -70,7 +70,7 @@ class BlockMetaData: ...@@ -70,7 +70,7 @@ class BlockMetaData:
class LRUEvictor(Evictor): class LRUEvictor(Evictor):
"""Evicts in a least-recently-used order using the last_accessed timestamp """Evicts in a least-recently-used order using the last_accessed timestamp
that's recorded in the PhysicalTokenBlock. If there are multiple blocks with that's recorded in the Block. If there are multiple blocks with
the same last_accessed time, then the one with the largest num_hashed_tokens the same last_accessed time, then the one with the largest num_hashed_tokens
will be evicted. If two blocks each have the lowest last_accessed time and will be evicted. If two blocks each have the lowest last_accessed time and
highest num_hashed_tokens value, then one will be chose arbitrarily highest num_hashed_tokens value, then one will be chose arbitrarily
......
...@@ -141,7 +141,7 @@ class EngineArgs: ...@@ -141,7 +141,7 @@ class EngineArgs:
tokenizer_pool_extra_config: Optional[Dict[str, Any]] = None tokenizer_pool_extra_config: Optional[Dict[str, Any]] = None
limit_mm_per_prompt: Optional[Mapping[str, int]] = None limit_mm_per_prompt: Optional[Mapping[str, int]] = None
mm_processor_kwargs: Optional[Dict[str, Any]] = None mm_processor_kwargs: Optional[Dict[str, Any]] = None
mm_cache_preprocessor: bool = False disable_mm_preprocessor_cache: bool = False
enable_lora: bool = False enable_lora: bool = False
enable_lora_bias: bool = False enable_lora_bias: bool = False
max_loras: int = 1 max_loras: int = 1
...@@ -200,6 +200,8 @@ class EngineArgs: ...@@ -200,6 +200,8 @@ class EngineArgs:
kv_transfer_config: Optional[KVTransferConfig] = None kv_transfer_config: Optional[KVTransferConfig] = None
generation_config: Optional[str] = None
def __post_init__(self): def __post_init__(self):
if not self.tokenizer: if not self.tokenizer:
self.tokenizer = self.model self.tokenizer = self.model
...@@ -208,6 +210,7 @@ class EngineArgs: ...@@ -208,6 +210,7 @@ class EngineArgs:
# by user. # by user.
if self.enable_prefix_caching is None: if self.enable_prefix_caching is None:
self.enable_prefix_caching = bool(envs.VLLM_USE_V1) self.enable_prefix_caching = bool(envs.VLLM_USE_V1)
# Override max_num_seqs if it's not set by user. # Override max_num_seqs if it's not set by user.
if self.max_num_seqs is None: if self.max_num_seqs is None:
self.max_num_seqs = 256 if not envs.VLLM_USE_V1 else 1024 self.max_num_seqs = 256 if not envs.VLLM_USE_V1 else 1024
...@@ -316,6 +319,8 @@ class EngineArgs: ...@@ -316,6 +319,8 @@ class EngineArgs:
'* "tensorizer" will load the weights using tensorizer from ' '* "tensorizer" will load the weights using tensorizer from '
'CoreWeave. See the Tensorize vLLM Model script in the Examples ' 'CoreWeave. See the Tensorize vLLM Model script in the Examples '
'section for more information.\n' 'section for more information.\n'
'* "runai_streamer" will load the Safetensors weights using Run:ai'
'Model Streamer \n'
'* "bitsandbytes" will load the weights using bitsandbytes ' '* "bitsandbytes" will load the weights using bitsandbytes '
'quantization.\n') 'quantization.\n')
parser.add_argument( parser.add_argument(
...@@ -371,7 +376,7 @@ class EngineArgs: ...@@ -371,7 +376,7 @@ class EngineArgs:
choices=['outlines', 'lm-format-enforcer', 'xgrammar'], choices=['outlines', 'lm-format-enforcer', 'xgrammar'],
help='Which engine will be used for guided decoding' help='Which engine will be used for guided decoding'
' (JSON schema / regex etc) by default. Currently support ' ' (JSON schema / regex etc) by default. Currently support '
'https://github.com/outlines-dev/outlines,' 'https://github.com/outlines-dev/outlines, '
'https://github.com/mlc-ai/xgrammar, and ' 'https://github.com/mlc-ai/xgrammar, and '
'https://github.com/noamgat/lm-format-enforcer.' 'https://github.com/noamgat/lm-format-enforcer.'
' Can be overridden per request via guided_decoding_backend' ' Can be overridden per request via guided_decoding_backend'
...@@ -426,10 +431,12 @@ class EngineArgs: ...@@ -426,10 +431,12 @@ class EngineArgs:
parser.add_argument('--block-size', parser.add_argument('--block-size',
type=int, type=int,
default=EngineArgs.block_size, default=EngineArgs.block_size,
choices=[8, 16, 32], choices=[8, 16, 32, 64, 128],
help='Token block size for contiguous chunks of ' help='Token block size for contiguous chunks of '
'tokens. This is ignored on neuron devices and ' 'tokens. This is ignored on neuron devices and '
'set to max-model-len') 'set to max-model-len. On CUDA devices, '
'only block sizes up to 32 are supported. '
'On HPU devices, block size defaults to 128.')
parser.add_argument( parser.add_argument(
"--enable-prefix-caching", "--enable-prefix-caching",
...@@ -606,11 +613,10 @@ class EngineArgs: ...@@ -606,11 +613,10 @@ class EngineArgs:
help=('Overrides for the multimodal input mapping/processing, ' help=('Overrides for the multimodal input mapping/processing, '
'e.g., image processor. For example: {"num_crops": 4}.')) 'e.g., image processor. For example: {"num_crops": 4}.'))
parser.add_argument( parser.add_argument(
'--mm-cache-preprocessor', '--disable-mm-preprocessor-cache',
action='store_true', action='store_true',
help='If true, then enables caching of the multi-modal ' help='If true, then disables caching of the multi-modal '
'preprocessor/mapper. Otherwise, the mapper executes each time' 'preprocessor/mapper. (not recommended)')
', and for better performance consider enabling frontend process.')
# LoRA related configs # LoRA related configs
parser.add_argument('--enable-lora', parser.add_argument('--enable-lora',
...@@ -957,6 +963,16 @@ class EngineArgs: ...@@ -957,6 +963,16 @@ class EngineArgs:
default="auto", default="auto",
help='The worker class to use for distributed execution.') help='The worker class to use for distributed execution.')
parser.add_argument(
"--generation-config",
type=nullable_str,
default=None,
help="The folder path to the generation config. "
"Defaults to None, will use the default generation config in vLLM. "
"If set to 'auto', the generation config will be automatically "
"loaded from model. If set to a folder path, the generation config "
"will be loaded from the specified folder path.")
return parser return parser
@classmethod @classmethod
...@@ -997,10 +1013,11 @@ class EngineArgs: ...@@ -997,10 +1013,11 @@ class EngineArgs:
use_async_output_proc=not self.disable_async_output_proc, use_async_output_proc=not self.disable_async_output_proc,
config_format=self.config_format, config_format=self.config_format,
mm_processor_kwargs=self.mm_processor_kwargs, mm_processor_kwargs=self.mm_processor_kwargs,
mm_cache_preprocessor=self.mm_cache_preprocessor, disable_mm_preprocessor_cache=self.disable_mm_preprocessor_cache,
override_neuron_config=self.override_neuron_config, override_neuron_config=self.override_neuron_config,
override_pooler_config=self.override_pooler_config, override_pooler_config=self.override_pooler_config,
logits_processor_pattern=self.logits_processor_pattern) logits_processor_pattern=self.logits_processor_pattern,
generation_config=self.generation_config)
def create_load_config(self) -> LoadConfig: def create_load_config(self) -> LoadConfig:
return LoadConfig( return LoadConfig(
...@@ -1043,11 +1060,11 @@ class EngineArgs: ...@@ -1043,11 +1060,11 @@ class EngineArgs:
device_config = DeviceConfig(device=self.device) device_config = DeviceConfig(device=self.device)
model_config = self.create_model_config() model_config = self.create_model_config()
if model_config.is_multimodal_model: if (model_config.is_multimodal_model and not envs.VLLM_USE_V1
if self.enable_prefix_caching: and self.enable_prefix_caching):
logger.warning( logger.warning("--enable-prefix-caching is currently not "
"--enable-prefix-caching is currently not " "supported for multimodal models in v0 and "
"supported for multimodal models and has been disabled.") "has been disabled.")
self.enable_prefix_caching = False self.enable_prefix_caching = False
cache_config = CacheConfig( cache_config = CacheConfig(
...@@ -1149,7 +1166,7 @@ class EngineArgs: ...@@ -1149,7 +1166,7 @@ class EngineArgs:
num_speculative_heads=self.num_speculative_heads num_speculative_heads=self.num_speculative_heads
) )
# Reminder: Please update docs/source/usage/compatibility_matrix.rst # Reminder: Please update docs/source/usage/compatibility_matrix.md
# If the feature combo become valid # If the feature combo become valid
if self.num_scheduler_steps > 1: if self.num_scheduler_steps > 1:
if speculative_config is not None: if speculative_config is not None:
...@@ -1269,11 +1286,14 @@ class EngineArgs: ...@@ -1269,11 +1286,14 @@ class EngineArgs:
# When no user override, set the default values based on the usage # When no user override, set the default values based on the usage
# context. # context.
# TODO(woosuk): Tune the default values for different hardware. # TODO(woosuk): Tune the default values for different hardware.
if self.max_num_batched_tokens is None: default_max_num_batched_tokens = {
if usage_context == UsageContext.LLM_CLASS: UsageContext.LLM_CLASS: 8192,
self.max_num_batched_tokens = 8192 UsageContext.OPENAI_API_SERVER: 2048,
elif usage_context == UsageContext.OPENAI_API_SERVER: }
self.max_num_batched_tokens = 2048 if (self.max_num_batched_tokens is None
and usage_context in default_max_num_batched_tokens):
self.max_num_batched_tokens = default_max_num_batched_tokens[
usage_context]
logger.warning( logger.warning(
"Setting max_num_batched_tokens to %d for %s usage context.", "Setting max_num_batched_tokens to %d for %s usage context.",
self.max_num_batched_tokens, usage_context.value) self.max_num_batched_tokens, usage_context.value)
...@@ -1283,9 +1303,6 @@ class EngineArgs: ...@@ -1283,9 +1303,6 @@ class EngineArgs:
Override the EngineConfig's configs based on the usage context for V1. Override the EngineConfig's configs based on the usage context for V1.
""" """
assert envs.VLLM_USE_V1, "V1 is not enabled" assert envs.VLLM_USE_V1, "V1 is not enabled"
if engine_config.model_config.is_multimodal_model:
# TODO (ywang96): Enable APC by default when VLM supports it.
assert not engine_config.cache_config.enable_prefix_caching
@dataclass @dataclass
......
...@@ -1256,3 +1256,10 @@ class AsyncLLMEngine(EngineClient): ...@@ -1256,3 +1256,10 @@ class AsyncLLMEngine(EngineClient):
self.engine.model_executor.stop_profile() self.engine.model_executor.stop_profile()
else: else:
self.engine.model_executor._run_workers("stop_profile") self.engine.model_executor._run_workers("stop_profile")
# TODO(v1): Remove this class proxy when V1 goes default.
if envs.VLLM_USE_V1:
from vllm.v1.engine.async_llm import AsyncLLM
AsyncLLMEngine = AsyncLLM # type: ignore
...@@ -6,8 +6,8 @@ from collections import deque ...@@ -6,8 +6,8 @@ from collections import deque
from contextlib import contextmanager from contextlib import contextmanager
from dataclasses import dataclass from dataclasses import dataclass
from functools import partial from functools import partial
from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict, from typing import (TYPE_CHECKING, Callable, ClassVar, Deque, Dict, Iterable,
Iterable, List, Mapping, NamedTuple, Optional) List, Mapping, NamedTuple, Optional)
from typing import Sequence as GenericSequence from typing import Sequence as GenericSequence
from typing import Set, Type, Union, cast, overload from typing import Set, Type, Union, cast, overload
...@@ -53,7 +53,6 @@ from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup, ...@@ -53,7 +53,6 @@ from vllm.sequence import (ExecuteModelRequest, ParallelSampleSequenceGroup,
SequenceGroupOutput, SequenceStatus, CompletionSequenceGroupOutput, VLLM_INVALID_TOKEN_ID) SequenceGroupOutput, SequenceStatus, CompletionSequenceGroupOutput, VLLM_INVALID_TOKEN_ID)
from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context, from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context,
init_tracer) init_tracer)
from vllm.transformers_utils.config import try_get_generation_config
from vllm.transformers_utils.detokenizer import Detokenizer from vllm.transformers_utils.detokenizer import Detokenizer
from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer
from vllm.transformers_utils.tokenizer_group import ( from vllm.transformers_utils.tokenizer_group import (
...@@ -66,20 +65,6 @@ from vllm.version import __version__ as VLLM_VERSION ...@@ -66,20 +65,6 @@ from vllm.version import __version__ as VLLM_VERSION
logger = init_logger(__name__) logger = init_logger(__name__)
_LOCAL_LOGGING_INTERVAL_SEC = 5 _LOCAL_LOGGING_INTERVAL_SEC = 5
def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]:
    """Return the model's generation config as a plain dictionary.

    The config is looked up with ``try_get_generation_config`` using the
    model identifier, ``trust_remote_code`` flag and revision taken from
    ``model_config``.

    Args:
        model_config: Model configuration supplying ``model``,
            ``trust_remote_code`` and ``revision``.

    Returns:
        The result of ``to_diff_dict()`` on the loaded config, or an empty
        dict when ``try_get_generation_config`` returns ``None``.
    """
    generation_config = try_get_generation_config(
        model_config.model,
        trust_remote_code=model_config.trust_remote_code,
        revision=model_config.revision,
    )
    return ({} if generation_config is None else
            generation_config.to_diff_dict())
_G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup) _G = TypeVar("_G", bound=BaseTokenizerGroup, default=BaseTokenizerGroup)
_O = TypeVar("_O", RequestOutput, PoolingRequestOutput) _O = TypeVar("_O", RequestOutput, PoolingRequestOutput)
...@@ -149,7 +134,7 @@ class LLMEngine: ...@@ -149,7 +134,7 @@ class LLMEngine:
and the :class:`AsyncLLMEngine` class wraps this class for online serving. and the :class:`AsyncLLMEngine` class wraps this class for online serving.
The config arguments are derived from :class:`~vllm.EngineArgs`. (See The config arguments are derived from :class:`~vllm.EngineArgs`. (See
:ref:`engine_args`) :ref:`engine-args`)
Args: Args:
model_config: The configuration related to the LLM model. model_config: The configuration related to the LLM model.
...@@ -275,8 +260,8 @@ class LLMEngine: ...@@ -275,8 +260,8 @@ class LLMEngine:
return tokenizer_group.get_lora_tokenizer(sequence.lora_request) return tokenizer_group.get_lora_tokenizer(sequence.lora_request)
self.seq_counter = Counter() self.seq_counter = Counter()
self.generation_config_fields = _load_generation_config_dict( self.generation_config_fields = (
self.model_config) self.model_config.try_get_generation_config())
self.input_preprocessor = InputPreprocessor(self.model_config, self.input_preprocessor = InputPreprocessor(self.model_config,
self.tokenizer, self.tokenizer,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment