Commit 38d80967 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori

parents 33650733 880c741b
......@@ -68,5 +68,18 @@ def flash_attn_supports_fp8() -> bool:
current_platform.get_device_capability().major == 9
def flash_attn_supports_mla():
from vllm.platforms import current_platform
if current_platform.is_cuda():
try:
from vllm.vllm_flash_attn.flash_attn_interface import (
is_fa_version_supported)
return is_fa_version_supported(3) \
and current_platform.get_device_capability()[0] == 9
except (ImportError, AssertionError):
pass
return False
def is_flash_attn_varlen_func_available() -> bool:
return current_platform.is_cuda() or current_platform.is_xpu()
......@@ -4,8 +4,8 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Optional, Union
from vllm.logprobs import Logprob
from vllm.lora.request import LoRARequest
from vllm.sequence import Logprob
if TYPE_CHECKING:
from vllm.multimodal import MultiModalDataDict
......
......@@ -198,8 +198,9 @@ class BenchmarkDataset(ABC):
@abstractmethod
def sample(self, tokenizer: PreTrainedTokenizerBase,
num_requests: int,
request_id_prefix: str = "") -> list[SampleRequest]:
num_requests: int,
request_id_prefix: str = "",
no_oversample: bool = False) -> list[SampleRequest]:
"""
Abstract method to generate sample requests from the dataset.
......@@ -224,6 +225,7 @@ class BenchmarkDataset(ABC):
requests: list[SampleRequest],
num_requests: int,
request_id_prefix: str = "",
no_oversample: bool = False,
) -> None:
"""
Oversamples the list of requests if its size is less than the desired
......@@ -236,6 +238,11 @@ class BenchmarkDataset(ABC):
request_id_prefix (str) The prefix of the request ids.
"""
if no_oversample:
logger.info("Skipping oversampling. " \
"Total samples: %d.", len(requests))
return
if len(requests) < num_requests:
random.seed(self.random_seed)
additional = deepcopy(
......@@ -405,6 +412,7 @@ class RandomDataset(BenchmarkDataset):
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
request_id_prefix: str = "",
no_oversample: bool = False,
prefix_len: int = DEFAULT_PREFIX_LEN,
range_ratio: float = DEFAULT_RANGE_RATIO,
input_len: int = DEFAULT_INPUT_LEN,
......@@ -832,6 +840,7 @@ class RandomMultiModalDataset(RandomDataset):
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
request_id_prefix: str = "",
no_oversample: bool = False,
prefix_len: int = RandomDataset.DEFAULT_PREFIX_LEN,
range_ratio: float = RandomDataset.DEFAULT_RANGE_RATIO,
input_len: int = RandomDataset.DEFAULT_INPUT_LEN,
......@@ -959,6 +968,7 @@ class ShareGPTDataset(BenchmarkDataset):
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs,
) -> list:
samples: list = []
......@@ -1002,7 +1012,10 @@ class ShareGPTDataset(BenchmarkDataset):
request_id=request_id_prefix + str(ind),
))
ind += 1
self.maybe_oversample_requests(samples, num_requests, request_id_prefix)
self.maybe_oversample_requests(samples,
num_requests,
request_id_prefix,
no_oversample)
return samples
......@@ -1020,7 +1033,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
default="random",
choices=[
"sharegpt", "burstgpt", "sonnet", "random", "random-mm", "hf",
"custom", "prefix_repetition"
"custom", "prefix_repetition", "spec_bench"
],
help="Name of the dataset to benchmark on.",
)
......@@ -1036,6 +1049,12 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
help="Path to the sharegpt/sonnet dataset. "
"Or the huggingface dataset ID if using HF dataset.",
)
parser.add_argument(
"--no-oversample",
action="store_true",
help="Do not oversample if the dataset has " \
"fewer samples than num-prompts.",
)
# group for dataset specific arguments
custom_group = parser.add_argument_group("custom dataset options")
......@@ -1053,6 +1072,22 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
"Skip applying chat template to prompt, used only for custom dataset.",
)
spec_bench_group = parser.add_argument_group("spec bench dataset options")
spec_bench_group.add_argument(
"--spec-bench-output-len",
type=int,
default=256,
help=
"Num of output tokens per request, used only for spec bench dataset.",
)
spec_bench_group.add_argument(
"--spec-bench-category",
type=str,
default=None,
help=
"Category for spec bench dataset. If None, use all categories.",
)
sonnet_group = parser.add_argument_group("sonnet dataset options")
sonnet_group.add_argument(
"--sonnet-input-len",
......@@ -1085,6 +1120,22 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
"from the ShareGPT dataset.",
)
blazedit_group = parser.add_argument_group("blazedit dataset options")
blazedit_group.add_argument(
"--blazedit-min-distance",
type=float,
default=0.0,
help=
"Minimum distance for blazedit dataset. Min: 0, Max: 1.0",
)
blazedit_group.add_argument(
"--blazedit-max-distance",
type=float,
default=1.0,
help=
"Maximum distance for blazedit dataset. Min: 0, Max: 1.0",
)
random_group = parser.add_argument_group("random dataset options")
random_group.add_argument(
"--random-input-len",
......@@ -1227,6 +1278,16 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
type=str,
default=None,
help="Split of the HF dataset.")
hf_group.add_argument(
"--hf-name",
type=str,
default=None,
help=(
"Name of the dataset on HuggingFace "
"(e.g., 'lmarena-ai/VisionArena-Chat'). "
"Specify this if your dataset-path is a local path."
),
)
hf_group.add_argument(
"--hf-output-len",
type=int,
......@@ -1268,6 +1329,10 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
def get_samples(args, tokenizer) -> list[SampleRequest]:
if not hasattr(args, "request_id_prefix"):
args.request_id_prefix = ""
if args.dataset_name == "custom":
dataset = CustomDataset(dataset_path=args.dataset_path)
input_requests = dataset.sample(
......@@ -1276,6 +1341,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
output_len=args.custom_output_len,
skip_chat_template=args.custom_skip_chat_template,
request_id_prefix=args.request_id_prefix,
no_oversample=args.no_oversample,
)
elif args.dataset_name == "sonnet":
......@@ -1290,6 +1356,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
tokenizer=tokenizer,
return_prompt_formatted=False,
request_id_prefix=args.request_id_prefix,
no_oversample=args.no_oversample,
)
else:
assert tokenizer.chat_template or tokenizer.default_chat_template, (
......@@ -1302,33 +1369,67 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
tokenizer=tokenizer,
return_prompt_formatted=True,
request_id_prefix=args.request_id_prefix,
no_oversample=args.no_oversample,
)
elif args.dataset_name == "hf":
# all following datasets are implemented from the
# HuggingFaceDataset base class
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
hf_kwargs = {}
if (
args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS
or args.hf_name in VisionArenaDataset.SUPPORTED_DATASET_PATHS
):
dataset_class = VisionArenaDataset
args.hf_split = "train"
args.hf_subset = None
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
elif (
args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS
or args.hf_name in InstructCoderDataset.SUPPORTED_DATASET_PATHS
):
dataset_class = InstructCoderDataset
args.hf_split = "train"
elif args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS:
elif (
args.dataset_path in MTBenchDataset.SUPPORTED_DATASET_PATHS
or args.hf_name in MTBenchDataset.SUPPORTED_DATASET_PATHS
):
dataset_class = MTBenchDataset
args.hf_split = "train"
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
elif (
args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS
or args.hf_name in ConversationDataset.SUPPORTED_DATASET_PATHS
):
dataset_class = ConversationDataset
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
elif (
args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS
or args.hf_name in AIMODataset.SUPPORTED_DATASET_PATHS
):
dataset_class = AIMODataset
args.hf_split = "train"
elif args.dataset_path in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS: # noqa: E501
elif (
args.dataset_path
in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS # noqa: E501
or args.hf_name in NextEditPredictionDataset.SUPPORTED_DATASET_PATHS
):
dataset_class = NextEditPredictionDataset
args.hf_split = "train"
elif args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS:
elif (
args.dataset_path in ASRDataset.SUPPORTED_DATASET_PATHS
or args.hf_name in ASRDataset.SUPPORTED_DATASET_PATHS
):
dataset_class = ASRDataset
args.hf_split = "train"
elif args.dataset_path in MLPerfDataset.SUPPORTED_DATASET_PATHS:
elif args.dataset_path in BlazeditDataset.SUPPORTED_DATASET_PATHS:
dataset_class = BlazeditDataset
args.hf_split = "train"
hf_kwargs = {
"min_distance": args.blazedit_min_distance,
"max_distance": args.blazedit_max_distance,
}
elif (
args.dataset_path in MLPerfDataset.SUPPORTED_DATASET_PATHS
or args.hf_name in MLPerfDataset.SUPPORTED_DATASET_PATHS
):
dataset_class = MLPerfDataset
args.hf_split = "train"
else:
......@@ -1358,16 +1459,28 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
dataset_split=args.hf_split,
random_seed=args.seed,
no_stream=args.no_stream,
hf_name=args.hf_name,
).sample(
num_requests=args.num_prompts,
tokenizer=tokenizer,
output_len=args.hf_output_len,
request_id_prefix=args.request_id_prefix,
no_oversample=args.no_oversample,
**hf_kwargs
)
else:
# For datasets that follow a similar structure, use a mapping.
dataset_mapping = {
"spec_bench":
lambda: SpecBench(dataset_path=args.dataset_path,
category=args.spec_bench_category).sample(
num_requests=args.num_prompts,
tokenizer=tokenizer,
output_len=args.spec_bench_output_len,
request_id_prefix=args.request_id_prefix,
no_oversample=args.no_oversample,
),
"sharegpt": lambda: ShareGPTDataset(
random_seed=args.seed, dataset_path=args.dataset_path
).sample(
......@@ -1375,6 +1488,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
num_requests=args.num_prompts,
output_len=args.sharegpt_output_len,
request_id_prefix=args.request_id_prefix,
no_oversample=args.no_oversample,
),
"burstgpt": lambda: BurstGPTDataset(
random_seed=args.seed, dataset_path=args.dataset_path
......@@ -1382,6 +1496,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
tokenizer=tokenizer,
num_requests=args.num_prompts,
request_id_prefix=args.request_id_prefix,
no_oversample=args.no_oversample,
),
"random": lambda: RandomDataset(
random_seed=args.seed, dataset_path=args.dataset_path
......@@ -1394,6 +1509,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
range_ratio=args.random_range_ratio,
request_id_prefix=args.request_id_prefix,
batchsize=args.random_batch_size,
no_oversample=args.no_oversample,
),
"random-mm":
lambda: RandomMultiModalDataset(
......@@ -1410,6 +1526,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
num_mm_items_range_ratio=args.random_mm_num_mm_items_range_ratio,
bucket_config=args.random_mm_bucket_config,
request_id_prefix=args.request_id_prefix,
no_oversample=args.no_oversample,
),
"prefix_repetition":
lambda: PrefixRepetitionRandomDataset(
......@@ -1422,6 +1539,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
num_prefixes=args.prefix_repetition_num_prefixes,
output_len=args.prefix_repetition_output_len,
request_id_prefix=args.request_id_prefix,
no_oversample=args.no_oversample,
),
}
......@@ -1503,8 +1621,17 @@ class CustomDataset(BenchmarkDataset):
enable_multimodal_chat: bool = False,
skip_chat_template: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs,
) -> list:
# load all data if needed
self.num_available_samples = len(self.data)
if num_requests <= 0:
num_requests = self.num_available_samples
logger.info("num_requests is set to 0 or negative, "
"so using all available samples: %d",
num_requests)
sampled_requests = []
for i, item in enumerate(self.data):
if len(sampled_requests) >= num_requests:
......@@ -1531,11 +1658,57 @@ class CustomDataset(BenchmarkDataset):
request_id=request_id_prefix + str(i),
))
self.maybe_oversample_requests(sampled_requests, num_requests,
request_id_prefix)
request_id_prefix, no_oversample)
return sampled_requests
# -----------------------------------------------------------------------------
# Spec Bench Dataset Implementation
# -----------------------------------------------------------------------------
class SpecBench(CustomDataset):
"""
Implements the SpecBench dataset: https://github.com/hemingkx/Spec-Bench
Download the dataset using:
wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
""" # noqa: E501
def __init__(self, **kwargs) -> None:
self.category = kwargs.pop("category", None)
super().__init__(**kwargs)
self.load_data()
def load_data(self) -> None:
if self.dataset_path is None:
raise ValueError("dataset_path must be provided for loading data.")
self.data = []
# Load the JSONL file
jsonl_data = pd.read_json(path_or_buf=self.dataset_path,
lines=True)
# check if the JSONL file has a 'turns' column
if "turns" not in jsonl_data.columns:
raise ValueError("JSONL file must contain a 'turns' column.")
for _, row in jsonl_data.iterrows():
# sample only from a specific category if specified
if (not self.category) or (self.category == row['category']):
prompt = row["turns"][0]
self.data.append({"prompt": prompt})
random.seed(self.random_seed)
random.shuffle(self.data)
def sample(self, **kwargs) -> list:
# leverage CustomDataset sample
kwargs["skip_chat_template"] = False
return super().sample(**kwargs)
# -----------------------------------------------------------------------------
# Sonnet Dataset Implementation
# -----------------------------------------------------------------------------
......@@ -1576,6 +1749,7 @@ class SonnetDataset(BenchmarkDataset):
output_len: int = DEFAULT_OUTPUT_LEN,
return_prompt_formatted: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs,
) -> list:
# Calculate average token length for a poem line.
......@@ -1671,6 +1845,7 @@ class BurstGPTDataset(BenchmarkDataset):
max_loras: Optional[int] = None,
lora_path: Optional[str] = None,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs,
) -> list[SampleRequest]:
samples = []
......@@ -1710,6 +1885,7 @@ class HuggingFaceDataset(BenchmarkDataset):
dataset_split: str,
no_stream: bool = False,
dataset_subset: Optional[str] = None,
hf_name: Optional[str] = None,
**kwargs,
) -> None:
super().__init__(dataset_path=dataset_path, **kwargs)
......@@ -1717,6 +1893,7 @@ class HuggingFaceDataset(BenchmarkDataset):
self.dataset_split = dataset_split
self.dataset_subset = dataset_subset
self.load_stream = not no_stream
self.hf_name = hf_name or dataset_path
self.load_data()
def load_data(self) -> None:
......@@ -1748,6 +1925,7 @@ class ConversationDataset(HuggingFaceDataset):
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs) -> list:
# Filter examples with at least 2 conversations
filtered_data = self.data.filter(
......@@ -1789,7 +1967,7 @@ class ConversationDataset(HuggingFaceDataset):
))
ind += 1
self.maybe_oversample_requests(sampled_requests, num_requests,
request_id_prefix)
request_id_prefix, no_oversample)
return sampled_requests
......@@ -1819,6 +1997,7 @@ class VisionArenaDataset(HuggingFaceDataset):
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs,
) -> list:
output_len = (output_len
......@@ -1827,10 +2006,9 @@ class VisionArenaDataset(HuggingFaceDataset):
for i, item in enumerate(self.data):
if len(sampled_requests) >= num_requests:
break
parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.hf_name)
if parser_fn is None:
raise ValueError(
f"Unsupported dataset path: {self.dataset_path}")
raise ValueError(f"Unsupported dataset path: {self.hf_name}")
prompt = parser_fn(item)
mm_content = process_image(item["images"][0])
prompt_len = len(tokenizer(prompt).input_ids)
......@@ -1849,7 +2027,7 @@ class VisionArenaDataset(HuggingFaceDataset):
request_id=request_id_prefix + str(i),
))
self.maybe_oversample_requests(sampled_requests, num_requests,
request_id_prefix)
request_id_prefix, no_oversample)
return sampled_requests
......@@ -1879,6 +2057,7 @@ class InstructCoderDataset(HuggingFaceDataset):
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs) -> list:
output_len = (output_len
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
......@@ -1910,7 +2089,7 @@ class InstructCoderDataset(HuggingFaceDataset):
request_id=request_id_prefix + str(i),
))
self.maybe_oversample_requests(sampled_requests, num_requests,
request_id_prefix)
request_id_prefix, no_oversample)
return sampled_requests
......@@ -1941,6 +2120,7 @@ class MTBenchDataset(HuggingFaceDataset):
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs,
) -> list:
output_len = (output_len
......@@ -1971,7 +2151,96 @@ class MTBenchDataset(HuggingFaceDataset):
request_id=request_id_prefix + str(i),
))
self.maybe_oversample_requests(sampled_requests, num_requests,
request_id_prefix)
request_id_prefix, no_oversample)
return sampled_requests
# -----------------------------------------------------------------------------
# Blazedit Dataset Implementation
# -----------------------------------------------------------------------------
class BlazeditDataset(HuggingFaceDataset):
"""
Blazedit Dataset.
https://github.com/ise-uiuc/blazedit
5k char version: vdaita/edit_5k_char
10k char version: vdaita/edit_10k_char
""" # noqa: E501
# 5k char version will have output as ~5k chars
# 10k char version will have output as ~10k chars
# Assuming 3 char per token, 10k chars will be 3333 tokens
# We set default to 4000 to be safe
DEFAULT_OUTPUT_LEN = 4000
SUPPORTED_DATASET_PATHS = {
"vdaita/edit_5k_char",
"vdaita/edit_10k_char",
}
def sample(
self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
request_id_prefix: str = "",
no_oversample: bool = False,
min_distance: float = 0.0,
max_distance: float = 1.0,
**kwargs,
) -> list:
output_len = (output_len
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
sampled_requests = []
for i, item in enumerate(self.data):
if len(sampled_requests) >= num_requests:
break
code = item["code"]
change_request = item["change_request"]
norm_distance = item["norm_distance"]
# compare the levenshtein distance normalized by code length
if norm_distance < min_distance or norm_distance > max_distance:
continue
# template copied from
# https://github.com/ise-uiuc/blazedit/blob/7765137e656fd62de877422d2e4cf8de51228054/dataset/create_refined_dataset.py#L94-L105 # noqa: E501
instruction = f"""Given a code file, please apply the change requests and generate the new file.
Original file:
```python
{code}
```
Change request:
{change_request}
Please generate the new code file in the "New file" section below.""" # noqa: E501
# apply template
prompt = tokenizer.apply_chat_template(
[{
"role": "user",
"content": instruction
}],
add_generation_prompt=True,
tokenize=False,
)
prompt_len = len(tokenizer(prompt).input_ids)
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
request_id=request_id_prefix + str(i),
))
self.maybe_oversample_requests(sampled_requests, num_requests,
request_id_prefix, no_oversample)
return sampled_requests
......@@ -1994,6 +2263,7 @@ class AIMODataset(HuggingFaceDataset):
num_requests: int,
output_len: Optional[int] = None,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs) -> list:
sampled_requests = []
ind = 0
......@@ -2026,7 +2296,7 @@ class AIMODataset(HuggingFaceDataset):
))
ind += 1
self.maybe_oversample_requests(sampled_requests, num_requests,
request_id_prefix)
request_id_prefix, no_oversample)
return sampled_requests
......@@ -2098,11 +2368,11 @@ class NextEditPredictionDataset(HuggingFaceDataset):
def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs):
formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(
self.dataset_path)
formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.hf_name)
if formatting_prompt_func is None:
raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
raise ValueError(f"Unsupported dataset path: {self.hf_name}")
samples = []
for i, sample in enumerate(self.data):
sample = formatting_prompt_func(sample)
......@@ -2116,7 +2386,10 @@ class NextEditPredictionDataset(HuggingFaceDataset):
))
if len(samples) >= num_requests:
break
self.maybe_oversample_requests(samples, num_requests, request_id_prefix)
self.maybe_oversample_requests(samples,
num_requests,
request_id_prefix,
no_oversample)
return samples
......@@ -2167,6 +2440,7 @@ class ASRDataset(HuggingFaceDataset):
num_requests: int,
output_len: Optional[int] = None,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs,
) -> list:
output_len = (output_len
......@@ -2205,7 +2479,7 @@ class ASRDataset(HuggingFaceDataset):
skipped,
)
self.maybe_oversample_requests(sampled_requests, num_requests,
request_id_prefix)
request_id_prefix, no_oversample)
return sampled_requests
......@@ -2243,6 +2517,7 @@ class MLPerfDataset(HuggingFaceDataset):
num_requests: int,
output_len: Optional[int] = None,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs,
) -> list[SampleRequest]:
# Force dynamic output length based on reference completion.
......@@ -2289,7 +2564,7 @@ class MLPerfDataset(HuggingFaceDataset):
ind += 1
self.maybe_oversample_requests(sampled_requests, num_requests,
request_id_prefix)
request_id_prefix, no_oversample)
return sampled_requests
......@@ -2323,6 +2598,7 @@ class PrefixRepetitionRandomDataset(BenchmarkDataset):
num_prefixes: int = DEFAULT_NUM_PREFIXES,
output_len: int = DEFAULT_OUTPUT_LEN,
request_id_prefix: str = "",
no_oversample: bool = False,
**kwargs,
) -> list[SampleRequest]:
vocab_size = tokenizer.vocab_size
......
......@@ -17,6 +17,47 @@ from tqdm.asyncio import tqdm
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
class StreamedResponseHandler:
"""Handles streaming HTTP responses by accumulating chunks until complete
messages are available."""
def __init__(self):
self.buffer = ""
def add_chunk(self, chunk_bytes: bytes) -> list[str]:
"""Add a chunk of bytes to the buffer and return any complete
messages."""
chunk_str = chunk_bytes.decode("utf-8")
self.buffer += chunk_str
messages = []
# Split by double newlines (SSE message separator)
while "\n\n" in self.buffer:
message, self.buffer = self.buffer.split("\n\n", 1)
message = message.strip()
if message:
messages.append(message)
# if self.buffer is not empty, check if it is a complete message
# by removing data: prefix and check if it is a valid JSON
if self.buffer.startswith("data: "):
message_content = self.buffer.removeprefix("data: ").strip()
if message_content == "[DONE]":
messages.append(self.buffer.strip())
self.buffer = ""
elif message_content:
try:
json.loads(message_content)
messages.append(self.buffer.strip())
self.buffer = ""
except json.JSONDecodeError:
# Incomplete JSON, wait for more chunks.
pass
return messages
@dataclass
class RequestFuncInput:
"""The input for the request function."""
......@@ -102,46 +143,50 @@ async def async_request_openai_completions(
headers=headers) as response:
if response.status == 200:
first_chunk_received = False
async for chunk_bytes in response.content:
handler = StreamedResponseHandler()
async for chunk_bytes in response.content.iter_any():
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")
# NOTE: SSE comments (often used as pings) start with
# a colon. These are not JSON data payload and should
# be skipped.
if chunk_bytes.startswith(":"):
continue
chunk = chunk_bytes.removeprefix("data: ")
messages = handler.add_chunk(chunk_bytes)
for message in messages:
# NOTE: SSE comments (often used as pings) start with
# a colon. These are not JSON data payload and should
# be skipped.
if message.startswith(":"):
continue
if chunk != "[DONE]":
data = json.loads(chunk)
chunk = message.removeprefix("data: ")
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# want to check a token was generated
if choices := data.get("choices"):
# Note that text could be empty here
# e.g. for special tokens
text = choices[0].get("text")
timestamp = time.perf_counter()
# First token
if not first_chunk_received:
first_chunk_received = True
ttft = time.perf_counter() - st
output.ttft = ttft
if chunk != "[DONE]":
data = json.loads(chunk)
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
# NOTE: Some completion API might have a last
# usage summary response without a token so we
# want to check a token was generated
if choices := data.get("choices"):
# Note that text could be empty here
# e.g. for special tokens
text = choices[0].get("text")
timestamp = time.perf_counter()
# First token
if not first_chunk_received:
first_chunk_received = True
ttft = time.perf_counter() - st
output.ttft = ttft
most_recent_timestamp = timestamp
generated_text += text or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += text or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
if first_chunk_received:
output.success = True
else:
......@@ -227,41 +272,44 @@ async def async_request_openai_chat_completions(
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
handler = StreamedResponseHandler()
async for chunk_bytes in response.content.iter_any():
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk_bytes = chunk_bytes.decode("utf-8")
# NOTE: SSE comments (often used as pings) start with
# a colon. These are not JSON data payload and should
# be skipped.
if chunk_bytes.startswith(":"):
continue
chunk = chunk_bytes.removeprefix("data: ")
messages = handler.add_chunk(chunk_bytes)
for message in messages:
# NOTE: SSE comments (often used as pings) start with
# a colon. These are not JSON data payload and should
# be skipped.
if message.startswith(":"):
continue
chunk = message.removeprefix("data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get("content")
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
if choices := data.get("choices"):
content = choices[0]["delta"].get("content")
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
# Decoding phase
else:
output.itl.append(timestamp -
most_recent_timestamp)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
most_recent_timestamp = timestamp
most_recent_timestamp = timestamp
output.generated_text = generated_text
output.success = True
......@@ -347,36 +395,40 @@ async def async_request_openai_audio(
data=form,
headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
handler = StreamedResponseHandler()
async for chunk_bytes in response.content.iter_any():
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
chunk = chunk_bytes.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get(
"content")
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(
timestamp - most_recent_timestamp)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
most_recent_timestamp = timestamp
messages = handler.add_chunk(chunk_bytes)
for message in messages:
chunk = message.decode("utf-8").removeprefix(
"data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
content = choices[0]["delta"].get(
"content")
# First token
if ttft == 0.0:
ttft = timestamp - st
output.ttft = ttft
# Decoding phase
else:
output.itl.append(
timestamp - most_recent_timestamp)
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
"completion_tokens")
most_recent_timestamp = timestamp
output.generated_text = generated_text
output.success = True
......
......@@ -189,7 +189,7 @@ async def get_request(
# NOTE: If we simply accumulate the random delta values
# from the gamma distribution, their sum would have 1-2% gap
# from target_total_delay_s. The purpose of the following logic is to
# close the gap for stablizing the throughput data
# close the gap for stabilizing the throughput data
# from different random seeds.
target_total_delay_s = total_requests / request_rate
normalize_factor = target_total_delay_s / delay_ts[-1]
......
......@@ -37,6 +37,7 @@ def run_vllm(
requests: list[SampleRequest],
n: int,
engine_args: EngineArgs,
do_profile: bool,
disable_detokenize: bool = False,
) -> tuple[float, Optional[list[RequestOutput]]]:
from vllm import LLM, SamplingParams
......@@ -75,10 +76,14 @@ def run_vllm(
outputs = None
if not use_beam_search:
start = time.perf_counter()
if do_profile:
llm.start_profile()
outputs = llm.generate(prompts,
sampling_params,
lora_request=lora_requests,
use_tqdm=True)
if do_profile:
llm.stop_profile()
end = time.perf_counter()
else:
assert lora_requests is None, "BeamSearch API does not support LoRA"
......@@ -88,6 +93,8 @@ def run_vllm(
for request in requests:
assert request.expected_output_len == output_len
start = time.perf_counter()
if do_profile:
llm.start_profile()
llm.beam_search(
prompts,
BeamSearchParams(
......@@ -95,6 +102,8 @@ def run_vllm(
max_tokens=output_len,
ignore_eos=True,
))
if do_profile:
llm.stop_profile()
end = time.perf_counter()
return end - start, outputs
......@@ -103,6 +112,7 @@ def run_vllm_chat(
requests: list[SampleRequest],
n: int,
engine_args: EngineArgs,
do_profile: bool,
disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]:
"""
Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
......@@ -133,7 +143,11 @@ def run_vllm_chat(
detokenize=not disable_detokenize,
))
start = time.perf_counter()
if do_profile:
llm.start_profile()
outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
if do_profile:
llm.stop_profile()
end = time.perf_counter()
return end - start, outputs
......@@ -142,6 +156,7 @@ async def run_vllm_async(
requests: list[SampleRequest],
n: int,
engine_args: AsyncEngineArgs,
do_profile: bool,
disable_frontend_multiprocessing: bool = False,
disable_detokenize: bool = False,
) -> float:
......@@ -185,6 +200,8 @@ async def run_vllm_async(
generators = []
start = time.perf_counter()
if do_profile:
await llm.start_profile()
for i, (prompt, sp,
lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
generator = llm.generate(prompt,
......@@ -195,6 +212,8 @@ async def run_vllm_async(
all_gens = merge_async_iterators(*generators)
async for i, res in all_gens:
pass
if do_profile:
await llm.stop_profile()
end = time.perf_counter()
return end - start
......@@ -543,6 +562,12 @@ def add_cli_args(parser: argparse.ArgumentParser):
type=str,
default=None,
help="Split of the HF dataset.")
parser.add_argument(
"--profile",
action="store_true",
default=False,
help="Use Torch Profiler. The env variable "
"VLLM_TORCH_PROFILER_DIR must be set to enable profiler.")
# prefix repetition dataset
prefix_repetition_group = parser.add_argument_group(
......@@ -600,22 +625,27 @@ def main(args: argparse.Namespace):
requests,
args.n,
AsyncEngineArgs.from_cli_args(args),
args.disable_frontend_multiprocessing,
args.disable_detokenize,
disable_frontend_multiprocessing=args.disable_frontend_multiprocessing,
disable_detokenize=args.disable_detokenize,
do_profile=args.profile,
))
else:
elapsed_time, request_outputs = run_vllm(
requests, args.n, EngineArgs.from_cli_args(args),
args.disable_detokenize)
disable_detokenize=args.disable_detokenize,
do_profile=args.profile)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
if args.profile:
raise NotImplementedError(
"Profiling not implemented yet for backend='hf'.")
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
args.hf_max_batch_size, args.trust_remote_code,
args.disable_detokenize)
elif args.backend == "vllm-chat":
elapsed_time, request_outputs = run_vllm_chat(
requests, args.n, EngineArgs.from_cli_args(args),
args.disable_detokenize)
disable_detokenize=args.disable_detokenize, do_profile=args.profile)
else:
raise ValueError(f"Unknown backend: {args.backend}")
......
......@@ -54,7 +54,6 @@ SystemEnv = namedtuple(
'is_xnnpack_available',
'cpu_info',
'rocm_version', # vllm specific field
'neuron_sdk_version', # vllm specific field
'vllm_version', # vllm specific field
'vllm_build_flags', # vllm specific field
'gpu_topo', # vllm specific field
......@@ -75,6 +74,7 @@ DEFAULT_CONDA_PATTERNS = {
"zmq",
"nvidia",
"pynvml",
"flashinfer-python",
}
DEFAULT_PIP_PATTERNS = {
......@@ -90,6 +90,7 @@ DEFAULT_PIP_PATTERNS = {
"zmq",
"nvidia",
"pynvml",
"flashinfer-python",
}
......@@ -275,15 +276,6 @@ def get_rocm_version(run_lambda):
r'HIP version: (\S+)')
def get_neuron_sdk_version(run_lambda):
# Adapted from your install script
try:
result = run_lambda(["neuron-ls"])
return result if result[0] == 0 else 'N/A'
except Exception:
return 'N/A'
def get_vllm_version():
from vllm import __version__, __version_tuple__
......@@ -306,10 +298,9 @@ def get_vllm_version():
def summarize_vllm_build_flags():
# This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
return 'CUDA Archs: {}; ROCm: {}; Neuron: {}'.format(
return 'CUDA Archs: {}; ROCm: {}'.format(
os.environ.get('TORCH_CUDA_ARCH_LIST', 'Not Set'),
'Enabled' if os.environ.get('ROCM_HOME') else 'Disabled',
'Enabled' if os.environ.get('NEURON_CORES') else 'Disabled',
)
......@@ -498,6 +489,16 @@ def get_libc_version():
return '-'.join(platform.libc_ver())
def is_uv_venv():
if os.environ.get("UV"):
return True
pyvenv_cfg_path = os.path.join(sys.prefix, 'pyvenv.cfg')
if os.path.exists(pyvenv_cfg_path):
with open(pyvenv_cfg_path, 'r') as f:
return any(line.startswith('uv = ') for line in f)
return False
def get_pip_packages(run_lambda, patterns=None):
"""Return `pip list` output. Note: will also find conda-installed pytorch and numpy packages."""
if patterns is None:
......@@ -513,7 +514,7 @@ def get_pip_packages(run_lambda, patterns=None):
if pip_available:
cmd = [sys.executable, '-mpip', 'list', '--format=freeze']
elif os.environ.get("UV") is not None:
elif is_uv_venv():
print("uv is set")
cmd = ["uv", "pip", "list", "--format=freeze"]
else:
......@@ -601,7 +602,6 @@ def get_env_info():
conda_packages = get_conda_packages(run_lambda)
rocm_version = get_rocm_version(run_lambda)
neuron_sdk_version = get_neuron_sdk_version(run_lambda)
vllm_version = get_vllm_version()
vllm_build_flags = summarize_vllm_build_flags()
gpu_topo = get_gpu_topo(run_lambda)
......@@ -635,7 +635,6 @@ def get_env_info():
is_xnnpack_available=is_xnnpack_available(),
cpu_info=get_cpu_info(run_lambda),
rocm_version=rocm_version,
neuron_sdk_version=neuron_sdk_version,
vllm_version=vllm_version,
vllm_build_flags=vllm_build_flags,
gpu_topo=gpu_topo,
......@@ -702,7 +701,6 @@ env_info_fmt += """
vLLM Info
==============================
ROCM Version : {rocm_version}
Neuron SDK Version : {neuron_sdk_version}
vLLM Version : {vllm_version}
vLLM Build Flags:
{vllm_build_flags}
......
......@@ -454,11 +454,12 @@ class VllmBackend:
inductor_config = config.inductor_compile_config
PASS_KEY = "post_grad_custom_post_pass"
if PASS_KEY in inductor_config:
# Config should automatically wrap all inductor passes
if isinstance(inductor_config[PASS_KEY], PostGradPassManager):
# PassManager already added to config, make sure it's correct
assert (inductor_config[PASS_KEY].uuid() ==
self.post_grad_pass_manager.uuid())
else:
# Config should automatically wrap all inductor passes
assert isinstance(inductor_config[PASS_KEY], InductorPass)
self.post_grad_pass_manager.add(inductor_config[PASS_KEY])
inductor_config[PASS_KEY] = self.post_grad_pass_manager
......
......@@ -513,7 +513,7 @@ if flashinfer_comm is not None:
torch.ops._C.static_scaled_fp8_quant(
quant_out, norm_out, scale_factor)
if scale_factor is None or norm_out is not None:
# we need to return allreduce outpput
# we need to return allreduce output
# in cases of non quant fused AR + RMS norm
# and fused AR + RMS norm + quant without fused add
allreduce_in.copy_(allreduce_out)
......
......@@ -39,6 +39,7 @@ class AttentionQuantPattern(ABC):
self,
layer: Attention,
quant_key: QuantKey,
dtype: torch.dtype,
):
self.layer = layer
self.layer_name = layer.layer_name
......@@ -46,11 +47,16 @@ class AttentionQuantPattern(ABC):
self.head_size = layer.head_size
self.quant_key = quant_key
self.quant_dtype = quant_key.dtype
self.dtype = dtype
assert self.quant_key in QUANT_OPS, \
f"unsupported quantization scheme {self.quant_key}"
self.QUANT_OP = QUANT_OPS[self.quant_key]
def empty(self, *args, **kwargs):
kwargs = {'dtype': self.dtype, 'device': "cuda", **kwargs}
return torch.empty(*args, **kwargs)
def empty_quant(self, *args, **kwargs):
kwargs = {'dtype': self.quant_dtype, 'device': "cuda", **kwargs}
return torch.empty(*args, **kwargs)
......@@ -91,12 +97,13 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern):
def __init__(
self,
layer: Attention,
dtype: torch.dtype,
symmetric: bool = True,
):
quant_key = QuantKey(dtype=FP8_DTYPE,
scale=kStaticTensorScale,
symmetric=symmetric)
super().__init__(layer, quant_key)
super().__init__(layer, quant_key, dtype)
def _register(self, pm_pass: PatternMatcherPass):
......@@ -139,10 +146,14 @@ class AttentionFp8StaticQuantPattern(AttentionQuantPattern):
return RESHAPE_OP(at1[1], [-1, self.num_heads * self.head_size])
inputs = [
empty_bf16(5, self.num_heads, self.head_size), # q
empty_bf16(5, self.num_heads, self.head_size), # k
empty_bf16(5, self.num_heads, self.head_size), # v
empty_bf16(5, self.num_heads, self.head_size), # attn_output
self.empty(5, self.num_heads, self.head_size,
dtype=self.dtype), # q
self.empty(5, self.num_heads, self.head_size,
dtype=self.dtype), # k
self.empty(5, self.num_heads, self.head_size,
dtype=self.dtype), # v
self.empty(5, self.num_heads, self.head_size,
dtype=self.dtype), # attn_output
self.empty_quant(5,
self.num_heads * self.head_size), # quant_output
empty_fp32(1, 1) # scale
......@@ -165,8 +176,8 @@ class AttentionNvfp4QuantPattern(AttentionQuantPattern):
will be passed into Attention op as the `output_scale` argument.
"""
def __init__(self, layer: Attention):
super().__init__(layer, kNvfp4Quant)
def __init__(self, layer: Attention, dtype: torch.dtype):
super().__init__(layer, kNvfp4Quant, dtype)
def _register(self, pm_pass: PatternMatcherPass):
......@@ -255,11 +266,15 @@ class AttnFusionPass(VllmInductorPass):
attn_layers = get_layers_from_vllm_config(config, Attention)
for layer_name, layer in attn_layers.items():
pattern_fp8 = AttentionFp8StaticQuantPattern(layer)
pattern_fp8 = AttentionFp8StaticQuantPattern(
layer, config.model_config.dtype)
pattern_fp8.register_if_supported(self.patterns)
pattern_nvfp4 = AttentionNvfp4QuantPattern(layer)
pattern_nvfp4.register_if_supported(self.patterns)
if current_platform.is_cuda() and hasattr(torch.ops._C,
"scaled_fp4_quant"):
pattern_nvfp4 = AttentionNvfp4QuantPattern(
layer, config.model_config.dtype)
pattern_nvfp4.register_if_supported(self.patterns)
if len(attn_layers) == 0:
logger.warning(
......
......@@ -8,8 +8,8 @@ import enum
import hashlib
import inspect
import json
import os
import textwrap
import uuid
import warnings
from collections.abc import Mapping
from contextlib import contextmanager
......@@ -33,12 +33,17 @@ from vllm.config.cache import (BlockSize, CacheConfig, CacheDType, MambaDType,
PrefixCachingHashAlgo)
from vllm.config.compilation import (CompilationConfig, CompilationLevel,
CUDAGraphMode, PassConfig)
from vllm.config.kv_events import KVEventsConfig
from vllm.config.kv_transfer import KVTransferConfig
from vllm.config.load import LoadConfig
from vllm.config.lora import LoRAConfig
from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig,
ParallelConfig)
from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy
from vllm.config.utils import ConfigType, config
from vllm.logger import init_logger
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.platforms import current_platform
from vllm.transformers_utils.config import (
ConfigFormat, get_config, get_hf_image_processor_config,
......@@ -47,9 +52,11 @@ from vllm.transformers_utils.config import (
is_interleaved, maybe_override_with_speculators_target_model,
try_get_generation_config, try_get_safetensors_metadata,
try_get_tokenizer_config, uses_mrope)
from vllm.transformers_utils.s3_utils import S3Model
from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, LayerBlockType,
from vllm.transformers_utils.runai_utils import (ObjectStorageModel,
is_runai_obj_uri)
from vllm.transformers_utils.utils import maybe_model_redirect
from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
STR_DUAL_CHUNK_FLASH_ATTN_VAL, LayerBlockType,
LazyLoader, common_broadcastable_dtype, random_uuid)
if TYPE_CHECKING:
......@@ -61,8 +68,6 @@ if TYPE_CHECKING:
from vllm.model_executor.layers.quantization import QuantizationMethods
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.model_loader import LoadFormats
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
from vllm.v1.sample.logits_processor import LogitsProcessor
HfOverrides = Union[dict, Callable[[type], type]]
......@@ -72,8 +77,6 @@ else:
QuantizationConfig = Any
QuantizationMethods = Any
BaseModelLoader = Any
LoadFormats = Any
TensorizerConfig = Any
LogitsProcessor = Any
HfOverrides = Union[dict[str, Any], Callable[[type], type]]
......@@ -170,6 +173,7 @@ class ModelImpl(str, enum.Enum):
AUTO = "auto"
VLLM = "vllm"
TRANSFORMERS = "transformers"
TERRATORCH = "terratorch"
def get_attr_docs(cls: type[Any]) -> dict[str, str]:
......@@ -418,7 +422,7 @@ class ModelConfig:
`--media-io-kwargs '{"video": {"num_frames": 40} }'` """
use_async_output_proc: bool = True
"""Whether to use async output processor."""
config_format: Union[str, ConfigFormat] = ConfigFormat.AUTO.value
config_format: Union[str, ConfigFormat] = "auto"
"""The format of the model config to load:\n
- "auto" will try to load the config in hf format if available else it
will try to load in mistral format.\n
......@@ -459,11 +463,6 @@ class ModelConfig:
DP (which is controlled by `--data-parallel-size`).
This is only supported on a per-model basis and falls back to
`"weights"` if the encoder does not support DP."""
override_neuron_config: dict[str, Any] = field(default_factory=dict)
"""Initialize non-default neuron config or override default neuron config
that are specific to Neuron devices, this argument will be used to
configure the neuron config that can not be gathered from the vllm
arguments. e.g. `{"cast_logits_dtype": "bfloat16"}`."""
pooler_config: Optional["PoolerConfig"] = field(init=False)
"""Pooler config which controls the behaviour of output pooling in pooling
models."""
......@@ -495,7 +494,9 @@ class ModelConfig:
back to the Transformers implementation if no vLLM implementation is
available.\n
- "vllm" will use the vLLM model implementation.\n
- "transformers" will use the Transformers model implementation."""
- "transformers" will use the Transformers model implementation.\n
- "terratorch" will use the TerraTorch model implementation.
"""
override_attention_dtype: Optional[str] = None
"""Override dtype for attention"""
logits_processors: Optional[list[Union[str, type[LogitsProcessor]]]] = None
......@@ -560,15 +561,6 @@ class ModelConfig:
"affect the random state of the Python process that "
"launched vLLM.", self.seed)
if self.runner != "draft":
# If we're not running the draft model, check for speculators config
# If speculators config, set model / tokenizer to be target model
self.model, self.tokenizer = maybe_override_with_speculators_target_model( # noqa: E501
model=self.model,
tokenizer=self.tokenizer,
revision=self.revision,
trust_remote_code=self.trust_remote_code)
# Keep set served_model_name before maybe_model_redirect(self.model)
self.served_model_name = get_served_model_name(self.model,
self.served_model_name)
......@@ -607,7 +599,16 @@ class ModelConfig:
f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
warnings.warn(DeprecationWarning(msg), stacklevel=2)
self.maybe_pull_model_tokenizer_for_s3(self.model, self.tokenizer)
self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
if self.runner != "draft":
# If we're not running the draft model, check for speculators config
# If speculators config, set model / tokenizer to be target model
self.model, self.tokenizer = maybe_override_with_speculators_target_model( # noqa: E501
model=self.model,
tokenizer=self.tokenizer,
revision=self.revision,
trust_remote_code=self.trust_remote_code)
if (backend := envs.VLLM_ATTENTION_BACKEND
) and backend == "FLASHINFER" and find_spec("flashinfer") is None:
......@@ -630,9 +631,6 @@ class ModelConfig:
raise ValueError(
"Sleep mode is not supported on current platform.")
if isinstance(self.config_format, str):
self.config_format = ConfigFormat(self.config_format)
hf_config = get_config(self.hf_config_path or self.model,
self.trust_remote_code,
self.revision,
......@@ -749,7 +747,7 @@ class ModelConfig:
self.pooler_config = self._init_pooler_config()
self.dtype = _get_and_verify_dtype(
self.dtype: torch.dtype = _get_and_verify_dtype(
self.model,
self.hf_config,
self.dtype,
......@@ -785,10 +783,6 @@ class ModelConfig:
if not self.skip_tokenizer_init:
self._verify_tokenizer_mode()
if (not current_platform.is_neuron() and self.override_neuron_config):
raise ValueError(
"`override_neuron_config` is only supported on Neuron.")
# Avoid running try_verify_and_update_config multiple times
self.config_updated = False
......@@ -840,41 +834,42 @@ class ModelConfig:
"""The architecture vllm actually used."""
return self._architecture
def maybe_pull_model_tokenizer_for_s3(self, model: str,
tokenizer: str) -> None:
"""Pull model/tokenizer from S3 to temporary directory when needed.
def maybe_pull_model_tokenizer_for_runai(self, model: str,
tokenizer: str) -> None:
"""Pull model/tokenizer from Object Storage to temporary
directory when needed.
Args:
model: Model name or path
tokenizer: Tokenizer name or path
"""
if not (is_s3(model) or is_s3(tokenizer)):
if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)):
return
if is_s3(model):
s3_model = S3Model()
s3_model.pull_files(model,
allow_pattern=["*.model", "*.py", "*.json"])
if is_runai_obj_uri(model):
object_storage_model = ObjectStorageModel()
object_storage_model.pull_files(
model, allow_pattern=["*.model", "*.py", "*.json"])
self.model_weights = model
self.model = s3_model.dir
self.model = object_storage_model.dir
# If tokenizer is same as model, download to same directory
if model == tokenizer:
s3_model.pull_files(model,
ignore_pattern=[
"*.pt", "*.safetensors", "*.bin",
"*.tensors"
])
self.tokenizer = s3_model.dir
object_storage_model.pull_files(model,
ignore_pattern=[
"*.pt", "*.safetensors",
"*.bin", "*.tensors"
])
self.tokenizer = object_storage_model.dir
return
# Only download tokenizer if needed and not already handled
if is_s3(tokenizer):
s3_tokenizer = S3Model()
s3_tokenizer.pull_files(
if is_runai_obj_uri(tokenizer):
object_storage_tokenizer = ObjectStorageModel()
object_storage_tokenizer.pull_files(
model,
ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors"])
self.tokenizer = s3_tokenizer.dir
self.tokenizer = object_storage_tokenizer.dir
def _init_multimodal_config(self) -> Optional["MultiModalConfig"]:
if self._model_info.supports_multimodal:
......@@ -1101,11 +1096,11 @@ class ModelConfig:
assert_never(runner_type)
def _parse_quant_hf_config(self):
quant_cfg = getattr(self.hf_config, "quantization_config", None)
def _parse_quant_hf_config(self, hf_config: PretrainedConfig):
quant_cfg = getattr(hf_config, "quantization_config", None)
if quant_cfg is None:
# compressed-tensors uses a "compression_config" key
quant_cfg = getattr(self.hf_config, "compression_config", None)
quant_cfg = getattr(hf_config, "compression_config", None)
else:
# Set quant_method for ModelOpt models.
......@@ -1146,7 +1141,11 @@ class ModelConfig:
self.quantization)
# Parse quantization method from the HF model config, if available.
quant_cfg = self._parse_quant_hf_config()
quant_cfg = self._parse_quant_hf_config(self.hf_config)
if quant_cfg is None and (text_config := getattr(
self.hf_config, "text_config", None)):
# Check the text config as well for multi-modal models.
quant_cfg = self._parse_quant_hf_config(text_config)
if quant_cfg is not None:
# Use the community standard 'quant_method'
......@@ -1178,7 +1177,7 @@ class ModelConfig:
]
# Any custom overrides will be in quantization_methods so we place
# them at the start of the list so custom overrides have preference
# over the built in ones.
# over the built-in ones.
quantization_methods = quantization_methods + overrides
# Detect which checkpoint is it
......@@ -1308,6 +1307,10 @@ class ModelConfig:
self.hf_config.dual_chunk_attention_config[
"sparse_attention_enabled"] = True
if envs.VLLM_ATTENTION_BACKEND != STR_DUAL_CHUNK_FLASH_ATTN_VAL:
raise ValueError("please set VLLM_ATTENTION_BACKEND to "
f"{STR_DUAL_CHUNK_FLASH_ATTN_VAL}")
def verify_async_output_proc(self, parallel_config, speculative_config,
device_config) -> None:
if not self.use_async_output_proc:
......@@ -1422,6 +1425,11 @@ class ModelConfig:
if getattr(self.hf_text_config, "head_dim", None) is not None:
return self.hf_text_config.head_dim
# NOTE: Some models (such as PLaMo2.1) use `hidden_size_per_head`
if getattr(self.hf_text_config, "hidden_size_per_head",
None) is not None:
return self.hf_text_config.hidden_size_per_head
# FIXME(woosuk): This may not be true for all models.
return (self.hf_text_config.hidden_size //
self.hf_text_config.num_attention_heads)
......@@ -1505,7 +1513,8 @@ class ModelConfig:
if (self.hf_text_config.model_type == "deepseek_mtp"
or self.hf_config.model_type == "mimo_mtp"
or self.hf_config.model_type == "glm4_moe_mtp"
or self.hf_config.model_type == "ernie_mtp"):
or self.hf_config.model_type == "ernie_mtp"
or self.hf_config.model_type == "qwen3_next_mtp"):
total_num_hidden_layers = getattr(self.hf_text_config,
"num_nextn_predict_layers", 0)
else:
......@@ -1549,7 +1558,7 @@ class ModelConfig:
for bc in block_configs[start:end])
else:
# Hybrid model Jamba
layers_block_type_value = getattr(self.hf_config,
layers_block_type_value = getattr(self.hf_text_config,
"layers_block_type", None)
if layers_block_type_value is not None:
if hasattr(self.hf_text_config,
......@@ -1568,15 +1577,28 @@ class ModelConfig:
if attn_type_list:
return sum(t == 1 for t in attn_type_list[start:end])
if layers_block_type_value is None and attn_type_list is None:
# Hybrid model Qwen3Next
layer_types_value = getattr(self.hf_config, "layer_types", None)
if layer_types_value is not None:
if getattr(block_type, "value", block_type) == "attention":
return sum(t == "full_attention"
for t in layer_types_value[start:end])
elif getattr(block_type, "value",
block_type) == "linear_attention":
return sum(t == "linear_attention"
for t in layer_types_value[start:end])
else:
return sum(t == getattr(block_type, "value", block_type)
for t in layer_types_value[start:end])
if (layers_block_type_value is None and attn_type_list is None
and layer_types_value is None):
raise ValueError(
"The model is an hybrid without a"
"layers_block_type or an attn_type_list in the hf_config,"
"cannot determine the num of "
"layers_block_type or an attn_type_list, or a layer_types "
"in the hf_config, cannot determine the num of "
f"{block_type.value} layers")
return sum(t == 1 for t in attn_type_list[start:end])
def get_mamba_chunk_size(self) -> Optional[int]:
"""
Returns the mamba chunk size if it exists
......@@ -1687,13 +1709,7 @@ class ModelConfig:
"""
For Mllama, VLLM overrides HF's is_encoder_decoder flag and sets it to
True to enable cross-attention
Neuron needs all multimodal data to be in the decoder and does not
need to explicitly enable cross-attention
"""
if (current_platform.is_neuron()
and self.hf_config.model_type == "mllama"):
return False
return is_encoder_decoder(self.hf_config)
@property
......@@ -1756,6 +1772,32 @@ class ModelConfig:
# `llm as reranker` models defaults to not using pad_token.
return getattr(self.hf_config, "use_pad_token", True)
@property
def head_dtype(self) -> torch.dtype:
"""
"head" refers to the last Linear layer(s) of an LLM,
such as the lm_head in a generation model,
or the score or classifier in a classification model.
The default head_dtype based on runner_type.\n
- The pooling model defaults to using fp32 head,
you can use --hf-overrides '{"head_dtype": "model"}' to disable it.\n
- The generate model defaults to not using fp32 head,
you can use --hf-overrides '{"head_dtype": "float32"}' to enable it.
"""
head_dtype = _get_head_dtype(config=self.hf_config,
dtype=self.dtype,
runner_type=self.runner_type)
if head_dtype not in current_platform.supported_dtypes:
logger.warning_once(
"The current platform does not support [%s] head dtype, "
"fallback to model dtype [%s].", head_dtype, self.dtype)
return self.dtype
logger.debug_once("head dtype: %s", head_dtype)
return head_dtype
def get_and_verify_max_len(self, max_model_len: int):
# Consider max_model_len in tokenizer_config only when
# pooling models use absolute position_embedding.
......@@ -1778,90 +1820,7 @@ class ModelConfig:
return max_model_len
@config
@dataclass
class LoadConfig:
"""Configuration for loading the model weights."""
load_format: Union[str, LoadFormats] = "auto"
"""The format of the model weights to load:\n
- "auto" will try to load the weights in the safetensors format and fall
back to the pytorch bin format if safetensors format is not available.\n
- "pt" will load the weights in the pytorch bin format.\n
- "safetensors" will load the weights in the safetensors format.\n
- "npcache" will load the weights in pytorch format and store a numpy cache
to speed up the loading.\n
- "dummy" will initialize the weights with random values, which is mainly
for profiling.\n
- "tensorizer" will use CoreWeave's tensorizer library for fast weight
loading. See the Tensorize vLLM Model script in the Examples section for
more information.\n
- "runai_streamer" will load the Safetensors weights using Run:ai Model
Streamer.\n
- "bitsandbytes" will load the weights using bitsandbytes quantization.\n
- "sharded_state" will load weights from pre-sharded checkpoint files,
supporting efficient loading of tensor-parallel models.\n
- "gguf" will load weights from GGUF format files (details specified in
https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n
- "mistral" will load weights from consolidated safetensors files used by
Mistral models.
- Other custom values can be supported via plugins."""
download_dir: Optional[str] = None
"""Directory to download and load the weights, default to the default
cache directory of Hugging Face."""
model_loader_extra_config: Union[dict, TensorizerConfig] = field(
default_factory=dict)
"""Extra config for model loader. This will be passed to the model loader
corresponding to the chosen load_format."""
device: Optional[str] = None
"""Device to which model weights will be loaded, default to
device_config.device"""
ignore_patterns: Optional[Union[list[str], str]] = None
"""The list of patterns to ignore when loading the model. Default to
"original/**/*" to avoid repeated loading of llama's checkpoints."""
use_tqdm_on_load: bool = True
"""Whether to enable tqdm for showing progress bar when loading model
weights."""
pt_load_map_location: Union[str, dict[str, str]] = "cpu"
"""
pt_load_map_location: the map location for loading pytorch checkpoint, to
support loading checkpoints can only be loaded on certain devices like
"cuda", this is equivalent to {"": "cuda"}. Another supported format is
mapping from different devices like from GPU 1 to GPU 0:
{"cuda:1": "cuda:0"}. Note that when passed from command line, the strings
in dictionary needs to be double quoted for json parsing. For more details,
see original doc for `map_location` in https://pytorch.org/docs/stable/generated/torch.load.html
"""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
# no factors to consider.
# this config will not affect the computation graph.
factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str
def __post_init__(self):
self.load_format = self.load_format.lower()
if self.ignore_patterns is not None and len(self.ignore_patterns) > 0:
logger.info(
"Ignoring the following patterns when downloading weights: %s",
self.ignore_patterns)
else:
self.ignore_patterns = ["original/**/*"]
Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu"]
Device = Literal["auto", "cuda", "cpu", "tpu", "xpu"]
@config
......@@ -1917,9 +1876,7 @@ class DeviceConfig:
self.device_type = self.device.type
# Some device types require processing inputs on CPU
if self.device_type in ["neuron"]:
self.device = torch.device("cpu")
elif self.device_type in ["tpu"]:
if self.device_type in ["tpu"]:
self.device = None
else:
# Set device with device type
......@@ -1928,7 +1885,7 @@ class DeviceConfig:
SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa",
"mlp_speculator", "draft_model", "deepseek_mtp",
"ernie_mtp"]
"ernie_mtp", "qwen3_next_mtp"]
@config
......@@ -2069,7 +2026,15 @@ class SpeculativeConfig:
"n_predict": n_predict,
"architectures": ["ErnieMTPModel"]
})
return hf_config
if hf_config.model_type == "qwen3_next":
hf_config.model_type = "qwen3_next_mtp"
if hf_config.model_type == "qwen3_next_mtp":
n_predict = getattr(hf_config, "num_nextn_predict_layers", None)
hf_config.update({
"n_predict": n_predict,
"architectures": ["Qwen3NextMTP"]
})
return hf_config
......@@ -2090,9 +2055,13 @@ class SpeculativeConfig:
(self.target_model_config.hf_text_config.model_type \
== "deepseek_v3" or
self.target_model_config.hf_text_config.model_type in
("mimo","ernie4_5_moe")):
("mimo","ernie4_5_moe", "qwen3_next")):
# use the draft model from the same model:
self.model = self.target_model_config.model
# Align the quantization of draft model for cases such as
# --quantization fp8 with a bf16 checkpoint.
if not self.quantization:
self.quantization = self.target_model_config.quantization
elif self.method in ("ngram", "[ngram]"):
self.model = "ngram"
else:
......@@ -2171,9 +2140,14 @@ class SpeculativeConfig:
# Automatically detect the method
if self.method in ('eagle', 'eagle3'):
pass
elif "eagle-" in self.draft_model_config.model.lower() or \
"eagle3-" in self.draft_model_config.model.lower():
# examples:
# yuhuili/EAGLE-LLaMA3-Instruct-8B
# yuhuili/EAGLE3-LLaMA3.1-Instruct-8B
# AngelSlim/Qwen3-8B_eagle3
elif "eagle-" in self.draft_model_config.model.lower():
self.method = "eagle"
elif "eagle3" in self.draft_model_config.model.lower():
self.method = "eagle3"
elif self.draft_model_config.hf_config.model_type == "medusa":
self.method = "medusa"
elif (self.draft_model_config.hf_config.model_type ==
......@@ -2197,6 +2171,15 @@ class SpeculativeConfig:
"one layer. Might need some code changes " \
"to support multiple layers."
)
elif (self.draft_model_config.hf_config.model_type ==
"qwen3_next_mtp"):
self.method = "qwen3_next_mtp"
if self.num_speculative_tokens > 1:
logger.warning(
"All Qwen3Next MTP models only have " \
"one layer. Might need some code changes " \
"to support multiple layers."
)
else:
self.method = "draft_model"
raise NotImplementedError(
......@@ -2412,7 +2395,8 @@ class SpeculativeConfig:
return self.num_speculative_tokens
def use_eagle(self) -> bool:
return self.method in ("eagle", "eagle3", "deepseek_mtp", "ernie_mtp")
return self.method in ("eagle", "eagle3", "deepseek_mtp", "ernie_mtp",
"qwen3_next_mtp")
def __repr__(self) -> str:
method = self.method
......@@ -2421,111 +2405,6 @@ class SpeculativeConfig:
return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"
LoRADType = Literal["auto", "float16", "bfloat16"]
@config
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class LoRAConfig:
"""Configuration for LoRA."""
max_lora_rank: int = 16
"""Max LoRA rank."""
max_loras: int = 1
"""Max number of LoRAs in a single batch."""
fully_sharded_loras: bool = False
"""By default, only half of the LoRA computation is sharded with tensor
parallelism. Enabling this will use the fully sharded layers. At high
sequence length, max rank or tensor parallel size, this is likely faster.
"""
max_cpu_loras: Optional[int] = None
"""Maximum number of LoRAs to store in CPU memory. Must be >= than
`max_loras`."""
lora_dtype: Union[torch.dtype, LoRADType] = "auto"
"""Data type for LoRA. If auto, will default to base model dtype."""
lora_extra_vocab_size: int = 256
"""(Deprecated) Maximum size of extra vocabulary that can be present in a
LoRA adapter. Will be removed in v0.12.0."""
lora_vocab_padding_size: ClassVar[int] = current_platform\
.get_lora_vocab_padding_size()
default_mm_loras: Optional[dict[str, str]] = None
"""Dictionary mapping specific modalities to LoRA model paths; this field
is only applicable to multimodal models and should be leveraged when a
model always expects a LoRA to be active when a given modality is present.
Note that currently, if a request provides multiple additional
modalities, each of which have their own LoRA, we do NOT apply
default_mm_loras because we currently only support one lora adapter
per prompt. When run in offline mode, the lora IDs for n modalities
will be automatically assigned to 1-n with the names of the modalities
in alphabetic order."""
bias_enabled: bool = False
"""Enable bias for LoRA adapters."""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
factors: list[Any] = []
factors.append(self.max_lora_rank)
factors.append(self.max_loras)
factors.append(self.fully_sharded_loras)
factors.append(self.lora_dtype)
factors.append(self.lora_extra_vocab_size)
factors.append(self.lora_vocab_padding_size)
factors.append(self.bias_enabled)
hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str
def __post_init__(self):
# Deprecation warning for lora_extra_vocab_size
logger.warning(
"`lora_extra_vocab_size` is deprecated and will be removed "
"in v0.12.0. Additional vocabulary support for "
"LoRA adapters is being phased out.")
# Setting the maximum rank to 512 should be able to satisfy the vast
# majority of applications.
possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
possible_lora_extra_vocab_size = (256, 512)
if self.max_lora_rank not in possible_max_ranks:
raise ValueError(
f"max_lora_rank ({self.max_lora_rank}) must be one of "
f"{possible_max_ranks}.")
if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size:
raise ValueError(
f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
f"must be one of {possible_lora_extra_vocab_size}.")
if self.max_loras < 1:
raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")
if self.max_cpu_loras is None:
self.max_cpu_loras = self.max_loras
elif self.max_cpu_loras < self.max_loras:
raise ValueError(
f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
f"max_loras ({self.max_loras})")
def verify_with_cache_config(self, cache_config: CacheConfig):
if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
raise ValueError(
"V0 LoRA does not support CPU offload, please use V1.")
def verify_with_model_config(self, model_config: ModelConfig):
if self.lora_dtype in (None, "auto"):
self.lora_dtype = model_config.dtype
elif isinstance(self.lora_dtype, str):
self.lora_dtype = getattr(torch, self.lora_dtype)
@config
@dataclass
class MultiModalConfig:
......@@ -2654,24 +2533,46 @@ class PoolerConfig:
## for embeddings models
normalize: Optional[bool] = None
"""
Whether to normalize the embeddings outputs.
Whether to normalize the embeddings outputs. Defaults to True.
"""
dimensions: Optional[int] = None
"""
Reduce the dimensions of embeddings if model
support matryoshka representation.
support matryoshka representation. Defaults to None.
"""
enable_chunked_processing: Optional[bool] = None
"""
Whether to enable chunked processing for long inputs that exceed the model's
maximum position embeddings. When enabled, long inputs will be split into
chunks, processed separately, and then aggregated using weighted averaging.
This allows embedding models to handle arbitrarily long text without CUDA
errors. Defaults to False.
"""
max_embed_len: Optional[int] = None
"""
Maximum input length allowed for embedding generation. When set, allows
inputs longer than max_embed_len to be accepted for embedding models.
When an input exceeds max_embed_len, it will be handled according to
the original max_model_len validation logic.
Defaults to None (i.e. set to max_model_len).
"""
## for classification models
activation: Optional[bool] = None
"""
Whether to apply activation function to the classification outputs.
Defaults to True.
"""
logit_bias: Optional[float] = None
"""
If provided, apply classification logit biases. Defaults to None.
"""
## for reward models
softmax: Optional[bool] = None
"""
Whether to apply softmax to the reward outputs.
Defaults to True.
"""
step_tag_id: Optional[int] = None
"""
......@@ -2686,25 +2587,6 @@ class PoolerConfig:
``math-shepherd-mistral-7b-prm`` model.
"""
enable_chunked_processing: Optional[bool] = None
"""
Whether to enable chunked processing for long inputs that exceed the model's
maximum position embeddings. When enabled, long inputs will be split into
chunks, processed separately, and then aggregated using weighted averaging.
This allows embedding models to handle arbitrarily long text without CUDA
errors. Defaults to False.
"""
max_embed_len: Optional[int] = None
"""
Maximum input length allowed for embedding generation. When set, allows
inputs longer than max_embed_len to be accepted for embedding models.
This parameter enables accepting long inputs without requiring
VLLM_ALLOW_LONG_MAX_MODEL_LEN environment variable. When an input exceeds
max_embed_len, it will be handled according to the original max_model_len
validation logic. Defaults to None (i.e. set to max_model_len).
"""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
......@@ -2737,6 +2619,8 @@ _STR_DTYPE_TO_TORCH_DTYPE = {
_FLOAT16_NOT_SUPPORTED_MODELS = {
"gemma2": "Numerical instability. Please use bfloat16 or float32 instead.",
"gemma3": "Numerical instability. Please use bfloat16 or float32 instead.",
"gemma3_text":
"Numerical instability. Please use bfloat16 or float32 instead.",
"plamo2": "Numerical instability. Please use bfloat16 or float32 instead.",
"glm4": "Numerical instability. Please use bfloat16 or float32 instead.",
}
......@@ -2889,6 +2773,31 @@ def _get_and_verify_dtype(
return torch_dtype
def _get_head_dtype(config: PretrainedConfig, dtype: torch.dtype,
runner_type: str) -> torch.dtype:
head_dtype: Optional[Union[str,
torch.dtype]] = getattr(config, "head_dtype",
None)
if head_dtype == "model":
return dtype
elif isinstance(head_dtype, str):
head_dtype = head_dtype.lower()
if head_dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
raise ValueError(f"Unknown dtype: {head_dtype!r}")
return _STR_DTYPE_TO_TORCH_DTYPE[head_dtype]
elif isinstance(head_dtype, torch.dtype):
return head_dtype
elif head_dtype is None:
if torch.float32 not in current_platform.supported_dtypes:
return dtype
if runner_type == "pooling":
return torch.float32
return dtype
else:
raise ValueError(f"Unknown dtype: {head_dtype}")
def _get_and_verify_max_len(
hf_config: PretrainedConfig,
tokenizer_config: Optional[dict],
......@@ -3024,16 +2933,20 @@ def _get_and_verify_max_len(
f"User-specified max_model_len ({max_model_len}) is greater "
f"than the derived max_model_len ({max_len_key}="
f"{derived_max_model_len} or model_max_length="
f"{model_max_length} in model's config.json). This may lead "
"to incorrect model outputs or CUDA errors.")
f"{model_max_length} in model's config.json).")
warning = (
"VLLM_ALLOW_LONG_MAX_MODEL_LEN must be used with extreme "
"caution. If the model uses relative position encoding (RoPE), "
"positions exceeding derived_max_model_len lead to nan. If the "
"model uses absolute position encoding, positions exceeding "
"derived_max_model_len will cause a CUDA array out-of-bounds "
"error.")
if envs.VLLM_ALLOW_LONG_MAX_MODEL_LEN:
logger.warning(
"%s Make sure the value is correct and within the "
"model context size.", msg)
logger.warning_once("%s %s", msg, warning)
else:
raise ValueError(
f"{msg} To allow overriding this maximum, set "
"the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1")
f"the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1. {warning}")
return int(max_model_len)
......@@ -3202,149 +3115,6 @@ class ObservabilityConfig:
self.collect_detailed_traces[0].split(","))
KVProducer = Literal["kv_producer", "kv_both"]
KVConsumer = Literal["kv_consumer", "kv_both"]
KVRole = Literal[KVProducer, KVConsumer]
@config
@dataclass
class KVTransferConfig:
"""Configuration for distributed KV cache transfer."""
kv_connector: Optional[str] = None
"""The KV connector for vLLM to transmit KV caches between vLLM instances.
"""
engine_id: Optional[str] = None
"""The engine id for KV transfers."""
kv_buffer_device: Optional[str] = "cuda"
"""The device used by kv connector to buffer the KV cache.
Currently only support 'cuda'."""
kv_buffer_size: float = 1e9
"""The buffer size for TorchDistributedConnector. Measured in number of
bytes. Recommended value: 1e9 (about 1GB)."""
kv_role: Optional[KVRole] = None
"""Whether this vLLM instance produces, consumes KV cache, or both. Choices
are 'kv_producer', 'kv_consumer', and 'kv_both'."""
kv_rank: Optional[int] = None
"""The rank of this vLLM instance in the KV cache transfer. Typical value:
0 for prefill instance, 1 for decode instance.
Currently only 1P1D is supported."""
kv_parallel_size: int = 1
"""The number of parallel instances for KV cache transfer. For
PyNcclConnector, this should be 2."""
kv_ip: str = "127.0.0.1"
"""The KV connector ip, used to build distributed connection."""
kv_port: int = 14579
"""The KV connector port, used to build distributed connection."""
kv_connector_extra_config: dict[str, Any] = field(default_factory=dict)
"""any extra config that the connector may need."""
kv_connector_module_path: Optional[str] = None
"""The Python module path to dynamically load the KV connector from.
Only supported in V1."""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
# no factors to consider.
# this config will not affect the computation graph.
factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str
def __post_init__(self) -> None:
if self.engine_id is None:
self.engine_id = str(uuid.uuid4())
if self.kv_role is not None and self.kv_role not in get_args(KVRole):
raise ValueError(f"Unsupported kv_role: {self.kv_role}. "
f"Supported roles are {get_args(KVRole)}")
if self.kv_connector is not None and self.kv_role is None:
raise ValueError("Please specify kv_disagg_role when kv_connector "
f"is set, supported roles are {get_args(KVRole)}")
@property
def is_kv_transfer_instance(self) -> bool:
return self.kv_connector is not None and \
self.kv_role in get_args(KVRole)
@property
def is_kv_producer(self) -> bool:
return self.kv_connector is not None and \
self.kv_role in get_args(KVProducer)
@property
def is_kv_consumer(self) -> bool:
return self.kv_connector is not None and \
self.kv_role in get_args(KVConsumer)
def get_from_extra_config(self, key, default) -> Any:
return self.kv_connector_extra_config.get(key, default)
@config
@dataclass
class KVEventsConfig:
"""Configuration for KV event publishing."""
enable_kv_cache_events: bool = False
"""If True, enable KV cache events for tracking block storage and removal.
Events can be published externally by zmq using the event publisher config.
"""
publisher: str = "null"
"""The publisher to use for publishing kv events. Can be "null", "zmq".
"""
endpoint: str = "tcp://*:5557"
"""The zmq endpoint to use for publishing kv events.
"""
replay_endpoint: Optional[str] = None
"""The zmq endpoint to use for replaying kv events.
"""
buffer_steps: int = 10_000
"""The number of steps to cache for replay endpoint. Will only save
events from the last N steps for the replay endpoint.
"""
hwm: int = 100_000
"""The zmq high water mark for the event publisher. After queueing N events,
events will start dropping if the consumer is not keeping up.
"""
max_queue_size: int = 100_000
"""The maximum number of events to queue while waiting for publishing.
"""
topic: str = ""
"""The topic to use for the event publisher. Consumers can subscribe to
this topic to receive events.
"""
@config
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class VllmConfig:
......@@ -3648,6 +3418,24 @@ class VllmConfig:
" Disabling `torch.compile`.")
self.compilation_config.level = CompilationLevel.NO_COMPILATION
if self.cache_config.kv_sharing_fast_prefill:
if not envs.VLLM_USE_V1:
raise NotImplementedError(
"Fast prefill optimization for KV sharing is not supported "
"in V0 currently.")
if self.speculative_config is not None and \
self.speculative_config.use_eagle():
raise NotImplementedError(
"Fast prefill optimization for KV sharing is not "
"compatible with EAGLE as EAGLE requires correct logits "
"for all tokens while fast prefill gives incorrect logits "
"for prompt tokens.")
logger.warning_once(
"--kv-sharing-fast-prefill requires changes on model side for "
"correctness and to realize prefill savings. ")
if ((not envs.VLLM_USE_V1) and self.lora_config is not None
and self.compilation_config.level
!= CompilationLevel.NO_COMPILATION):
......@@ -3658,16 +3446,37 @@ class VllmConfig:
disable_chunked_prefill_reasons: list[str] = []
if self.model_config and self.model_config.pooler_config:
pooling_type = self.model_config.pooler_config.pooling_type
if pooling_type is None or pooling_type.lower() != "last":
disable_chunked_prefill_reasons.append(
"Only \"last\" pooling supports chunked "
"prefill and prefix caching; disabling both.")
elif not getattr(self.model_config.hf_config, "is_causal", True):
if self.model_config:
if self.model_config.pooler_config:
pooling_type = self.model_config.pooler_config.pooling_type
if pooling_type is None or pooling_type.lower() != "last":
disable_chunked_prefill_reasons.append(
"Only \"last\" pooling supports chunked "
"prefill and prefix caching; disabling both.")
if not getattr(self.model_config.hf_config, "is_causal", True):
disable_chunked_prefill_reasons.append(
"Only models using causal attention supports chunked "
"prefill and prefix caching; disabling both.")
elif self.model_config.is_encoder_decoder:
self.scheduler_config.max_num_encoder_input_tokens = \
MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
logger.debug(
"Encoder-decoder model detected: setting "
"`max_num_encoder_input_tokens` to encoder length (%s)",
self.scheduler_config.max_num_encoder_input_tokens)
self.scheduler_config.disable_chunked_mm_input = True
disable_chunked_prefill_reasons.append(
"Only models using causal attention supports chunked "
"prefill and prefix caching; disabling both.")
"Encoder-decoder models do not support chunked prefill nor"
" prefix caching; disabling both.")
if (self.model_config.architecture
== "WhisperForConditionalGeneration"
and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD")
!= "spawn"):
logger.warning(
"Whisper is known to have issues with "
"forked workers. If startup is hanging, "
"try setting 'VLLM_WORKER_MULTIPROC_METHOD' "
"to 'spawn'.")
if disable_chunked_prefill_reasons:
for reason in disable_chunked_prefill_reasons:
......@@ -3724,7 +3533,7 @@ class VllmConfig:
# logger should only print warning message for hybrid models. As we
# can't know whether the model is hybrid or not now, so we don't log
# warning message here and will log it later.
if not (current_platform.is_cuda() or current_platform.is_rocm()):
if not current_platform.support_hybrid_kv_cache():
# Hybrid KV cache manager is not supported on non-GPU platforms.
self.scheduler_config.disable_hybrid_kv_cache_manager = True
if self.kv_transfer_config is not None:
......@@ -3774,30 +3583,40 @@ class VllmConfig:
def _set_cudagraph_sizes(self):
"""
cudagraph batchsize padding logic:
`[1, 2, 4] + [8 * i for i in range(1, 1025)]` is a list of all possible
batch sizes that cudagraph will capture.
Depending on the engine's configuration of `max_num_seqs`, the
candidate batch sizes to capture cudagraph will shrink to the subset
which just cover the range of `[1, max_num_seqs]`. In the common case,
`max_num_seqs` is 256, and the cudagraph batch sizes will be
`[1, 2, 4, 8, 16, 24, 32, 40, ..., 256]`.
vLLM defines the default candidate list of batch sizes for CUDA graph
capture as:
However, if users specify the cudagraph capture sizes through
compilation config, we will use the specified sizes instead.
```python
max_graph_size = min(max_num_seqs * 2, 512)
# 1, 2, 4, then multiples of 8 up to max_graph_size
cuda_graph_sizes = [1, 2, 4, 8, 16, 24, 32, 40, ..., max_graph_size]
In the end, `vllm_config.compilation_config.cudagraph_capture_sizes`
will be the final sizes to capture cudagraph (in descending order).
During runtime, if batchsize is larger than
`vllm_config.compilation_config.cudagraph_capture_sizes`,
no cudagraph will be used.
If the batch size is no larger than
`vllm_config.compilation_config.cudagraph_capture_sizes`,
we can quickly find the padded graph size for a given batch size by
looking up `vllm_config.compilation_config.bs_to_padded_graph_size`.
These sizes are used to capture and reuse CUDA graphs for
performance-critical paths (e.g., decoding). Capturing enables
significantly faster kernel dispatch by avoiding Python overhead. The
list is then filtered based on `max_num_batched_tokens` (e.g., 8192 on
most GPUs), which controls the total allowed number of tokens in a
batch. Since each sequence may have a variable number of tokens, the
maximum usable batch size will depend on actual sequence lengths.
Example:
With `max_num_batched_tokens = 8192`, and typical sequences
averaging ~32 tokens, most practical batch sizes fall below 256.
However, the system will still allow capture sizes up to 512 if
shape and memory permit.
Note:
If users explicitly specify cudagraph capture sizes in the
compilation config, those will override this default logic.
At runtime:
- If batch size <= one of the `cudagraph_capture_sizes`, the closest
padded CUDA graph will be used.
- If batch size > largest `cudagraph_capture_sizes`, cudagraph will
not be used.
"""
# calculate the default `batch_size_capture_list`
......@@ -3899,7 +3718,6 @@ class VllmConfig:
f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
f"tokenizer_mode={self.model_config.tokenizer_mode}, "
f"revision={self.model_config.revision}, "
f"override_neuron_config={self.model_config.override_neuron_config}, " # noqa
f"tokenizer_revision={self.model_config.tokenizer_revision}, "
f"trust_remote_code={self.model_config.trust_remote_code}, "
f"dtype={self.model_config.dtype}, "
......@@ -3908,6 +3726,7 @@ class VllmConfig:
f"load_format={self.load_config.load_format}, "
f"tensor_parallel_size={self.parallel_config.tensor_parallel_size}, " # noqa
f"pipeline_parallel_size={self.parallel_config.pipeline_parallel_size}, " # noqa
f"data_parallel_size={self.parallel_config.data_parallel_size}, " # noqa
f"disable_custom_all_reduce={self.parallel_config.disable_custom_all_reduce}, " # noqa
f"quantization={self.model_config.quantization}, "
f"enforce_eager={self.model_config.enforce_eager}, "
......@@ -4006,7 +3825,7 @@ def contains_object_print(text):
Check if the text looks like a printed Python object, e.g.
contains any substring matching the pattern: "at 0xFFFFFFF>"
We match against 0x followed by 2-16 hex chars (there's
a max of 16 on a 64 bit system).
a max of 16 on a 64-bit system).
Args:
text (str): The text to check
......
......@@ -24,7 +24,7 @@ logger = init_logger(__name__)
BlockSize = Literal[1, 8, 16, 32, 64, 128]
CacheDType = Literal["auto", "fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc"]
MambaDType = Literal["auto", "float32"]
PrefixCachingHashAlgo = Literal["builtin", "sha256", "sha256_cbor_64bit"]
PrefixCachingHashAlgo = Literal["sha256", "sha256_cbor"]
@config
......@@ -33,9 +33,8 @@ class CacheConfig:
"""Configuration for the KV cache."""
block_size: SkipValidation[BlockSize] = None # type: ignore
"""Size of a contiguous cache block in number of tokens. This is ignored on
neuron devices and set to `--max-model-len`. On CUDA devices, only block
sizes up to 32 are supported. On HPU devices, block size defaults to 128.
"""Size of a contiguous cache block in number of tokens. On CUDA devices,
only block sizes up to 32 are supported.
This config has no static default. If left unspecified by the user, it will
be set in `Platform.check_and_update_config()` based on the current
......@@ -64,17 +63,12 @@ class CacheConfig:
"""Sliding window size for the KV cache. This is primarily set in
`ModelConfig` and that value should be manually duplicated here."""
enable_prefix_caching: Optional[bool] = None
"""Whether to enable prefix caching. Disabled by default for V0. Enabled by
default for V1."""
prefix_caching_hash_algo: PrefixCachingHashAlgo = "builtin"
"""Whether to enable prefix caching. Enabled by default for V1."""
prefix_caching_hash_algo: PrefixCachingHashAlgo = "sha256"
"""Set the hash algorithm for prefix caching:\n
- "builtin" is Python's built-in hash.\n
- "sha256" is collision resistant but with certain overheads.
This option uses Pickle for object serialization before hashing.\n
- "sha256_cbor_64bit" provides a reproducible, cross-language compatible
hash. It serializes objects using canonical CBOR and hashes them with
SHA-256. The resulting hash consists of the lower 64 bits of the SHA-256
digest."""
- "sha256" uses Pickle for object serialization before hashing.\n
- "sha256_cbor" provides a reproducible, cross-language compatible hash. It
serializes objects using canonical CBOR and hashes them with SHA-256."""
cpu_offload_gb: float = 0
"""The space in GiB to offload to CPU, per GPU. Default is 0, which means
no offloading. Intuitively, this argument can be seen as a virtual way to
......@@ -119,6 +113,15 @@ class CacheConfig:
necessary for implementing this optimization in some models (e.g. Gemma3n)
"""
kv_cache_memory_bytes: Optional[int] = None
"""Size of KV Cache per GPU in bytes. By default, this is set to None
and vllm can automatically infer the kv cache size based on
gpu_memory_utilization. However, users may want to manually specify
the kv cache memory size. kv_cache_memory_bytes allows more fine-grain
control of how much memory gets used when compared with using
gpu_memory_memory_utilization. Note that kv_cache_memory_bytes
(when not-None) ignores gpu_memory_utilization"""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
......@@ -145,19 +148,12 @@ class CacheConfig:
self._verify_cache_dtype()
self._verify_prefix_caching()
self._verify_kv_sharing_fast_prefill()
def metrics_info(self):
# convert cache_config to dict(key: str, value: str) for prometheus
# metrics info
return {key: str(value) for key, value in self.__dict__.items()}
def _verify_kv_sharing_fast_prefill(self) -> None:
if self.kv_sharing_fast_prefill and not envs.VLLM_USE_V1:
raise NotImplementedError(
"Fast prefill optimization for KV sharing is not supported "
"in V0 currently.")
@model_validator(mode='after')
def _verify_args(self) -> Self:
if self.cpu_offload_gb < 0:
......
......@@ -234,7 +234,7 @@ class CompilationConfig:
- FULL_AND_PIECEWISE.
PIECEWISE mode build piecewise cudagraph only, keeping the cudagraph
incompatiable ops (i.e. some attention ops) outside the cudagraph
incompatible ops (i.e. some attention ops) outside the cudagraph
for general flexibility.
This is the default mode.
......@@ -340,6 +340,8 @@ class CompilationConfig:
"vllm.mamba_mixer",
"vllm.short_conv",
"vllm.linear_attention",
"vllm.plamo2_mamba_mixer",
"vllm.gdn_attention",
]
def compute_hash(self) -> str:
......@@ -545,7 +547,8 @@ class CompilationConfig:
# full cudagraph outside the fx graph. This reduces some cpu
# overhead when the runtime batch_size is not cudagraph captured.
# see https://github.com/vllm-project/vllm/pull/20059 for details.
self.splitting_ops = self._attention_ops
# make a copy to avoid mutating the class-level list via reference.
self.splitting_ops = list(self._attention_ops)
elif len(self.splitting_ops) == 0:
logger.warning_once("Using piecewise compilation with empty "
"splitting_ops.")
......@@ -560,6 +563,18 @@ class CompilationConfig:
self.cudagraph_mode = CUDAGraphMode.FULL
self.splitting_ops = []
if envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput":
# exclude MoE dispatch/combine from capture by ensuring
# piecewise splitting includes them, so communication remains
# outside CUDA graphs while compute can still be graphed.
moe_ops = [
"vllm.moe_forward",
"vllm.moe_forward_shared",
]
for op in moe_ops:
if op not in self.splitting_ops:
self.splitting_ops.append(op)
def splitting_ops_contain_attention(self) -> bool:
return self.splitting_ops is not None and all(
op in self.splitting_ops for op in self._attention_ops)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Optional
from pydantic.dataclasses import dataclass
from vllm.config.utils import config
@config
@dataclass
class KVEventsConfig:
"""Configuration for KV event publishing."""
enable_kv_cache_events: bool = False
"""If True, enable KV cache events for tracking block storage and removal.
Events can be published externally by zmq using the event publisher config.
"""
publisher: str = "null"
"""The publisher to use for publishing kv events. Can be "null", "zmq".
"""
endpoint: str = "tcp://*:5557"
"""The zmq endpoint to use for publishing kv events.
"""
replay_endpoint: Optional[str] = None
"""The zmq endpoint to use for replaying kv events.
"""
buffer_steps: int = 10_000
"""The number of steps to cache for replay endpoint. Will only save
events from the last N steps for the replay endpoint.
"""
hwm: int = 100_000
"""The zmq high water mark for the event publisher. After queueing N events,
events will start dropping if the consumer is not keeping up.
"""
max_queue_size: int = 100_000
"""The maximum number of events to queue while waiting for publishing.
"""
topic: str = ""
"""The topic to use for the event publisher. Consumers can subscribe to
this topic to receive events.
"""
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
import uuid
from dataclasses import field
from typing import Any, Literal, Optional, get_args
from pydantic.dataclasses import dataclass
from vllm.config.utils import config
KVProducer = Literal["kv_producer", "kv_both"]
KVConsumer = Literal["kv_consumer", "kv_both"]
KVRole = Literal[KVProducer, KVConsumer]
@config
@dataclass
class KVTransferConfig:
"""Configuration for distributed KV cache transfer."""
kv_connector: Optional[str] = None
"""The KV connector for vLLM to transmit KV caches between vLLM instances.
"""
engine_id: Optional[str] = None
"""The engine id for KV transfers."""
kv_buffer_device: Optional[str] = "cuda"
"""The device used by kv connector to buffer the KV cache.
Currently only support 'cuda'."""
kv_buffer_size: float = 1e9
"""The buffer size for TorchDistributedConnector. Measured in number of
bytes. Recommended value: 1e9 (about 1GB)."""
kv_role: Optional[KVRole] = None
"""Whether this vLLM instance produces, consumes KV cache, or both. Choices
are 'kv_producer', 'kv_consumer', and 'kv_both'."""
kv_rank: Optional[int] = None
"""The rank of this vLLM instance in the KV cache transfer. Typical value:
0 for prefill instance, 1 for decode instance.
Currently only 1P1D is supported."""
kv_parallel_size: int = 1
"""The number of parallel instances for KV cache transfer. For
P2pNcclConnector, this should be 2."""
kv_ip: str = "127.0.0.1"
"""The KV connector ip, used to build distributed connection."""
kv_port: int = 14579
"""The KV connector port, used to build distributed connection."""
kv_connector_extra_config: dict[str, Any] = field(default_factory=dict)
"""any extra config that the connector may need."""
kv_connector_module_path: Optional[str] = None
"""The Python module path to dynamically load the KV connector from.
Only supported in V1."""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
# no factors to consider.
# this config will not affect the computation graph.
factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str
def __post_init__(self) -> None:
if self.engine_id is None:
self.engine_id = str(uuid.uuid4())
if self.kv_role is not None and self.kv_role not in get_args(KVRole):
raise ValueError(f"Unsupported kv_role: {self.kv_role}. "
f"Supported roles are {get_args(KVRole)}")
if self.kv_connector is not None and self.kv_role is None:
raise ValueError("Please specify kv_disagg_role when kv_connector "
f"is set, supported roles are {get_args(KVRole)}")
@property
def is_kv_transfer_instance(self) -> bool:
return self.kv_connector is not None and \
self.kv_role in get_args(KVRole)
@property
def is_kv_producer(self) -> bool:
return self.kv_connector is not None and \
self.kv_role in get_args(KVProducer)
@property
def is_kv_consumer(self) -> bool:
return self.kv_connector is not None and \
self.kv_role in get_args(KVConsumer)
def get_from_extra_config(self, key, default) -> Any:
return self.kv_connector_extra_config.get(key, default)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from dataclasses import field
from typing import TYPE_CHECKING, Any, Optional, Union
from pydantic.dataclasses import dataclass
from vllm.config.utils import config
from vllm.logger import init_logger
if TYPE_CHECKING:
from vllm.model_executor.model_loader import LoadFormats
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
else:
LoadFormats = Any
TensorizerConfig = Any
logger = init_logger(__name__)
@config
@dataclass
class LoadConfig:
"""Configuration for loading the model weights."""
load_format: Union[str, LoadFormats] = "auto"
"""The format of the model weights to load:\n
- "auto" will try to load the weights in the safetensors format and fall
back to the pytorch bin format if safetensors format is not available.\n
- "pt" will load the weights in the pytorch bin format.\n
- "safetensors" will load the weights in the safetensors format.\n
- "npcache" will load the weights in pytorch format and store a numpy cache
to speed up the loading.\n
- "dummy" will initialize the weights with random values, which is mainly
for profiling.\n
- "tensorizer" will use CoreWeave's tensorizer library for fast weight
loading. See the Tensorize vLLM Model script in the Examples section for
more information.\n
- "runai_streamer" will load the Safetensors weights using Run:ai Model
Streamer.\n
- "bitsandbytes" will load the weights using bitsandbytes quantization.\n
- "sharded_state" will load weights from pre-sharded checkpoint files,
supporting efficient loading of tensor-parallel models.\n
- "gguf" will load weights from GGUF format files (details specified in
https://github.com/ggml-org/ggml/blob/master/docs/gguf.md).\n
- "mistral" will load weights from consolidated safetensors files used by
Mistral models.
- Other custom values can be supported via plugins."""
download_dir: Optional[str] = None
"""Directory to download and load the weights, default to the default
cache directory of Hugging Face."""
safetensors_load_strategy: str = "lazy"
"""Specifies the loading strategy for safetensors weights.
- "lazy" (default): Weights are memory-mapped from the file. This enables
on-demand loading and is highly efficient for models on local storage.
- "eager": The entire file is read into CPU memory upfront before loading.
This is recommended for models on network filesystems (e.g., Lustre, NFS)
as it avoids inefficient random reads, significantly speeding up model
initialization. However, it uses more CPU RAM.
"""
model_loader_extra_config: Union[dict, TensorizerConfig] = field(
default_factory=dict)
"""Extra config for model loader. This will be passed to the model loader
corresponding to the chosen load_format."""
device: Optional[str] = None
"""Device to which model weights will be loaded, default to
device_config.device"""
ignore_patterns: Optional[Union[list[str], str]] = None
"""The list of patterns to ignore when loading the model. Default to
"original/**/*" to avoid repeated loading of llama's checkpoints."""
use_tqdm_on_load: bool = True
"""Whether to enable tqdm for showing progress bar when loading model
weights."""
pt_load_map_location: Union[str, dict[str, str]] = "cpu"
"""
pt_load_map_location: the map location for loading pytorch checkpoint, to
support loading checkpoints can only be loaded on certain devices like
"cuda", this is equivalent to {"": "cuda"}. Another supported format is
mapping from different devices like from GPU 1 to GPU 0:
{"cuda:1": "cuda:0"}. Note that when passed from command line, the strings
in dictionary needs to be double quoted for json parsing. For more details,
see original doc for `map_location` in https://pytorch.org/docs/stable/generated/torch.load.html
"""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
# no factors to consider.
# this config will not affect the computation graph.
factors: list[Any] = []
hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str
def __post_init__(self):
self.load_format = self.load_format.lower()
if self.ignore_patterns is not None and len(self.ignore_patterns) > 0:
logger.info(
"Ignoring the following patterns when downloading weights: %s",
self.ignore_patterns)
else:
self.ignore_patterns = ["original/**/*"]
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
import torch
from pydantic import ConfigDict
from pydantic.dataclasses import dataclass
import vllm.envs as envs
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.platforms import current_platform
if TYPE_CHECKING:
from vllm.config import ModelConfig
from vllm.config.cache import CacheConfig
else:
ModelConfig = Any
CacheConfig = Any
logger = init_logger(__name__)
LoRADType = Literal["auto", "float16", "bfloat16"]
@config
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class LoRAConfig:
"""Configuration for LoRA."""
max_lora_rank: int = 16
"""Max LoRA rank."""
max_loras: int = 1
"""Max number of LoRAs in a single batch."""
fully_sharded_loras: bool = False
"""By default, only half of the LoRA computation is sharded with tensor
parallelism. Enabling this will use the fully sharded layers. At high
sequence length, max rank or tensor parallel size, this is likely faster.
"""
max_cpu_loras: Optional[int] = None
"""Maximum number of LoRAs to store in CPU memory. Must be >= than
`max_loras`."""
lora_dtype: Union[torch.dtype, LoRADType] = "auto"
"""Data type for LoRA. If auto, will default to base model dtype."""
lora_extra_vocab_size: int = 256
"""(Deprecated) Maximum size of extra vocabulary that can be present in a
LoRA adapter. Will be removed in v0.12.0."""
lora_vocab_padding_size: ClassVar[int] = current_platform\
.get_lora_vocab_padding_size()
default_mm_loras: Optional[dict[str, str]] = None
"""Dictionary mapping specific modalities to LoRA model paths; this field
is only applicable to multimodal models and should be leveraged when a
model always expects a LoRA to be active when a given modality is present.
Note that currently, if a request provides multiple additional
modalities, each of which have their own LoRA, we do NOT apply
default_mm_loras because we currently only support one lora adapter
per prompt. When run in offline mode, the lora IDs for n modalities
will be automatically assigned to 1-n with the names of the modalities
in alphabetic order."""
bias_enabled: bool = False
"""[DEPRECATED] Enable bias for LoRA adapters. This option will be
removed in v0.12.0."""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
factors: list[Any] = []
factors.append(self.max_lora_rank)
factors.append(self.max_loras)
factors.append(self.fully_sharded_loras)
factors.append(self.lora_dtype)
factors.append(self.lora_extra_vocab_size)
factors.append(self.lora_vocab_padding_size)
factors.append(self.bias_enabled)
hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str
def __post_init__(self):
# Deprecation warning for lora_extra_vocab_size
logger.warning(
"`lora_extra_vocab_size` is deprecated and will be removed "
"in v0.12.0. Additional vocabulary support for "
"LoRA adapters is being phased out.")
# Deprecation warning for enable_lora_bias
if self.bias_enabled:
logger.warning("`enable_lora_bias` is deprecated "
"and will be removed in v0.12.0.")
# Setting the maximum rank to 512 should be able to satisfy the vast
# majority of applications.
possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
possible_lora_extra_vocab_size = (256, 512)
if self.max_lora_rank not in possible_max_ranks:
raise ValueError(
f"max_lora_rank ({self.max_lora_rank}) must be one of "
f"{possible_max_ranks}.")
if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size:
raise ValueError(
f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
f"must be one of {possible_lora_extra_vocab_size}.")
if self.max_loras < 1:
raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")
if self.max_cpu_loras is None:
self.max_cpu_loras = self.max_loras
elif self.max_cpu_loras < self.max_loras:
raise ValueError(
f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
f"max_loras ({self.max_loras})")
def verify_with_cache_config(self, cache_config: CacheConfig):
if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
raise ValueError(
"V0 LoRA does not support CPU offload, please use V1.")
def verify_with_model_config(self, model_config: ModelConfig):
if self.lora_dtype in (None, "auto"):
self.lora_dtype = model_config.dtype
elif isinstance(self.lora_dtype, str):
self.lora_dtype = getattr(torch, self.lora_dtype)
......@@ -87,7 +87,7 @@ class ParallelConfig:
data_parallel_external_lb: bool = False
"""Whether to use "external" DP LB mode. Applies only to online serving
and when data_parallel_size > 0. This is useful for a "one-pod-per-rank"
wide-EP setup in Kuberentes. Set implicitly when --data-parallel-rank
wide-EP setup in Kubernetes. Set implicitly when --data-parallel-rank
is provided explicitly to vllm serve."""
data_parallel_hybrid_lb: bool = False
"""Whether to use "hybrid" DP LB mode. Applies only to online serving
......@@ -170,6 +170,11 @@ class ParallelConfig:
Set to be private as it's not intended to be configured by users.
"""
decode_context_parallel_size: int = 1
"""Number of decode context parallel groups, because the world size does
not change by dcp, it simply reuse the GPUs of TP group, and tp_size
needs to be divisible by dcp_size."""
@property
def world_size_across_dp(self) -> int:
"""world_size_across_dp is TPxPPxDP, it is the size of the world
......@@ -363,8 +368,10 @@ class ParallelConfig:
else:
if self.eplb_config.num_redundant_experts != 0:
raise ValueError(
"num_redundant_experts should be used with EPLB."
f"{self.eplb_config.num_redundant_experts}.")
"num_redundant_experts is set to "
f"{self.eplb_config.num_redundant_experts} but EPLB is not "
"enabled. Either enable EPLB or unset "
"num_redundant_experts.")
if self.distributed_executor_backend is None and self.world_size > 1:
# We use multiprocessing by default if world_size fits on the
# current node and we aren't in a ray placement group.
......@@ -372,10 +379,7 @@ class ParallelConfig:
from vllm.executor import ray_utils
backend: DistributedExecutorBackend = "mp"
ray_found = ray_utils.ray_is_available()
if current_platform.is_neuron():
# neuron uses single process to control multiple devices
backend = "uni"
elif current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
if current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
backend = "uni"
elif (current_platform.is_cuda()
and cuda_device_count_stateless() < self.world_size):
......
......@@ -11,7 +11,8 @@ from typing import Callable, Deque, Dict, Iterable, List, Optional
from typing import Sequence as GenericSequence
from typing import Set, Tuple, Union
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
from vllm.config import CacheConfig, SchedulerConfig
from vllm.config.lora import LoRAConfig
from vllm.core.interfaces import AllocStatus, BlockSpaceManager
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import TYPE_CHECKING, Any
from typing import Any
import torch
import torch.distributed as dist
......@@ -13,11 +13,6 @@ from .base_device_communicator import All2AllManagerBase, Cache
logger = init_logger(__name__)
if TYPE_CHECKING:
from vllm.model_executor.layers.fused_moe.layer import FusedMoE
else:
FusedMoE = None
class NaiveAll2AllManager(All2AllManagerBase):
"""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment