Commit 705f6a35 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.5.2' into v0.5.2-dtk24.04.1

parents af837396 4cf256ae
...@@ -9,7 +9,9 @@ import json ...@@ -9,7 +9,9 @@ import json
import ssl import ssl
from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
from vllm.entrypoints.openai.serving_engine import LoRAModulePath from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
PromptAdapterPath)
from vllm.utils import FlexibleArgumentParser
class LoRAParserAction(argparse.Action): class LoRAParserAction(argparse.Action):
...@@ -22,9 +24,17 @@ class LoRAParserAction(argparse.Action): ...@@ -22,9 +24,17 @@ class LoRAParserAction(argparse.Action):
setattr(namespace, self.dest, lora_list) setattr(namespace, self.dest, lora_list)
def make_arg_parser(): class PromptAdapterParserAction(argparse.Action):
parser = argparse.ArgumentParser(
description="vLLM OpenAI-Compatible RESTful API server.") def __call__(self, parser, namespace, values, option_string=None):
adapter_list = []
for item in values:
name, path = item.split('=')
adapter_list.append(PromptAdapterPath(name, path))
setattr(namespace, self.dest, adapter_list)
def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
parser.add_argument("--host", parser.add_argument("--host",
type=nullable_str, type=nullable_str,
default=None, default=None,
...@@ -64,6 +74,14 @@ def make_arg_parser(): ...@@ -64,6 +74,14 @@ def make_arg_parser():
action=LoRAParserAction, action=LoRAParserAction,
help="LoRA module configurations in the format name=path. " help="LoRA module configurations in the format name=path. "
"Multiple modules can be specified.") "Multiple modules can be specified.")
parser.add_argument(
"--prompt-adapters",
type=nullable_str,
default=None,
nargs='+',
action=PromptAdapterParserAction,
help="Prompt adapter configurations in the format name=path. "
"Multiple adapters can be specified.")
parser.add_argument("--chat-template", parser.add_argument("--chat-template",
type=nullable_str, type=nullable_str,
default=None, default=None,
...@@ -113,3 +131,9 @@ def make_arg_parser(): ...@@ -113,3 +131,9 @@ def make_arg_parser():
parser = AsyncEngineArgs.add_cli_args(parser) parser = AsyncEngineArgs.add_cli_args(parser)
return parser return parser
def create_parser_for_docs() -> FlexibleArgumentParser:
parser_for_docs = FlexibleArgumentParser(
prog="-m vllm.entrypoints.openai.api_server")
return make_arg_parser(parser_for_docs)
...@@ -103,7 +103,8 @@ class ResponseFormat(OpenAIBaseModel): ...@@ -103,7 +103,8 @@ class ResponseFormat(OpenAIBaseModel):
class StreamOptions(OpenAIBaseModel): class StreamOptions(OpenAIBaseModel):
include_usage: Optional[bool] include_usage: Optional[bool] = True
continuous_usage_stats: Optional[bool] = True
class FunctionDefinition(OpenAIBaseModel): class FunctionDefinition(OpenAIBaseModel):
...@@ -190,6 +191,27 @@ class ChatCompletionRequest(OpenAIBaseModel): ...@@ -190,6 +191,27 @@ class ChatCompletionRequest(OpenAIBaseModel):
"special tokens so this should be set to False (as is the " "special tokens so this should be set to False (as is the "
"default)."), "default)."),
) )
documents: Optional[List[Dict[str, str]]] = Field(
default=None,
description=
("A list of dicts representing documents that will be accessible to "
"the model if it is performing RAG (retrieval-augmented generation)."
" If the template does not support RAG, this argument will have no "
"effect. We recommend that each document should be a dict containing "
"\"title\" and \"text\" keys."),
)
chat_template: Optional[str] = Field(
default=None,
description=(
"A Jinja template to use for this conversion. "
"If this is not passed, the model's default chat template will be "
"used instead."),
)
chat_template_kwargs: Optional[Dict[str, Any]] = Field(
default=None,
description=("Additional kwargs to pass to the template renderer. "
"Will be accessible by the chat template."),
)
include_stop_str_in_output: Optional[bool] = Field( include_stop_str_in_output: Optional[bool] = Field(
default=False, default=False,
description=( description=(
...@@ -234,15 +256,22 @@ class ChatCompletionRequest(OpenAIBaseModel): ...@@ -234,15 +256,22 @@ class ChatCompletionRequest(OpenAIBaseModel):
logits_processors = None logits_processors = None
if self.logit_bias: if self.logit_bias:
logit_bias: Dict[int, float] = {}
try:
for token_id, bias in self.logit_bias.items():
# Convert token_id to integer before we add to LLMEngine
# Clamp the bias between -100 and 100 per OpenAI API spec
logit_bias[int(token_id)] = min(100, max(-100, bias))
except ValueError as exc:
raise ValueError(f"Found token_id `{token_id}` in logit_bias "
f"but token_id must be an integer or string "
f"representing an integer") from exc
def logit_bias_logits_processor( def logit_bias_logits_processor(
token_ids: List[int], token_ids: List[int],
logits: torch.Tensor) -> torch.Tensor: logits: torch.Tensor) -> torch.Tensor:
assert self.logit_bias is not None for token_id, bias in logit_bias.items():
for token_id, bias in self.logit_bias.items(): logits[token_id] += bias
# Clamp the bias between -100 and 100 per OpenAI API spec
bias = min(100, max(-100, bias))
logits[int(token_id)] += bias
return logits return logits
logits_processors = [logit_bias_logits_processor] logits_processors = [logit_bias_logits_processor]
...@@ -419,15 +448,22 @@ class CompletionRequest(OpenAIBaseModel): ...@@ -419,15 +448,22 @@ class CompletionRequest(OpenAIBaseModel):
logits_processors = None logits_processors = None
if self.logit_bias: if self.logit_bias:
logit_bias: Dict[int, float] = {}
try:
for token_id, bias in self.logit_bias.items():
# Convert token_id to integer
# Clamp the bias between -100 and 100 per OpenAI API spec
logit_bias[int(token_id)] = min(100, max(-100, bias))
except ValueError as exc:
raise ValueError(f"Found token_id `{token_id}` in logit_bias "
f"but token_id must be an integer or string "
f"representing an integer") from exc
def logit_bias_logits_processor( def logit_bias_logits_processor(
token_ids: List[int], token_ids: List[int],
logits: torch.Tensor) -> torch.Tensor: logits: torch.Tensor) -> torch.Tensor:
assert self.logit_bias is not None for token_id, bias in logit_bias.items():
for token_id, bias in self.logit_bias.items(): logits[token_id] += bias
# Clamp the bias between -100 and 100 per OpenAI API spec
bias = min(100, max(-100, bias))
logits[int(token_id)] += bias
return logits return logits
logits_processors = [logit_bias_logits_processor] logits_processors = [logit_bias_logits_processor]
...@@ -566,7 +602,7 @@ class CompletionStreamResponse(OpenAIBaseModel): ...@@ -566,7 +602,7 @@ class CompletionStreamResponse(OpenAIBaseModel):
class EmbeddingResponseData(BaseModel): class EmbeddingResponseData(BaseModel):
index: int index: int
object: str = "embedding" object: str = "embedding"
embedding: List[float] embedding: Union[List[float], str]
class EmbeddingResponse(BaseModel): class EmbeddingResponse(BaseModel):
...@@ -672,6 +708,17 @@ class BatchRequestInput(OpenAIBaseModel): ...@@ -672,6 +708,17 @@ class BatchRequestInput(OpenAIBaseModel):
body: Union[ChatCompletionRequest, ] body: Union[ChatCompletionRequest, ]
class BatchResponseData(OpenAIBaseModel):
# HTTP status code of the response.
status_code: int = 200
# An unique identifier for the API request.
request_id: str
# The body of the response.
body: Optional[ChatCompletionResponse] = None
class BatchRequestOutput(OpenAIBaseModel): class BatchRequestOutput(OpenAIBaseModel):
""" """
The per-line object of the batch output and error files The per-line object of the batch output and error files
...@@ -683,8 +730,29 @@ class BatchRequestOutput(OpenAIBaseModel): ...@@ -683,8 +730,29 @@ class BatchRequestOutput(OpenAIBaseModel):
# inputs. # inputs.
custom_id: str custom_id: str
response: Optional[ChatCompletionResponse] response: Optional[BatchResponseData]
# For requests that failed with a non-HTTP error, this will contain more # For requests that failed with a non-HTTP error, this will contain more
# information on the cause of the failure. # information on the cause of the failure.
error: Optional[Any] error: Optional[Any]
class TokenizeRequest(OpenAIBaseModel):
model: str
prompt: str
add_special_tokens: bool = Field(default=True)
class TokenizeResponse(OpenAIBaseModel):
tokens: List[int]
count: int
max_model_len: int
class DetokenizeRequest(OpenAIBaseModel):
model: str
tokens: List[int]
class DetokenizeResponse(OpenAIBaseModel):
prompt: str
import argparse
import asyncio import asyncio
import sys
from io import StringIO from io import StringIO
from typing import Awaitable, List
import aiohttp import aiohttp
import vllm
from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.protocol import (BatchRequestInput, from vllm.entrypoints.openai.protocol import (BatchRequestInput,
BatchRequestOutput, BatchRequestOutput,
ChatCompletionResponse) BatchResponseData,
ChatCompletionResponse,
ErrorResponse)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.usage.usage_lib import UsageContext from vllm.usage.usage_lib import UsageContext
from vllm.utils import random_uuid from vllm.utils import FlexibleArgumentParser, random_uuid
from vllm.version import __version__ as VLLM_VERSION
logger = init_logger(__name__) logger = init_logger(__name__)
def parse_args(): def parse_args():
parser = argparse.ArgumentParser( parser = FlexibleArgumentParser(
description="vLLM OpenAI-Compatible batch runner.") description="vLLM OpenAI-Compatible batch runner.")
parser.add_argument( parser.add_argument(
"-i", "-i",
...@@ -55,7 +56,7 @@ async def read_file(path_or_url: str) -> str: ...@@ -55,7 +56,7 @@ async def read_file(path_or_url: str) -> str:
session.get(path_or_url) as resp: session.get(path_or_url) as resp:
return await resp.text() return await resp.text()
else: else:
with open(path_or_url, "r") as f: with open(path_or_url, "r", encoding="utf-8") as f:
return f.read() return f.read()
...@@ -68,7 +69,7 @@ async def write_file(path_or_url: str, data: str) -> None: ...@@ -68,7 +69,7 @@ async def write_file(path_or_url: str, data: str) -> None:
# We should make this async, but as long as this is always run as a # We should make this async, but as long as this is always run as a
# standalone program, blocking the event loop won't effect performance # standalone program, blocking the event loop won't effect performance
# in this particular case. # in this particular case.
with open(path_or_url, "w") as f: with open(path_or_url, "w", encoding="utf-8") as f:
f.write(data) f.write(data)
...@@ -76,20 +77,27 @@ async def run_request(chat_serving: OpenAIServingChat, ...@@ -76,20 +77,27 @@ async def run_request(chat_serving: OpenAIServingChat,
request: BatchRequestInput) -> BatchRequestOutput: request: BatchRequestInput) -> BatchRequestOutput:
chat_request = request.body chat_request = request.body
chat_response = await chat_serving.create_chat_completion(chat_request) chat_response = await chat_serving.create_chat_completion(chat_request)
if isinstance(chat_response, ChatCompletionResponse): if isinstance(chat_response, ChatCompletionResponse):
batch_output = BatchRequestOutput( batch_output = BatchRequestOutput(
id=f"vllm-{random_uuid()}", id=f"vllm-{random_uuid()}",
custom_id=request.custom_id, custom_id=request.custom_id,
response=chat_response, response=BatchResponseData(
body=chat_response, request_id=f"vllm-batch-{random_uuid()}"),
error=None, error=None,
) )
else: elif isinstance(chat_response, ErrorResponse):
batch_output = BatchRequestOutput( batch_output = BatchRequestOutput(
id=f"vllm-{random_uuid()}", id=f"vllm-{random_uuid()}",
custom_id=request.custom_id, custom_id=request.custom_id,
response=None, response=BatchResponseData(
status_code=chat_response.code,
request_id=f"vllm-batch-{random_uuid()}"),
error=chat_response, error=chat_response,
) )
else:
raise ValueError("Request must not be sent in stream mode")
return batch_output return batch_output
...@@ -114,7 +122,7 @@ async def main(args): ...@@ -114,7 +122,7 @@ async def main(args):
) )
# Submit all requests in the file to the engine "concurrently". # Submit all requests in the file to the engine "concurrently".
response_futures = [] response_futures: List[Awaitable[BatchRequestOutput]] = []
for request_json in (await read_file(args.input_file)).strip().split("\n"): for request_json in (await read_file(args.input_file)).strip().split("\n"):
request = BatchRequestInput.model_validate_json(request_json) request = BatchRequestInput.model_validate_json(request_json)
response_futures.append(run_request(openai_serving_chat, request)) response_futures.append(run_request(openai_serving_chat, request))
...@@ -128,14 +136,11 @@ async def main(args): ...@@ -128,14 +136,11 @@ async def main(args):
output_buffer.seek(0) output_buffer.seek(0)
await write_file(args.output_file, output_buffer.read().strip()) await write_file(args.output_file, output_buffer.read().strip())
# Temporary workaround for https://github.com/vllm-project/vllm/issues/4789
sys.exit(0)
if __name__ == "__main__": if __name__ == "__main__":
args = parse_args() args = parse_args()
logger.info("vLLM API server version %s", vllm.__version__) logger.info("vLLM API server version %s", VLLM_VERSION)
logger.info("args: %s", args) logger.info("args: %s", args)
asyncio.run(main(args)) asyncio.run(main(args))
import codecs import codecs
import time import time
from dataclasses import dataclass, field from dataclasses import dataclass, field
from functools import cached_property
from typing import (AsyncGenerator, AsyncIterator, Awaitable, Dict, Iterable, from typing import (AsyncGenerator, AsyncIterator, Awaitable, Dict, Iterable,
List, Optional) List, Optional)
from typing import Sequence as GenericSequence from typing import Sequence as GenericSequence
...@@ -10,7 +11,7 @@ from fastapi import Request ...@@ -10,7 +11,7 @@ from fastapi import Request
from openai.types.chat import (ChatCompletionContentPartImageParam, from openai.types.chat import (ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam) ChatCompletionContentPartTextParam)
from vllm.config import ModelConfig, VisionLanguageConfig from vllm.config import ModelConfig
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.protocol import ( from vllm.entrypoints.openai.protocol import (
ChatCompletionContentPartParam, ChatCompletionLogProb, ChatCompletionContentPartParam, ChatCompletionLogProb,
...@@ -26,11 +27,12 @@ from vllm.inputs import PromptInputs ...@@ -26,11 +27,12 @@ from vllm.inputs import PromptInputs
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.guided_decoding import ( from vllm.model_executor.guided_decoding import (
get_guided_decoding_logits_processor) get_guided_decoding_logits_processor)
from vllm.multimodal.image import ImagePixelData from vllm.multimodal import MultiModalDataDict
from vllm.multimodal.utils import (async_get_and_parse_image, from vllm.multimodal.utils import async_get_and_parse_image
get_full_image_text_prompt)
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.sequence import Logprob from vllm.sequence import Logprob
from vllm.tracing import (contains_trace_headers, extract_trace_headers,
log_tracing_disabled_warning)
from vllm.utils import random_uuid from vllm.utils import random_uuid
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -45,7 +47,7 @@ class ConversationMessage(TypedDict): ...@@ -45,7 +47,7 @@ class ConversationMessage(TypedDict):
@dataclass(frozen=True) @dataclass(frozen=True)
class ChatMessageParseResult: class ChatMessageParseResult:
messages: List[ConversationMessage] messages: List[ConversationMessage]
image_futures: List[Awaitable[ImagePixelData]] = field( mm_futures: List[Awaitable[MultiModalDataDict]] = field(
default_factory=list) default_factory=list)
...@@ -95,82 +97,85 @@ class OpenAIServingChat(OpenAIServing): ...@@ -95,82 +97,85 @@ class OpenAIServingChat(OpenAIServing):
logger.warning( logger.warning(
"No chat template provided. Chat API will not work.") "No chat template provided. Chat API will not work.")
@cached_property
def image_token_str(self) -> Optional[str]:
# TODO: Let user specify how to insert image tokens into prompt
# (similar to chat template)
model_type = self.model_config.hf_config.model_type
if model_type == "phi3_v":
# Workaround since this token is not defined in the tokenizer
return "<|image_1|>"
if model_type in ("blip-2", "chatglm", "fuyu", "minicpmv",
"paligemma"):
# These models do not use image tokens in the prompt
return None
if model_type.startswith("llava"):
return self.tokenizer.decode(
self.model_config.hf_config.image_token_index)
else:
raise TypeError("Unknown model type: {model_type}")
# TODO: Let user specify how to insert image tokens into prompt
# (similar to chat template)
def _get_full_image_text_prompt(self, image_token_str: str,
text_prompt: str) -> str:
"""Combine image and text prompts for vision language model"""
# NOTE: For now we assume all model architectures use the same
# image + text prompt format. This may change in the future.
return f"{image_token_str}\n{text_prompt}"
def _parse_chat_message_content_parts( def _parse_chat_message_content_parts(
self, self,
role: str, role: str,
parts: Iterable[ChatCompletionContentPartParam], parts: Iterable[ChatCompletionContentPartParam],
) -> ChatMessageParseResult: ) -> ChatMessageParseResult:
texts: List[str] = [] texts: List[str] = []
image_futures: List[Awaitable[ImagePixelData]] = [] mm_futures: List[Awaitable[MultiModalDataDict]] = []
vlm_config: Optional[VisionLanguageConfig] = getattr(
self.engine.engine, "vision_language_config", None)
model_config = getattr(self.engine.engine, "model_config", None)
for part in parts: for part in parts:
part_type = part["type"] part_type = part["type"]
if part_type == "text": if part_type == "text":
text = cast(ChatCompletionContentPartTextParam, part)["text"] text = cast(ChatCompletionContentPartTextParam, part)["text"]
texts.append(text) texts.append(text)
elif part_type == "image_url": elif part_type == "image_url":
if vlm_config is None: if len(mm_futures) > 0:
raise ValueError(
"'image_url' input is not supported as the loaded "
"model is not multimodal.")
elif len(image_futures) == 0:
assert self.tokenizer is not None
image_url = cast(ChatCompletionContentPartImageParam,
part)["image_url"]
if image_url.get("detail", "auto") != "auto":
logger.warning(
"'image_url.detail' is currently not supported and "
"will be ignored.")
image_future = async_get_and_parse_image(image_url["url"])
image_futures.append(image_future)
else:
raise NotImplementedError( raise NotImplementedError(
"Multiple 'image_url' input is currently not supported." "Multiple 'image_url' input is currently not supported."
) )
image_url = cast(ChatCompletionContentPartImageParam,
part)["image_url"]
if image_url.get("detail", "auto") != "auto":
logger.warning(
"'image_url.detail' is currently not supported and "
"will be ignored.")
image_future = async_get_and_parse_image(image_url["url"])
mm_futures.append(image_future)
else: else:
raise NotImplementedError(f"Unknown part type: {part_type}") raise NotImplementedError(f"Unknown part type: {part_type}")
text_prompt = "\n".join(texts) text_prompt = "\n".join(texts)
if vlm_config is not None and len(image_futures): if mm_futures:
image_token_str = self.image_token_str
(image_token_prompt, if image_token_str is not None:
image_token_str) = vlm_config.get_image_token_text(self.tokenizer) if image_token_str in text_prompt:
logger.warning(
# NOTE: If image token string (e.g, <image>) is already present "Detected image token string in the text prompt. "
# in the text prompt, we assume it follows the same format required "Skipping prompt formatting.")
# by the engine. else:
if image_token_str in text_prompt: text_prompt = self._get_full_image_text_prompt(
logger.warning( image_token_str=image_token_str,
"Detected image token string in the text prompt. " text_prompt=text_prompt,
"Skipping prompt formatting.") )
messages = [
ConversationMessage(role=role, content=text_prompt)
]
else: messages = [ConversationMessage(role=role, content=text_prompt)]
full_prompt = get_full_image_text_prompt(
image_prompt=image_token_prompt,
text_prompt=text_prompt,
config=model_config)
messages = [
ConversationMessage(role=role, content=full_prompt)
]
else:
messages = [ConversationMessage(role=role, content=text_prompt)]
return ChatMessageParseResult(messages=messages, return ChatMessageParseResult(messages=messages, mm_futures=mm_futures)
image_futures=image_futures)
def _parse_chat_message_content( def _parse_chat_message_content(
self, self,
...@@ -180,10 +185,10 @@ class OpenAIServingChat(OpenAIServing): ...@@ -180,10 +185,10 @@ class OpenAIServingChat(OpenAIServing):
content = message.get("content") content = message.get("content")
if content is None: if content is None:
return ChatMessageParseResult(messages=[], image_futures=[]) return ChatMessageParseResult(messages=[], mm_futures=[])
if isinstance(content, str): if isinstance(content, str):
messages = [ConversationMessage(role=role, content=content)] messages = [ConversationMessage(role=role, content=content)]
return ChatMessageParseResult(messages=messages, image_futures=[]) return ChatMessageParseResult(messages=messages, mm_futures=[])
return self._parse_chat_message_content_parts(role, content) return self._parse_chat_message_content_parts(role, content)
...@@ -208,32 +213,41 @@ class OpenAIServingChat(OpenAIServing): ...@@ -208,32 +213,41 @@ class OpenAIServingChat(OpenAIServing):
try: try:
conversation: List[ConversationMessage] = [] conversation: List[ConversationMessage] = []
image_futures: List[Awaitable[ImagePixelData]] = [] mm_futures: List[Awaitable[MultiModalDataDict]] = []
for msg in request.messages: for msg in request.messages:
chat_parsed_result = self._parse_chat_message_content(msg) chat_parsed_result = self._parse_chat_message_content(msg)
conversation.extend(chat_parsed_result.messages) conversation.extend(chat_parsed_result.messages)
image_futures.extend(chat_parsed_result.image_futures) mm_futures.extend(chat_parsed_result.mm_futures)
tool_dicts = None if request.tools is None else [
tool.model_dump() for tool in request.tools
]
prompt = self.tokenizer.apply_chat_template( prompt = self.tokenizer.apply_chat_template(
conversation=conversation, conversation=conversation,
tokenize=False, tokenize=False,
add_generation_prompt=request.add_generation_prompt, add_generation_prompt=request.add_generation_prompt,
tools=tool_dicts,
documents=request.documents,
chat_template=request.chat_template,
**(request.chat_template_kwargs or {}),
) )
except Exception as e: except Exception as e:
logger.error("Error in applying chat template from request: %s", e) logger.error("Error in applying chat template from request: %s", e)
return self.create_error_response(str(e)) return self.create_error_response(str(e))
# Fetch image data mm_data: Optional[MultiModalDataDict] = None
image_data: Optional[ImagePixelData] = None
try: try:
if len(image_futures): if len(mm_futures):
# since we support only single image currently # since we support only single mm data currently
assert len(image_futures) == 1 assert len(
image_data = await image_futures[0] mm_futures
) == 1, "Multiple 'image_url' input is currently not supported."
mm_data = await mm_futures[0]
except Exception as e: except Exception as e:
logger.error("Error in loading image data: %s", e) logger.error("Error in loading multi-modal data: %s", e)
return self.create_error_response(str(e)) return self.create_error_response(str(e))
request_id = f"cmpl-{random_uuid()}" request_id = f"cmpl-{random_uuid()}"
...@@ -244,7 +258,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -244,7 +258,7 @@ class OpenAIServingChat(OpenAIServing):
prompt=prompt, prompt=prompt,
add_special_tokens=request.add_special_tokens) add_special_tokens=request.add_special_tokens)
sampling_params = request.to_sampling_params() sampling_params = request.to_sampling_params()
lora_request = self._maybe_get_lora(request) _, lora_request = self._maybe_get_adapter(request)
decoding_config = await self.engine.get_decoding_config() decoding_config = await self.engine.get_decoding_config()
guided_decoding_backend = request.guided_decoding_backend \ guided_decoding_backend = request.guided_decoding_backend \
or decoding_config.guided_decoding_backend or decoding_config.guided_decoding_backend
...@@ -264,14 +278,23 @@ class OpenAIServingChat(OpenAIServing): ...@@ -264,14 +278,23 @@ class OpenAIServingChat(OpenAIServing):
"prompt": prompt_text, "prompt": prompt_text,
"prompt_token_ids": prompt_ids, "prompt_token_ids": prompt_ids,
} }
if image_data is not None: if mm_data:
inputs["multi_modal_data"] = image_data inputs["multi_modal_data"] = mm_data
is_tracing_enabled = await self.engine.is_tracing_enabled()
trace_headers = None
if is_tracing_enabled and raw_request:
trace_headers = extract_trace_headers(raw_request.headers)
if not is_tracing_enabled and raw_request and contains_trace_headers(
raw_request.headers):
log_tracing_disabled_warning()
result_generator = self.engine.generate( result_generator = self.engine.generate(
inputs, inputs,
sampling_params, sampling_params,
request_id, request_id,
lora_request, lora_request,
trace_headers=trace_headers,
) )
# Streaming response # Streaming response
if request.stream: if request.stream:
...@@ -487,7 +510,7 @@ class OpenAIServingChat(OpenAIServing): ...@@ -487,7 +510,7 @@ class OpenAIServingChat(OpenAIServing):
final_res = res final_res = res
assert final_res is not None assert final_res is not None
choices = [] choices: List[ChatCompletionResponseChoice] = []
role = self.get_chat_request_role(request) role = self.get_chat_request_role(request)
for output in final_res.outputs: for output in final_res.outputs:
......
...@@ -16,14 +16,21 @@ from vllm.entrypoints.openai.protocol import (CompletionLogProbs, ...@@ -16,14 +16,21 @@ from vllm.entrypoints.openai.protocol import (CompletionLogProbs,
CompletionResponseChoice, CompletionResponseChoice,
CompletionResponseStreamChoice, CompletionResponseStreamChoice,
CompletionStreamResponse, CompletionStreamResponse,
UsageInfo) DetokenizeRequest,
DetokenizeResponse,
TokenizeRequest,
TokenizeResponse, UsageInfo)
# yapf: enable
from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
OpenAIServing) OpenAIServing,
PromptAdapterPath)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.guided_decoding import ( from vllm.model_executor.guided_decoding import (
get_guided_decoding_logits_processor) get_guided_decoding_logits_processor)
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.sequence import Logprob from vllm.sequence import Logprob
from vllm.tracing import (contains_trace_headers, extract_trace_headers,
log_tracing_disabled_warning)
from vllm.utils import merge_async_iterators, random_uuid from vllm.utils import merge_async_iterators, random_uuid
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -61,11 +68,13 @@ class OpenAIServingCompletion(OpenAIServing): ...@@ -61,11 +68,13 @@ class OpenAIServingCompletion(OpenAIServing):
def __init__(self, engine: AsyncLLMEngine, model_config: ModelConfig, def __init__(self, engine: AsyncLLMEngine, model_config: ModelConfig,
served_model_names: List[str], served_model_names: List[str],
lora_modules: Optional[List[LoRAModulePath]]): lora_modules: Optional[List[LoRAModulePath]],
prompt_adapters: Optional[List[PromptAdapterPath]]):
super().__init__(engine=engine, super().__init__(engine=engine,
model_config=model_config, model_config=model_config,
served_model_names=served_model_names, served_model_names=served_model_names,
lora_modules=lora_modules) lora_modules=lora_modules,
prompt_adapters=prompt_adapters)
async def create_completion(self, request: CompletionRequest, async def create_completion(self, request: CompletionRequest,
raw_request: Request): raw_request: Request):
...@@ -95,7 +104,12 @@ class OpenAIServingCompletion(OpenAIServing): ...@@ -95,7 +104,12 @@ class OpenAIServingCompletion(OpenAIServing):
generators: List[AsyncIterator[RequestOutput]] = [] generators: List[AsyncIterator[RequestOutput]] = []
try: try:
sampling_params = request.to_sampling_params() sampling_params = request.to_sampling_params()
lora_request = self._maybe_get_lora(request) adapter_type, adapter_request = self._maybe_get_adapter(request)
lora_request, prompt_adapter_request = None, None
if adapter_type == 'LoRA':
lora_request, prompt_adapter_request = adapter_request, None
elif adapter_type == 'PromptAdapter':
lora_request, prompt_adapter_request = None, adapter_request
decoding_config = await self.engine.get_decoding_config() decoding_config = await self.engine.get_decoding_config()
guided_decoding_backend = request.guided_decoding_backend \ guided_decoding_backend = request.guided_decoding_backend \
or decoding_config.guided_decoding_backend or decoding_config.guided_decoding_backend
...@@ -125,6 +139,14 @@ class OpenAIServingCompletion(OpenAIServing): ...@@ -125,6 +139,14 @@ class OpenAIServingCompletion(OpenAIServing):
truncate_prompt_tokens) truncate_prompt_tokens)
prompt_ids, prompt_text = prompt_formats prompt_ids, prompt_text = prompt_formats
is_tracing_enabled = await self.engine.is_tracing_enabled()
trace_headers = None
if is_tracing_enabled:
trace_headers = extract_trace_headers(raw_request.headers)
if not is_tracing_enabled and contains_trace_headers(
raw_request.headers):
log_tracing_disabled_warning()
generator = self.engine.generate( generator = self.engine.generate(
{ {
"prompt": prompt_text, "prompt": prompt_text,
...@@ -133,6 +155,8 @@ class OpenAIServingCompletion(OpenAIServing): ...@@ -133,6 +155,8 @@ class OpenAIServingCompletion(OpenAIServing):
sampling_params, sampling_params,
f"{request_id}-{i}", f"{request_id}-{i}",
lora_request=lora_request, lora_request=lora_request,
prompt_adapter_request=prompt_adapter_request,
trace_headers=trace_headers,
) )
generators.append(generator) generators.append(generator)
...@@ -256,16 +280,6 @@ class OpenAIServingCompletion(OpenAIServing): ...@@ -256,16 +280,6 @@ class OpenAIServingCompletion(OpenAIServing):
previous_num_tokens[i] = len(output.token_ids) previous_num_tokens[i] = len(output.token_ids)
finish_reason = output.finish_reason finish_reason = output.finish_reason
stop_reason = output.stop_reason stop_reason = output.stop_reason
if output.finish_reason is not None: # return final usage
prompt_tokens = len(res.prompt_token_ids)
completion_tokens = len(output.token_ids)
final_usage = UsageInfo(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
)
else:
final_usage = None
chunk = CompletionStreamResponse( chunk = CompletionStreamResponse(
id=request_id, id=request_id,
...@@ -282,9 +296,21 @@ class OpenAIServingCompletion(OpenAIServing): ...@@ -282,9 +296,21 @@ class OpenAIServingCompletion(OpenAIServing):
]) ])
if (request.stream_options if (request.stream_options
and request.stream_options.include_usage): and request.stream_options.include_usage):
chunk.usage = None if (request.stream_options.continuous_usage_stats
or output.finish_reason is not None):
prompt_tokens = len(res.prompt_token_ids)
completion_tokens = len(output.token_ids)
usage = UsageInfo(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
)
if request.stream_options.continuous_usage_stats:
chunk.usage = usage
else:
chunk.usage = None
response_json = chunk.model_dump_json(exclude_unset=True) response_json = chunk.model_dump_json(exclude_unset=False)
yield f"data: {response_json}\n\n" yield f"data: {response_json}\n\n"
if (request.stream_options if (request.stream_options
...@@ -294,10 +320,10 @@ class OpenAIServingCompletion(OpenAIServing): ...@@ -294,10 +320,10 @@ class OpenAIServingCompletion(OpenAIServing):
created=created_time, created=created_time,
model=model_name, model=model_name,
choices=[], choices=[],
usage=final_usage, usage=usage,
) )
final_usage_data = (final_usage_chunk.model_dump_json( final_usage_data = (final_usage_chunk.model_dump_json(
exclude_unset=True, exclude_none=True)) exclude_unset=False, exclude_none=True))
yield f"data: {final_usage_data}\n\n" yield f"data: {final_usage_data}\n\n"
except ValueError as e: except ValueError as e:
...@@ -330,7 +356,7 @@ class OpenAIServingCompletion(OpenAIServing): ...@@ -330,7 +356,7 @@ class OpenAIServingCompletion(OpenAIServing):
out_logprobs = prompt_logprobs out_logprobs = prompt_logprobs
output_text = prompt_text output_text = prompt_text
elif request.echo and request.max_tokens > 0: elif request.echo and request.max_tokens > 0:
token_ids = prompt_token_ids + output.token_ids token_ids = prompt_token_ids + list(output.token_ids)
out_logprobs = (prompt_logprobs + output.logprobs out_logprobs = (prompt_logprobs + output.logprobs
if request.logprobs is not None else None) if request.logprobs is not None else None)
output_text = prompt_text + output.text output_text = prompt_text + output.text
...@@ -431,3 +457,29 @@ class OpenAIServingCompletion(OpenAIServing): ...@@ -431,3 +457,29 @@ class OpenAIServingCompletion(OpenAIServing):
tokens=out_tokens, tokens=out_tokens,
top_logprobs=out_top_logprobs, top_logprobs=out_top_logprobs,
) )
async def create_tokenize(self,
request: TokenizeRequest) -> TokenizeResponse:
error_check_ret = await self._check_model(request)
if error_check_ret is not None:
return error_check_ret
(input_ids, input_text) = self._validate_prompt_and_tokenize(
request,
prompt=request.prompt,
add_special_tokens=request.add_special_tokens)
return TokenizeResponse(tokens=input_ids,
count=len(input_ids),
max_model_len=self.max_model_len)
async def create_detokenize(
self, request: DetokenizeRequest) -> DetokenizeResponse:
error_check_ret = await self._check_model(request)
if error_check_ret is not None:
return error_check_ret
(input_ids, input_text) = self._validate_prompt_and_tokenize(
request, prompt_ids=request.tokens)
return DetokenizeResponse(prompt=input_text)
import base64
import time import time
from typing import AsyncIterator, List, Optional, Tuple from typing import AsyncIterator, List, Optional, Tuple
import numpy as np
from fastapi import Request from fastapi import Request
from vllm.config import ModelConfig from vllm.config import ModelConfig
...@@ -20,19 +22,18 @@ TypeTokenIDs = List[int] ...@@ -20,19 +22,18 @@ TypeTokenIDs = List[int]
def request_output_to_embedding_response( def request_output_to_embedding_response(
final_res_batch: List[EmbeddingRequestOutput], final_res_batch: List[EmbeddingRequestOutput], request_id: str,
request_id: str, created_time: int, model_name: str,
created_time: int, encoding_format: str) -> EmbeddingResponse:
model_name: str, data: List[EmbeddingResponseData] = []
) -> EmbeddingResponse:
data = []
num_prompt_tokens = 0 num_prompt_tokens = 0
for idx, final_res in enumerate(final_res_batch): for idx, final_res in enumerate(final_res_batch):
assert final_res is not None assert final_res is not None
prompt_token_ids = final_res.prompt_token_ids prompt_token_ids = final_res.prompt_token_ids
embedding = final_res.outputs.embedding
embedding_data = EmbeddingResponseData( if encoding_format == "base64":
index=idx, embedding=final_res.outputs.embedding) embedding = base64.b64encode(np.array(embedding))
embedding_data = EmbeddingResponseData(index=idx, embedding=embedding)
data.append(embedding_data) data.append(embedding_data)
num_prompt_tokens += len(prompt_token_ids) num_prompt_tokens += len(prompt_token_ids)
...@@ -72,10 +73,8 @@ class OpenAIServingEmbedding(OpenAIServing): ...@@ -72,10 +73,8 @@ class OpenAIServingEmbedding(OpenAIServing):
if error_check_ret is not None: if error_check_ret is not None:
return error_check_ret return error_check_ret
# Return error for unsupported features. encoding_format = (request.encoding_format
if request.encoding_format == "base64": if request.encoding_format else "float")
return self.create_error_response(
"base64 encoding is not currently supported")
if request.dimensions is not None: if request.dimensions is not None:
return self.create_error_response( return self.create_error_response(
"dimensions is currently not supported") "dimensions is currently not supported")
...@@ -129,7 +128,8 @@ class OpenAIServingEmbedding(OpenAIServing): ...@@ -129,7 +128,8 @@ class OpenAIServingEmbedding(OpenAIServing):
return self.create_error_response("Client disconnected") return self.create_error_response("Client disconnected")
final_res_batch[i] = res final_res_batch[i] = res
response = request_output_to_embedding_response( response = request_output_to_embedding_response(
final_res_batch, request_id, created_time, model_name) final_res_batch, request_id, created_time, model_name,
encoding_format)
except ValueError as e: except ValueError as e:
# TODO: Use a vllm-specific Validation Error # TODO: Use a vllm-specific Validation Error
return self.create_error_response(str(e)) return self.create_error_response(str(e))
......
...@@ -10,17 +10,25 @@ from vllm.config import ModelConfig ...@@ -10,17 +10,25 @@ from vllm.config import ModelConfig
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
CompletionRequest, CompletionRequest,
DetokenizeRequest,
EmbeddingRequest, ErrorResponse, EmbeddingRequest, ErrorResponse,
ModelCard, ModelList, ModelCard, ModelList,
ModelPermission) ModelPermission, TokenizeRequest)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sequence import Logprob from vllm.sequence import Logprob
from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.transformers_utils.tokenizer import get_tokenizer
logger = init_logger(__name__) logger = init_logger(__name__)
@dataclass
class PromptAdapterPath:
name: str
local_path: str
@dataclass @dataclass
class LoRAModulePath: class LoRAModulePath:
name: str name: str
...@@ -29,12 +37,18 @@ class LoRAModulePath: ...@@ -29,12 +37,18 @@ class LoRAModulePath:
class OpenAIServing: class OpenAIServing:
def __init__(self, engine: AsyncLLMEngine, model_config: ModelConfig, def __init__(
served_model_names: List[str], self,
lora_modules: Optional[List[LoRAModulePath]]): engine: AsyncLLMEngine,
model_config: ModelConfig,
served_model_names: List[str],
lora_modules: Optional[List[LoRAModulePath]],
prompt_adapters: Optional[List[PromptAdapterPath]] = None,
):
super().__init__() super().__init__()
self.engine = engine self.engine = engine
self.model_config = model_config
self.max_model_len = model_config.max_model_len self.max_model_len = model_config.max_model_len
# A separate tokenizer to map token IDs to strings. # A separate tokenizer to map token IDs to strings.
...@@ -47,9 +61,8 @@ class OpenAIServing: ...@@ -47,9 +61,8 @@ class OpenAIServing:
self.served_model_names = served_model_names self.served_model_names = served_model_names
if lora_modules is None: self.lora_requests = []
self.lora_requests = [] if lora_modules is not None:
else:
self.lora_requests = [ self.lora_requests = [
LoRARequest( LoRARequest(
lora_name=lora.name, lora_name=lora.name,
...@@ -58,6 +71,20 @@ class OpenAIServing: ...@@ -58,6 +71,20 @@ class OpenAIServing:
) for i, lora in enumerate(lora_modules, start=1) ) for i, lora in enumerate(lora_modules, start=1)
] ]
self.prompt_adapter_requests = []
if prompt_adapters is not None:
for i, prompt_adapter in enumerate(prompt_adapters, start=1):
with open(f"./{prompt_adapter.local_path}"
f"/adapter_config.json") as f:
adapter_config = json.load(f)
num_virtual_tokens = adapter_config["num_virtual_tokens"]
self.prompt_adapter_requests.append(
PromptAdapterRequest(
prompt_adapter_name=prompt_adapter.name,
prompt_adapter_id=i,
prompt_adapter_local_path=prompt_adapter.local_path,
prompt_adapter_num_virtual_tokens=num_virtual_tokens))
async def show_available_models(self) -> ModelList: async def show_available_models(self) -> ModelList:
"""Show available models. Right now we only have one model.""" """Show available models. Right now we only have one model."""
model_cards = [ model_cards = [
...@@ -73,7 +100,14 @@ class OpenAIServing: ...@@ -73,7 +100,14 @@ class OpenAIServing:
permission=[ModelPermission()]) permission=[ModelPermission()])
for lora in self.lora_requests for lora in self.lora_requests
] ]
prompt_adapter_cards = [
ModelCard(id=prompt_adapter.prompt_adapter_name,
root=self.served_model_names[0],
permission=[ModelPermission()])
for prompt_adapter in self.prompt_adapter_requests
]
model_cards.extend(lora_cards) model_cards.extend(lora_cards)
model_cards.extend(prompt_adapter_cards)
return ModelList(data=model_cards) return ModelList(data=model_cards)
def create_error_response( def create_error_response(
...@@ -99,34 +133,45 @@ class OpenAIServing: ...@@ -99,34 +133,45 @@ class OpenAIServing:
return json_str return json_str
async def _check_model( async def _check_model(
self, request: Union[CompletionRequest, ChatCompletionRequest, self, request: Union[ChatCompletionRequest, CompletionRequest,
EmbeddingRequest] DetokenizeRequest, EmbeddingRequest,
TokenizeRequest]
) -> Optional[ErrorResponse]: ) -> Optional[ErrorResponse]:
if request.model in self.served_model_names: if request.model in self.served_model_names:
return None return None
if request.model in [lora.lora_name for lora in self.lora_requests]: if request.model in [lora.lora_name for lora in self.lora_requests]:
return None return None
if request.model in [
prompt_adapter.prompt_adapter_name
for prompt_adapter in self.prompt_adapter_requests
]:
return None
return self.create_error_response( return self.create_error_response(
message=f"The model `{request.model}` does not exist.", message=f"The model `{request.model}` does not exist.",
err_type="NotFoundError", err_type="NotFoundError",
status_code=HTTPStatus.NOT_FOUND) status_code=HTTPStatus.NOT_FOUND)
def _maybe_get_lora( def _maybe_get_adapter(
self, request: Union[CompletionRequest, ChatCompletionRequest, self, request: Union[CompletionRequest, ChatCompletionRequest,
EmbeddingRequest] EmbeddingRequest]
) -> Optional[LoRARequest]: ) -> Tuple[Optional[str], Optional[Union[LoRARequest,
PromptAdapterRequest]]]:
if request.model in self.served_model_names: if request.model in self.served_model_names:
return None return None, None
for lora in self.lora_requests: for lora in self.lora_requests:
if request.model == lora.lora_name: if request.model == lora.lora_name:
return lora return 'LoRA', lora
for prompt_adapter in self.prompt_adapter_requests:
if request.model == prompt_adapter.prompt_adapter_name:
return 'PromptAdapter', prompt_adapter
# if _check_model has been called earlier, this will be unreachable # if _check_model has been called earlier, this will be unreachable
raise ValueError(f"The model `{request.model}` does not exist.") raise ValueError(f"The model `{request.model}` does not exist.")
def _validate_prompt_and_tokenize( def _validate_prompt_and_tokenize(
self, self,
request: Union[ChatCompletionRequest, CompletionRequest, request: Union[ChatCompletionRequest, CompletionRequest,
EmbeddingRequest], DetokenizeRequest, EmbeddingRequest,
TokenizeRequest],
prompt: Optional[str] = None, prompt: Optional[str] = None,
prompt_ids: Optional[List[int]] = None, prompt_ids: Optional[List[int]] = None,
truncate_prompt_tokens: Optional[Annotated[int, truncate_prompt_tokens: Optional[Annotated[int,
...@@ -174,6 +219,11 @@ class OpenAIServing: ...@@ -174,6 +219,11 @@ class OpenAIServing:
f"generation. Please reduce the length of the input.", ) f"generation. Please reduce the length of the input.", )
return input_ids, input_text return input_ids, input_text
# Note: TokenizeRequest and DetokenizeRequest doesn't have max_tokens
# and does not require model context length validation
if isinstance(request, (TokenizeRequest, DetokenizeRequest)):
return input_ids, input_text
if request.max_tokens is None: if request.max_tokens is None:
if token_num >= self.max_model_len: if token_num >= self.max_model_len:
raise ValueError( raise ValueError(
......
...@@ -5,6 +5,7 @@ if TYPE_CHECKING: ...@@ -5,6 +5,7 @@ if TYPE_CHECKING:
VLLM_HOST_IP: str = "" VLLM_HOST_IP: str = ""
VLLM_PORT: Optional[int] = None VLLM_PORT: Optional[int] = None
VLLM_USE_MODELSCOPE: bool = False VLLM_USE_MODELSCOPE: bool = False
VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60
VLLM_INSTANCE_ID: Optional[str] = None VLLM_INSTANCE_ID: Optional[str] = None
VLLM_NCCL_SO_PATH: Optional[str] = None VLLM_NCCL_SO_PATH: Optional[str] = None
LD_LIBRARY_PATH: Optional[str] = None LD_LIBRARY_PATH: Optional[str] = None
...@@ -27,14 +28,20 @@ if TYPE_CHECKING: ...@@ -27,14 +28,20 @@ if TYPE_CHECKING:
VLLM_TRACE_FUNCTION: int = 0 VLLM_TRACE_FUNCTION: int = 0
VLLM_ATTENTION_BACKEND: Optional[str] = None VLLM_ATTENTION_BACKEND: Optional[str] = None
VLLM_CPU_KVCACHE_SPACE: int = 0 VLLM_CPU_KVCACHE_SPACE: int = 0
VLLM_OPENVINO_KVCACHE_SPACE: int = 0
VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None
VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/"
VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
VLLM_USE_RAY_COMPILED_DAG: bool = False VLLM_USE_RAY_COMPILED_DAG: bool = False
VLLM_WORKER_MULTIPROC_METHOD: str = "spawn" VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
VLLM_IMAGE_FETCH_TIMEOUT: int = 5 VLLM_IMAGE_FETCH_TIMEOUT: int = 5
VLLM_TARGET_DEVICE: str = "cuda" VLLM_TARGET_DEVICE: str = "cuda"
MAX_JOBS: Optional[str] = None MAX_JOBS: Optional[str] = None
NVCC_THREADS: Optional[str] = None NVCC_THREADS: Optional[str] = None
VLLM_USE_PRECOMPILED: bool = False VLLM_USE_PRECOMPILED: bool = False
VLLM_INSTALL_PUNICA_KERNELS: bool = False VLLM_INSTALL_PUNICA_KERNELS: bool = False
VLLM_NO_DEPRECATION_WARNING: bool = False
CMAKE_BUILD_TYPE: Optional[str] = None CMAKE_BUILD_TYPE: Optional[str] = None
VERBOSE: bool = False VERBOSE: bool = False
...@@ -47,7 +54,8 @@ environment_variables: Dict[str, Callable[[], Any]] = { ...@@ -47,7 +54,8 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# ================== Installation Time Env Vars ================== # ================== Installation Time Env Vars ==================
# Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu] # Target device of vLLM, supporting [cuda (by default),
# rocm, neuron, cpu, openvino]
"VLLM_TARGET_DEVICE": "VLLM_TARGET_DEVICE":
lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"), lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),
...@@ -113,6 +121,10 @@ environment_variables: Dict[str, Callable[[], Any]] = { ...@@ -113,6 +121,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
"VLLM_INSTANCE_ID": "VLLM_INSTANCE_ID":
lambda: os.environ.get("VLLM_INSTANCE_ID", None), lambda: os.environ.get("VLLM_INSTANCE_ID", None),
# Interval in seconds to log a warning message when the ring buffer is full
"VLLM_RINGBUFFER_WARNING_INTERVAL":
lambda: int(os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")),
# path to cudatoolkit home directory, under which should be bin, include, # path to cudatoolkit home directory, under which should be bin, include,
# and lib directories. # and lib directories.
"CUDA_HOME": "CUDA_HOME":
...@@ -194,6 +206,7 @@ environment_variables: Dict[str, Callable[[], Any]] = { ...@@ -194,6 +206,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# - "FLASH_ATTN": use FlashAttention # - "FLASH_ATTN": use FlashAttention
# - "XFORMERS": use XFormers # - "XFORMERS": use XFormers
# - "ROCM_FLASH": use ROCmFlashAttention # - "ROCM_FLASH": use ROCmFlashAttention
# - "FLASHINFER": use flashinfer
"VLLM_ATTENTION_BACKEND": "VLLM_ATTENTION_BACKEND":
lambda: os.getenv("VLLM_ATTENTION_BACKEND", None), lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),
...@@ -202,6 +215,22 @@ environment_variables: Dict[str, Callable[[], Any]] = { ...@@ -202,6 +215,22 @@ environment_variables: Dict[str, Callable[[], Any]] = {
"VLLM_CPU_KVCACHE_SPACE": "VLLM_CPU_KVCACHE_SPACE":
lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")), lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),
# OpenVINO key-value cache space
# default is 4GB
"VLLM_OPENVINO_KVCACHE_SPACE":
lambda: int(os.getenv("VLLM_OPENVINO_KVCACHE_SPACE", "0")),
# OpenVINO KV cache precision
# default is bf16 if natively supported by platform, otherwise f16
# To enable KV cache compression, please, explicitly specify u8
"VLLM_OPENVINO_CPU_KV_CACHE_PRECISION":
lambda: os.getenv("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION", None),
# Enables weights compression during model export via HF Optimum
# default is False
"VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS":
lambda: bool(os.getenv("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False)),
# If the env var is set, it uses the Ray's compiled DAG API # If the env var is set, it uses the Ray's compiled DAG API
# which optimizes the control plane overhead. # which optimizes the control plane overhead.
# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
...@@ -211,12 +240,23 @@ environment_variables: Dict[str, Callable[[], Any]] = { ...@@ -211,12 +240,23 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# Use dedicated multiprocess context for workers. # Use dedicated multiprocess context for workers.
# Both spawn and fork work # Both spawn and fork work
"VLLM_WORKER_MULTIPROC_METHOD": "VLLM_WORKER_MULTIPROC_METHOD":
lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"), lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),
# Timeout for fetching images when serving multimodal models # Timeout for fetching images when serving multimodal models
# Default is 5 seconds # Default is 5 seconds
"VLLM_IMAGE_FETCH_TIMEOUT": "VLLM_IMAGE_FETCH_TIMEOUT":
lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")), lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),
# Path to the XLA persistent cache directory.
# Only used for XLA devices such as TPUs.
"VLLM_XLA_CACHE_PATH":
lambda: os.getenv("VLLM_XLA_CACHE_PATH", "~/.vllm/xla_cache/"),
"VLLM_FUSED_MOE_CHUNK_SIZE":
lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "65536")),
# If set, vllm will skip the deprecation warnings.
"VLLM_NO_DEPRECATION_WARNING":
lambda: bool(int(os.getenv("VLLM_NO_DEPRECATION_WARNING", "0"))),
} }
# end-env-vars-definition # end-env-vars-definition
......
...@@ -7,6 +7,7 @@ from vllm.config import CacheConfig, ModelConfig, SchedulerConfig ...@@ -7,6 +7,7 @@ from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
make_async) make_async)
...@@ -46,8 +47,9 @@ class CPUExecutor(ExecutorBase): ...@@ -46,8 +47,9 @@ class CPUExecutor(ExecutorBase):
rank=0, rank=0,
distributed_init_method=distributed_init_method, distributed_init_method=distributed_init_method,
lora_config=self.lora_config, lora_config=self.lora_config,
vision_language_config=self.vision_language_config, multimodal_config=self.multimodal_config,
kv_cache_dtype=self.cache_config.cache_dtype, kv_cache_dtype=self.cache_config.cache_dtype,
prompt_adapter_config=self.prompt_adapter_config,
is_driver_worker=True, is_driver_worker=True,
) )
self.driver_worker.init_device() self.driver_worker.init_device()
...@@ -84,9 +86,25 @@ class CPUExecutor(ExecutorBase): ...@@ -84,9 +86,25 @@ class CPUExecutor(ExecutorBase):
def remove_lora(self, lora_id: int) -> bool: def remove_lora(self, lora_id: int) -> bool:
return self.driver_worker.remove_lora(lora_id) return self.driver_worker.remove_lora(lora_id)
def pin_lora(self, lora_id: int) -> bool:
return self.driver_worker.pin_lora(lora_id)
def list_loras(self) -> Set[int]: def list_loras(self) -> Set[int]:
return self.driver_worker.list_loras() return self.driver_worker.list_loras()
def add_prompt_adapter(
self, prompt_adapter_request: PromptAdapterRequest) -> bool:
return self.driver_worker.add_prompt_adapter(prompt_adapter_request)
def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
return self.driver_worker.remove_prompt_adapter(prompt_adapter_id)
def list_prompt_adapters(self) -> Set[int]:
return self.driver_worker.list_prompt_adapters()
def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
return self.driver_worker.pin_prompt_adapter(prompt_adapter_id)
def check_health(self) -> None: def check_health(self) -> None:
# CPUExecutor will always be healthy as long as # CPUExecutor will always be healthy as long as
# it's running. # it's running.
......
...@@ -64,12 +64,12 @@ class DistributedGPUExecutor(GPUExecutor): ...@@ -64,12 +64,12 @@ class DistributedGPUExecutor(GPUExecutor):
num_cpu_blocks=num_cpu_blocks) num_cpu_blocks=num_cpu_blocks)
def execute_model( def execute_model(
self, self, execute_model_req: ExecuteModelRequest
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: ) -> Optional[List[SamplerOutput]]:
if self.parallel_worker_tasks is None: if self.parallel_worker_tasks is None:
self.parallel_worker_tasks = self._run_workers( self.parallel_worker_tasks = self._run_workers(
"start_worker_execution_loop", "start_worker_execution_loop",
async_run_remote_workers_only=True, async_run_tensor_parallel_workers_only=True,
**self.extra_execute_model_run_workers_kwargs) **self.extra_execute_model_run_workers_kwargs)
# Only the driver worker returns the sampling results. # Only the driver worker returns the sampling results.
...@@ -79,7 +79,7 @@ class DistributedGPUExecutor(GPUExecutor): ...@@ -79,7 +79,7 @@ class DistributedGPUExecutor(GPUExecutor):
if self.parallel_worker_tasks is None: if self.parallel_worker_tasks is None:
return return
self._driver_execute_model() self._driver_execute_model(execute_model_req=None)
parallel_worker_tasks = self.parallel_worker_tasks parallel_worker_tasks = self.parallel_worker_tasks
self.parallel_worker_tasks = None self.parallel_worker_tasks = None
# Ensure that workers exit model loop cleanly # Ensure that workers exit model loop cleanly
...@@ -100,6 +100,13 @@ class DistributedGPUExecutor(GPUExecutor): ...@@ -100,6 +100,13 @@ class DistributedGPUExecutor(GPUExecutor):
lora_id=lora_id, lora_id=lora_id,
) )
def pin_lora(self, lora_id: int) -> bool:
assert lora_id > 0, "lora_id must be greater than 0."
return self._run_workers(
"pin_lora",
lora_id=lora_id,
)
def list_loras(self) -> Set[int]: def list_loras(self) -> Set[int]:
return self._run_workers("list_loras") return self._run_workers("list_loras")
...@@ -116,13 +123,13 @@ class DistributedGPUExecutor(GPUExecutor): ...@@ -116,13 +123,13 @@ class DistributedGPUExecutor(GPUExecutor):
@abstractmethod @abstractmethod
def _driver_execute_model( def _driver_execute_model(
self, self, execute_model_req: Optional[ExecuteModelRequest]
execute_model_req: Optional[ExecuteModelRequest] = None ) -> Optional[List[SamplerOutput]]:
) -> List[SamplerOutput]:
"""Run execute_model in the driver worker. """Run execute_model in the driver worker.
Passing None will cause the driver to stop the model execution Passing None will cause the driver to stop the model execution loop
loop running in each of the remote workers. running in each of the remote workers. In this case, this method
returns None. Otherwise, this method returns the model output.
""" """
raise NotImplementedError raise NotImplementedError
...@@ -131,17 +138,17 @@ class DistributedGPUExecutor(GPUExecutor): ...@@ -131,17 +138,17 @@ class DistributedGPUExecutor(GPUExecutor):
self, self,
method: str, method: str,
*args, *args,
async_run_remote_workers_only: bool = False, async_run_tensor_parallel_workers_only: bool = False,
max_concurrent_workers: Optional[int] = None, max_concurrent_workers: Optional[int] = None,
**kwargs, **kwargs,
) -> Any: ) -> Any:
"""Runs the given method on all workers. """Runs the given method on all workers.
Args: Args:
async_run_remote_workers_only: If True the method will be run only async_run_tensor_parallel_workers_only: If True the method will be
in the remote workers, not the driver worker. It will also be run only in the remote TP workers, not the driver worker.
run asynchronously and return a list of futures rather than It will also be run asynchronously and return a list of futures
blocking on the results. rather than blocking on the results.
""" """
raise NotImplementedError raise NotImplementedError
......
import asyncio
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import List, Optional, Set, Tuple from typing import List, Optional, Set, Tuple
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, ParallelConfig, SchedulerConfig, ModelConfig, MultiModalConfig, ParallelConfig,
SpeculativeConfig, VisionLanguageConfig) PromptAdapterConfig, SchedulerConfig,
SpeculativeConfig)
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.sequence import ExecuteModelRequest, SamplerOutput
...@@ -25,8 +28,9 @@ class ExecutorBase(ABC): ...@@ -25,8 +28,9 @@ class ExecutorBase(ABC):
device_config: DeviceConfig, device_config: DeviceConfig,
load_config: LoadConfig, load_config: LoadConfig,
lora_config: Optional[LoRAConfig], lora_config: Optional[LoRAConfig],
vision_language_config: Optional[VisionLanguageConfig], multimodal_config: Optional[MultiModalConfig],
speculative_config: Optional[SpeculativeConfig], speculative_config: Optional[SpeculativeConfig],
prompt_adapter_config: Optional[PromptAdapterConfig],
) -> None: ) -> None:
self.model_config = model_config self.model_config = model_config
self.cache_config = cache_config self.cache_config = cache_config
...@@ -35,8 +39,9 @@ class ExecutorBase(ABC): ...@@ -35,8 +39,9 @@ class ExecutorBase(ABC):
self.parallel_config = parallel_config self.parallel_config = parallel_config
self.scheduler_config = scheduler_config self.scheduler_config = scheduler_config
self.device_config = device_config self.device_config = device_config
self.vision_language_config = vision_language_config self.multimodal_config = multimodal_config
self.speculative_config = speculative_config self.speculative_config = speculative_config
self.prompt_adapter_config = prompt_adapter_config
self._init_executor() self._init_executor()
...@@ -69,8 +74,8 @@ class ExecutorBase(ABC): ...@@ -69,8 +74,8 @@ class ExecutorBase(ABC):
@abstractmethod @abstractmethod
def execute_model( def execute_model(
self, self, execute_model_req: ExecuteModelRequest
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: ) -> Optional[List[SamplerOutput]]:
"""Executes at least one model step on the given sequences.""" """Executes at least one model step on the given sequences."""
raise NotImplementedError raise NotImplementedError
...@@ -86,10 +91,31 @@ class ExecutorBase(ABC): ...@@ -86,10 +91,31 @@ class ExecutorBase(ABC):
def remove_lora(self, lora_id: int) -> bool: def remove_lora(self, lora_id: int) -> bool:
raise NotImplementedError raise NotImplementedError
@abstractmethod
def pin_lora(self, lora_id: int) -> bool:
raise NotImplementedError # type: ignore
@abstractmethod @abstractmethod
def list_loras(self) -> Set[int]: def list_loras(self) -> Set[int]:
raise NotImplementedError raise NotImplementedError
@abstractmethod
def add_prompt_adapter(
self, prompt_adapter_request: PromptAdapterRequest) -> bool:
raise NotImplementedError
@abstractmethod
def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
raise NotImplementedError
@abstractmethod
def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
raise NotImplementedError # type: ignore
@abstractmethod
def list_prompt_adapters(self) -> Set[int]:
raise NotImplementedError
@abstractmethod @abstractmethod
def check_health(self) -> None: def check_health(self) -> None:
"""Checks if the executor is healthy. If not, it should raise an """Checks if the executor is healthy. If not, it should raise an
...@@ -106,6 +132,26 @@ class ExecutorBase(ABC): ...@@ -106,6 +132,26 @@ class ExecutorBase(ABC):
class ExecutorAsyncBase(ExecutorBase): class ExecutorAsyncBase(ExecutorBase):
def __init__(
self,
model_config: ModelConfig,
cache_config: CacheConfig,
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
device_config: DeviceConfig,
load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
speculative_config: Optional[SpeculativeConfig],
prompt_adapter_config: Optional[PromptAdapterConfig],
) -> None:
self.pp_locks: Optional[List[asyncio.Lock]] = None
super().__init__(model_config, cache_config, parallel_config,
scheduler_config, device_config, load_config,
lora_config, multimodal_config, speculative_config,
prompt_adapter_config)
@abstractmethod @abstractmethod
async def execute_model_async( async def execute_model_async(
self, self,
......
...@@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union ...@@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.prompt_adapter.request import PromptAdapterRequest
from vllm.sequence import ExecuteModelRequest, PoolerOutput, SamplerOutput from vllm.sequence import ExecuteModelRequest, PoolerOutput, SamplerOutput
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
make_async) make_async)
...@@ -43,9 +44,11 @@ class GPUExecutor(ExecutorBase): ...@@ -43,9 +44,11 @@ class GPUExecutor(ExecutorBase):
rank=rank, rank=rank,
distributed_init_method=distributed_init_method, distributed_init_method=distributed_init_method,
lora_config=self.lora_config, lora_config=self.lora_config,
vision_language_config=self.vision_language_config, multimodal_config=self.multimodal_config,
speculative_config=self.speculative_config, speculative_config=self.speculative_config,
is_driver_worker=rank == 0, prompt_adapter_config=self.prompt_adapter_config,
is_driver_worker=(not self.parallel_config)
or (rank % self.parallel_config.tensor_parallel_size == 0),
) )
def _create_worker(self, def _create_worker(self,
...@@ -87,7 +90,7 @@ class GPUExecutor(ExecutorBase): ...@@ -87,7 +90,7 @@ class GPUExecutor(ExecutorBase):
def execute_model( def execute_model(
self, execute_model_req: ExecuteModelRequest self, execute_model_req: ExecuteModelRequest
) -> List[Union[SamplerOutput, PoolerOutput]]: ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
output = self.driver_worker.execute_model(execute_model_req) output = self.driver_worker.execute_model(execute_model_req)
return output return output
...@@ -99,9 +102,32 @@ class GPUExecutor(ExecutorBase): ...@@ -99,9 +102,32 @@ class GPUExecutor(ExecutorBase):
assert lora_id > 0, "lora_id must be greater than 0." assert lora_id > 0, "lora_id must be greater than 0."
return self.driver_worker.remove_lora(lora_id) return self.driver_worker.remove_lora(lora_id)
def pin_lora(self, lora_id: int) -> bool:
assert lora_id > 0, "lora_id must be greater than 0."
return self.driver_worker.pin_lora(lora_id)
def list_loras(self) -> Set[int]: def list_loras(self) -> Set[int]:
return self.driver_worker.list_loras() return self.driver_worker.list_loras()
def add_prompt_adapter(
self, prompt_adapter_request: PromptAdapterRequest) -> bool:
assert prompt_adapter_request.prompt_adapter_id > 0, \
"prompt_adapter_id must be greater than 0."
return self.driver_worker.add_prompt_adapter(prompt_adapter_request)
def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
assert prompt_adapter_id > 0, \
"prompt_adapter_id must be greater than 0."
return self.driver_worker.remove_prompt_adapter(prompt_adapter_id)
def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
assert prompt_adapter_id > 0, \
"prompt_adapter_id must be greater than 0."
return self.driver_worker.pin_prompt_adapter(prompt_adapter_id)
def list_prompt_adapters(self) -> Set[int]:
return self.driver_worker.list_prompt_adapters()
def check_health(self) -> None: def check_health(self) -> None:
# GPUExecutor will always be healthy as long as # GPUExecutor will always be healthy as long as
# it's running. # it's running.
......
...@@ -9,8 +9,12 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, ...@@ -9,8 +9,12 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
ResultHandler, WorkerMonitor) ResultHandler, WorkerMonitor)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, from vllm.triton_utils import maybe_set_triton_cache_manager
get_vllm_instance_id, make_async) from vllm.utils import (cuda_device_count_stateless,
error_on_invalid_device_count_status,
get_distributed_init_method, get_open_port,
get_vllm_instance_id, make_async,
update_environment_variables)
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -24,8 +28,9 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor): ...@@ -24,8 +28,9 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
# Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
if "CUDA_VISIBLE_DEVICES" not in os.environ: if "CUDA_VISIBLE_DEVICES" not in os.environ:
os.environ["CUDA_VISIBLE_DEVICES"] = (",".join( update_environment_variables({
map(str, range(world_size)))) "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
})
# Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers # Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers
os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id() os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id()
...@@ -33,12 +38,25 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor): ...@@ -33,12 +38,25 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
# Disable torch async compiling which won't work with daemonic processes # Disable torch async compiling which won't work with daemonic processes
os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1" os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
from torch.cuda import device_count # Set OMP_NUM_THREADS to 1 if it is not set explicitly, avoids CPU
assert world_size <= device_count(), ( # contention amongst the shards
if "OMP_NUM_THREADS" not in os.environ:
os.environ["OMP_NUM_THREADS"] = "1"
# workaround for https://github.com/vllm-project/vllm/issues/6103
if world_size > 1:
maybe_set_triton_cache_manager()
assert world_size <= cuda_device_count_stateless(), (
"please set tensor_parallel_size to less than max local gpu count") "please set tensor_parallel_size to less than max local gpu count")
error_on_invalid_device_count_status()
# Multiprocessing-based executor does not support multi-node setting.
# Since it only works for single node, we can use the loopback address
# 127.0.0.1 for communication.
distributed_init_method = get_distributed_init_method( distributed_init_method = get_distributed_init_method(
get_ip(), get_open_port()) "127.0.0.1", get_open_port())
if world_size == 1: if world_size == 1:
self.workers = [] self.workers = []
...@@ -73,32 +91,30 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor): ...@@ -73,32 +91,30 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
worker_monitor.close() worker_monitor.close()
def _driver_execute_model( def _driver_execute_model(
self, self, execute_model_req: Optional[ExecuteModelRequest]
execute_model_req: Optional[ExecuteModelRequest] = None ) -> Optional[List[SamplerOutput]]:
) -> List[SamplerOutput]:
"""Run execute_model in the driver worker. """Run execute_model in the driver worker.
Passing None will cause the driver to stop the model execution Passing None will cause the driver to stop the model execution
loop running in each of the remote workers. loop running in each of the remote workers.
""" """
return self.driver_worker.execute_model( return self.driver_worker.execute_model(execute_model_req)
execute_model_req=execute_model_req)
def _run_workers( def _run_workers(
self, self,
method: str, method: str,
*args, *args,
async_run_remote_workers_only: bool = False, async_run_tensor_parallel_workers_only: bool = False,
max_concurrent_workers: Optional[int] = None, max_concurrent_workers: Optional[int] = None,
**kwargs, **kwargs,
) -> Any: ) -> Any:
"""Runs the given method on all workers. """Runs the given method on all workers.
Args: Args:
async_run_remote_workers_only: If True the method will be run only async_run_tensor_parallel_workers_only: If True the method will be
in the remote workers, not the driver worker. It will also be run only in the remote TP workers, not the driver worker.
run asynchronously and return a list of futures rather than It will also be run asynchronously and return a list of futures
blocking on the results. rather than blocking on the results.
""" """
if max_concurrent_workers: if max_concurrent_workers:
...@@ -111,7 +127,7 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor): ...@@ -111,7 +127,7 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
for worker in self.workers for worker in self.workers
] ]
if async_run_remote_workers_only: if async_run_tensor_parallel_workers_only:
# Just return futures # Just return futures
return worker_outputs return worker_outputs
......
...@@ -48,15 +48,14 @@ class NeuronExecutor(ExecutorBase): ...@@ -48,15 +48,14 @@ class NeuronExecutor(ExecutorBase):
def execute_model( def execute_model(
self, self,
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]: execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
assert (execute_model_req.blocks_to_swap_in == {} assert (not execute_model_req.blocks_to_swap_in
and execute_model_req.blocks_to_swap_out == {} and not execute_model_req.blocks_to_swap_out
and execute_model_req.blocks_to_copy == {}), ( and not execute_model_req.blocks_to_copy), (
"Cache operations are not supported for Neuron backend.") "Cache operations are not supported for Neuron backend.")
assert execute_model_req.num_lookahead_slots == 0, ( assert execute_model_req.num_lookahead_slots == 0, (
"lookahead not supported for Neuron backend.") "lookahead not supported for Neuron backend.")
output = self.driver_worker.execute_model( output = self.driver_worker.execute_model(execute_model_req)
execute_model_req.seq_group_metadata_list)
return output return output
def add_lora(self, lora_request: LoRARequest) -> bool: def add_lora(self, lora_request: LoRARequest) -> bool:
...@@ -65,9 +64,28 @@ class NeuronExecutor(ExecutorBase): ...@@ -65,9 +64,28 @@ class NeuronExecutor(ExecutorBase):
def remove_lora(self, lora_id: int) -> bool: def remove_lora(self, lora_id: int) -> bool:
return self.driver_worker.remove_lora(lora_id) return self.driver_worker.remove_lora(lora_id)
def pin_lora(self, lora_id: int) -> bool:
return self.driver_worker.pin_lora(lora_id)
def list_loras(self) -> Set[int]: def list_loras(self) -> Set[int]:
return self.driver_worker.list_loras() return self.driver_worker.list_loras()
def add_prompt_adapter(self, prompt_adapter_request) -> bool:
raise NotImplementedError(
"Soft prompt is currently not supported by the Neuron backend.")
def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
raise NotImplementedError(
"Soft prompt is currently not supported by the Neuron backend.")
def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
raise NotImplementedError(
"Soft prompt is currently not supported by the Neuron backend.")
def list_prompt_adapters(self) -> Set[int]:
raise NotImplementedError(
"Soft prompt is currently not supported by the Neuron backend.")
def check_health(self) -> None: def check_health(self) -> None:
# NeuronExecutor will always be healthy as long as # NeuronExecutor will always be healthy as long as
# it's running. # it's running.
......
from typing import List, Set, Tuple
import openvino as ov
import openvino.properties.hint as hints
import torch
import vllm.envs as envs
from vllm.config import CacheConfig, ModelConfig
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
make_async)
logger = init_logger(__name__)
class OpenVINOExecutor(ExecutorBase):
def _init_executor(self) -> None:
assert self.device_config.device_type == "openvino"
assert self.lora_config is None, "OpenVINO backend doesn't support LoRA"
self.model_config = _verify_and_get_model_config(self.model_config)
self.cache_config = _verify_and_get_cache_config(self.cache_config)
# Instantiate the worker and load the model to CPU.
self._init_worker()
def _init_worker(self):
from vllm.worker.openvino_worker import OpenVINOWorker
assert (
self.parallel_config.world_size == 1
), "OpenVINOExecutor only supports single CPU socket currently."
distributed_init_method = get_distributed_init_method(
get_ip(), get_open_port())
self.driver_worker = OpenVINOWorker(
model_config=self.model_config,
parallel_config=self.parallel_config,
scheduler_config=self.scheduler_config,
device_config=self.device_config,
cache_config=self.cache_config,
load_config=self.load_config,
local_rank=0,
rank=0,
distributed_init_method=distributed_init_method,
lora_config=self.lora_config,
multimodal_config=self.multimodal_config,
kv_cache_dtype=self.cache_config.cache_dtype,
is_driver_worker=True,
)
self.driver_worker.init_device()
self.driver_worker.load_model()
def determine_num_available_blocks(self) -> Tuple[int, int]:
"""Determine the number of available KV blocks by invoking the
underlying worker.
"""
return self.driver_worker.determine_num_available_blocks()
def initialize_cache(self, num_gpu_blocks: int,
num_cpu_blocks: int) -> None:
"""Initialize the KV cache by invoking the underlying worker."""
# NOTE: We log here to avoid multiple logs when number of workers is
# greater than one. We could log in the engine, but not all executors
# have GPUs.
# NOTE: `cpu block` for OpenVINO backend is located on CPU memory but is
# referred as `gpu block`. Because we want to reuse the existing block
# management procedure.
logger.info("# CPU blocks: %d", num_gpu_blocks)
self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
def execute_model(
self,
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
output = self.driver_worker.execute_model(execute_model_req)
return output
def add_lora(self, lora_request: LoRARequest) -> bool:
return self.driver_worker.add_lora(lora_request)
def remove_lora(self, lora_id: int) -> bool:
return self.driver_worker.remove_lora(lora_id)
def pin_lora(self, lora_id: int) -> bool:
return self.driver_worker.pin_lora(lora_id)
def list_loras(self) -> Set[int]:
return self.driver_worker.list_loras()
def add_prompt_adapter(self, prompt_adapter_request) -> bool:
raise NotImplementedError(
"Soft prompt is currently not supported by the OPENVINO backend.")
def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
raise NotImplementedError(
"Soft prompt is currently not supported by the OPENVINO backend.")
def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
raise NotImplementedError(
"Soft prompt is currently not supported by the OPENVINO backend.")
def list_prompt_adapters(self) -> Set[int]:
raise NotImplementedError(
"Soft prompt is currently not supported by the OPENVINO backend.")
def check_health(self) -> None:
# OpenVINOExecutor will always be healthy as long as
# it's running.
return
class OpenVINOExecutorAsync(OpenVINOExecutor, ExecutorAsyncBase):
async def execute_model_async(
self,
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
output = await make_async(self.driver_worker.execute_model
)(execute_model_req=execute_model_req, )
return output
async def check_health_async(self) -> None:
# OpenVINOExecutor will always be healthy as long as
# it's running.
return
def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
if config.dtype != torch.float32:
logger.warning(
f"Only float32 dtype is supported on OpenVINO, casting from {config.dtype}." # noqa: G004, E501
)
config.dtype = torch.float32
if not config.enforce_eager:
logger.warning(
"CUDA graph is not supported on OpenVINO backend, fallback to the "
"eager mode.")
config.enforce_eager = True
return config
def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
logger.info("KV cache type is overried to u8 via "
"VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.")
config.cache_dtype = ov.Type.u8
else:
core = ov.Core()
inference_precision = core.get_property("CPU",
hints.inference_precision)
if inference_precision == ov.Type.bf16:
config.cache_dtype = ov.Type.bf16
else:
config.cache_dtype = ov.Type.f16
if config.block_size != 32:
logger.info(
f"OpenVINO optimal block size is 32, overriding currently set {config.block_size}" # noqa: G004, E501
)
config.block_size = 32
kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE
if kv_cache_space >= 0:
_GB = 1 << 30
if kv_cache_space == 0:
config.openvino_kvcache_space_bytes = 4 * _GB # type: ignore
logger.warning(
"Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
"for OpenVINO backend is not set, using 4 by default.")
else:
config.openvino_kvcache_space_bytes = kv_cache_space * _GB # type: ignore
else:
raise RuntimeError(
"Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
f" {kv_cache_space}, expect a positive integer value.")
return config
...@@ -11,7 +11,8 @@ from vllm.executor.distributed_gpu_executor import ( # yapf: disable ...@@ -11,7 +11,8 @@ from vllm.executor.distributed_gpu_executor import ( # yapf: disable
from vllm.executor.ray_utils import RayWorkerWrapper, ray from vllm.executor.ray_utils import RayWorkerWrapper, ray
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.sequence import ExecuteModelRequest, SamplerOutput from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port, from vllm.utils import (error_on_invalid_device_count_status,
get_distributed_init_method, get_ip, get_open_port,
get_vllm_instance_id, make_async) get_vllm_instance_id, make_async)
if ray is not None: if ray is not None:
...@@ -62,7 +63,8 @@ class RayGPUExecutor(DistributedGPUExecutor): ...@@ -62,7 +63,8 @@ class RayGPUExecutor(DistributedGPUExecutor):
def _init_workers_ray(self, placement_group: "PlacementGroup", def _init_workers_ray(self, placement_group: "PlacementGroup",
**ray_remote_kwargs): **ray_remote_kwargs):
if self.parallel_config.tensor_parallel_size == 1: if (self.parallel_config.tensor_parallel_size == 1
and self.parallel_config.pipeline_parallel_size == 1):
# For single GPU case, we use a ray worker with constrained memory. # For single GPU case, we use a ray worker with constrained memory.
num_gpus = self.cache_config.gpu_memory_utilization num_gpus = self.cache_config.gpu_memory_utilization
else: else:
...@@ -132,11 +134,38 @@ class RayGPUExecutor(DistributedGPUExecutor): ...@@ -132,11 +134,38 @@ class RayGPUExecutor(DistributedGPUExecutor):
worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids", worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
use_dummy_driver=True) use_dummy_driver=True)
node_workers = defaultdict(list) # the order in `worker_node_and_gpu_ids` does not necessarily match
node_gpus = defaultdict(list) # the machine boundaries. We need to make sure that workers in the
# same node are assigned consecutive ranks.
for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): # examples:
node_workers[node_id].append(i) # [('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [0]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [0]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [1]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [2]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [3]), ('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [1]), ('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [2]), ('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [3])] # noqa
# initialize worker ranks with -1 (unassigned)
worker_ranks = [-1 for x in worker_node_and_gpu_ids]
current_rank = 0
while -1 in worker_ranks:
# whenever we find an unassigned worker, find the node
index = worker_ranks.index(-1)
current_node_id = worker_node_and_gpu_ids[index][0]
# assign ranks to all workers in the same node
for i, (node_id, _) in enumerate(worker_node_and_gpu_ids):
if node_id == current_node_id:
worker_ranks[i] = current_rank
current_rank += 1
# with the above example, worker_ranks will be [0, 4, 5, 6, 7, 1, 2, 3]
node_workers = defaultdict(list) # node id -> list of worker ranks
node_gpus = defaultdict(list) # node id -> list of gpu ids
for worker_rank, (node_id, gpu_ids) in zip(worker_ranks,
worker_node_and_gpu_ids):
node_workers[node_id].append(worker_rank)
# `gpu_ids` can be a list of strings or integers.
# convert them to integers for consistency.
# NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs),
# string sorting is not sufficient.
# see https://github.com/vllm-project/vllm/issues/5590
gpu_ids = [int(x) for x in gpu_ids]
node_gpus[node_id].extend(gpu_ids) node_gpus[node_id].extend(gpu_ids)
for node_id, gpu_ids in node_gpus.items(): for node_id, gpu_ids in node_gpus.items():
node_gpus[node_id] = sorted(gpu_ids) node_gpus[node_id] = sorted(gpu_ids)
...@@ -155,16 +184,29 @@ class RayGPUExecutor(DistributedGPUExecutor): ...@@ -155,16 +184,29 @@ class RayGPUExecutor(DistributedGPUExecutor):
self._run_workers("update_environment_variables", self._run_workers("update_environment_variables",
all_args=all_args_to_update_environment_variables) all_args=all_args_to_update_environment_variables)
if len(node_gpus) == 1:
# in single node case, we don't need to get the IP address.
# the loopback address is sufficient
# NOTE: a node may have several IP addresses, one for each
# network interface. `get_ip()` might return any of them,
# while they might not work for communication inside the node
# if the network setup is complicated. Using the loopback address
# solves this issue, as it always works for communication inside
# the node.
driver_ip = "127.0.0.1"
distributed_init_method = get_distributed_init_method( distributed_init_method = get_distributed_init_method(
driver_ip, get_open_port()) driver_ip, get_open_port())
error_on_invalid_device_count_status()
# Initialize the actual workers inside worker wrapper. # Initialize the actual workers inside worker wrapper.
init_worker_all_kwargs = [ init_worker_all_kwargs = [
self._get_worker_kwargs( self._get_worker_kwargs(
local_rank=node_workers[node_id].index(rank), local_rank=node_workers[node_id].index(rank),
rank=rank, rank=rank,
distributed_init_method=distributed_init_method, distributed_init_method=distributed_init_method,
) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) ) for rank, (node_id,
_) in zip(worker_ranks, worker_node_and_gpu_ids)
] ]
self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
...@@ -173,10 +215,26 @@ class RayGPUExecutor(DistributedGPUExecutor): ...@@ -173,10 +215,26 @@ class RayGPUExecutor(DistributedGPUExecutor):
max_concurrent_workers=self.parallel_config. max_concurrent_workers=self.parallel_config.
max_parallel_loading_workers) max_parallel_loading_workers)
# This is the list of workers that are rank 0 of each TP group EXCEPT
# global rank 0. These are the workers that will broadcast to the
# rest of the workers.
self.tp_driver_workers: List[RayWorkerWrapper] = []
# This is the list of workers that are not drivers and not the first
# worker in a TP group. These are the workers that will be
# broadcasted to.
self.non_driver_workers: List[RayWorkerWrapper] = []
for idx, rank in enumerate(worker_ranks[1:]):
# We need to skip the driver worker, which we
# do by skipping worker_ranks[0] which is always 0.
if rank % self.parallel_config.tensor_parallel_size == 0:
self.tp_driver_workers.append(self.workers[idx])
else:
self.non_driver_workers.append(self.workers[idx])
def _driver_execute_model( def _driver_execute_model(
self, self, execute_model_req: Optional[ExecuteModelRequest]
execute_model_req: Optional[ExecuteModelRequest] = None ) -> Optional[List[SamplerOutput]]:
) -> List[SamplerOutput]:
"""Run execute_model in the driver worker. """Run execute_model in the driver worker.
Passing None will cause the driver to stop the model execution Passing None will cause the driver to stop the model execution
...@@ -189,7 +247,7 @@ class RayGPUExecutor(DistributedGPUExecutor): ...@@ -189,7 +247,7 @@ class RayGPUExecutor(DistributedGPUExecutor):
self, self,
method: str, method: str,
*args, *args,
async_run_remote_workers_only: bool = False, async_run_tensor_parallel_workers_only: bool = False,
all_args: Optional[List[Tuple[Any, ...]]] = None, all_args: Optional[List[Tuple[Any, ...]]] = None,
all_kwargs: Optional[List[Dict[str, Any]]] = None, all_kwargs: Optional[List[Dict[str, Any]]] = None,
use_dummy_driver: bool = False, use_dummy_driver: bool = False,
...@@ -200,10 +258,11 @@ class RayGPUExecutor(DistributedGPUExecutor): ...@@ -200,10 +258,11 @@ class RayGPUExecutor(DistributedGPUExecutor):
"""Runs the given method on all workers. Can be used in the following """Runs the given method on all workers. Can be used in the following
ways: ways:
- async_run_remote_workers_only: If True the method will be run only Args:
in the remote workers, not the driver worker. It will also be - async_run_tensor_parallel_workers_only: If True the method will be
run asynchronously and return a list of futures rather than blocking run only in the remote TP workers, not the driver worker.
on the results. It will also be run asynchronously and return a list of futures
rather than blocking on the results.
- args/kwargs: All workers share the same args/kwargs - args/kwargs: All workers share the same args/kwargs
- all_args/all_kwargs: args/kwargs for each worker are specified - all_args/all_kwargs: args/kwargs for each worker are specified
individually individually
...@@ -213,7 +272,9 @@ class RayGPUExecutor(DistributedGPUExecutor): ...@@ -213,7 +272,9 @@ class RayGPUExecutor(DistributedGPUExecutor):
raise NotImplementedError( raise NotImplementedError(
"max_concurrent_workers is not supported yet.") "max_concurrent_workers is not supported yet.")
count = len(self.workers) count = len(self.workers) if not \
async_run_tensor_parallel_workers_only \
else len(self.non_driver_workers)
all_worker_args = repeat(args, count) if all_args is None \ all_worker_args = repeat(args, count) if all_args is None \
else islice(all_args, 1, None) else islice(all_args, 1, None)
all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \ all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
...@@ -227,14 +288,17 @@ class RayGPUExecutor(DistributedGPUExecutor): ...@@ -227,14 +288,17 @@ class RayGPUExecutor(DistributedGPUExecutor):
ray_worker_outputs = [] ray_worker_outputs = []
else: else:
# Start the ray workers first. # Start the ray workers first.
ray_workers = self.workers
if async_run_tensor_parallel_workers_only:
ray_workers = self.non_driver_workers
ray_worker_outputs = [ ray_worker_outputs = [
worker.execute_method.remote(method, *worker_args, worker.execute_method.remote(method, *worker_args,
**worker_kwargs) **worker_kwargs)
for (worker, worker_args, worker_kwargs for (worker, worker_args, worker_kwargs
) in zip(self.workers, all_worker_args, all_worker_kwargs) ) in zip(ray_workers, all_worker_args, all_worker_kwargs)
] ]
if async_run_remote_workers_only: if async_run_tensor_parallel_workers_only:
# Just return futures # Just return futures
return ray_worker_outputs return ray_worker_outputs
...@@ -304,12 +368,41 @@ class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync): ...@@ -304,12 +368,41 @@ class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync):
self, self,
execute_model_req: Optional[ExecuteModelRequest] = None execute_model_req: Optional[ExecuteModelRequest] = None
) -> List[SamplerOutput]: ) -> List[SamplerOutput]:
return await self.driver_exec_method("execute_model", if self.pp_locks is None:
execute_model_req) # This locks each pipeline parallel stage so multiple virtual
# engines can't execute on the same stage at the same time
# We create the locks here to avoid creating them in the constructor
# which uses a different asyncio loop.
self.pp_locks = [
asyncio.Lock()
for _ in range(self.parallel_config.pipeline_parallel_size)
]
async def _run_task_with_lock(task, lock, *args, **kwargs):
async with lock:
return await task(*args, **kwargs)
tasks = []
tasks.append(
asyncio.create_task(
_run_task_with_lock(self.driver_exec_method, self.pp_locks[0],
"execute_model", execute_model_req)))
for pp_rank, driver_worker in enumerate(self.tp_driver_workers,
start=1):
tasks.append(
asyncio.create_task(
_run_task_with_lock(driver_worker.execute_method.remote,
self.pp_locks[pp_rank],
"execute_model", execute_model_req)))
results = await asyncio.gather(*tasks)
# Only the last PP stage has the final results.
return results[-1]
async def _start_worker_execution_loop(self): async def _start_worker_execution_loop(self):
coros = [ coros = [
worker.execute_method.remote("start_worker_execution_loop") worker.execute_method.remote("start_worker_execution_loop")
for worker in self.workers for worker in self.non_driver_workers
] ]
return await asyncio.gather(*coros) return await asyncio.gather(*coros)
...@@ -3,7 +3,7 @@ from typing import List, Optional, Tuple ...@@ -3,7 +3,7 @@ from typing import List, Optional, Tuple
from vllm.config import ParallelConfig from vllm.config import ParallelConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils import get_ip, is_hip from vllm.utils import get_ip, is_hip, is_xpu
from vllm.worker.worker_base import WorkerWrapperBase from vllm.worker.worker_base import WorkerWrapperBase
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -42,14 +42,26 @@ try: ...@@ -42,14 +42,26 @@ try:
output = pickle.dumps(output) output = pickle.dumps(output)
return output return output
ray_import_err = None
except ImportError as e: except ImportError as e:
logger.warning(
"Failed to import Ray with %r. For multi-node inference, "
"please install Ray with `pip install ray`.", e)
ray = None # type: ignore ray = None # type: ignore
ray_import_err = e
RayWorkerWrapper = None # type: ignore RayWorkerWrapper = None # type: ignore
def ray_is_available() -> bool:
"""Returns True if Ray is available."""
return ray is not None
def assert_ray_available():
"""Raise an exception if Ray is not available."""
if ray is None:
raise ValueError("Failed to import Ray, please install Ray with "
"`pip install ray`.") from ray_import_err
def initialize_ray_cluster( def initialize_ray_cluster(
parallel_config: ParallelConfig, parallel_config: ParallelConfig,
ray_address: Optional[str] = None, ray_address: Optional[str] = None,
...@@ -65,13 +77,10 @@ def initialize_ray_cluster( ...@@ -65,13 +77,10 @@ def initialize_ray_cluster(
ray_address: The address of the Ray cluster. If None, uses ray_address: The address of the Ray cluster. If None, uses
the default Ray cluster address. the default Ray cluster address.
""" """
if ray is None: assert_ray_available()
raise ImportError(
"Ray is not installed. Please install Ray to use multi-node "
"serving.")
# Connect to a ray cluster. # Connect to a ray cluster.
if is_hip(): if is_hip() or is_xpu():
ray.init(address=ray_address, ray.init(address=ray_address,
ignore_reinit_error=True, ignore_reinit_error=True,
num_gpus=parallel_config.world_size) num_gpus=parallel_config.world_size)
......
import asyncio
import os
import pickle
from collections import defaultdict
from itertools import islice, repeat
from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Set,
Tuple, Union)
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, MultiModalConfig, ParallelConfig,
PromptAdapterConfig, SchedulerConfig,
SpeculativeConfig)
from vllm.executor.distributed_gpu_executor import ( # yapf: disable
DistributedGPUExecutor, DistributedGPUExecutorAsync)
from vllm.executor.ray_utils import RayWorkerWrapper, ray
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
make_async)
if ray is not None:
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
if TYPE_CHECKING:
from ray.util.placement_group import PlacementGroup
logger = init_logger(__name__)
# If the env var is set, it uses the Ray's compiled DAG API
# which optimizes the control plane overhead.
# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0))
class RayXPUExecutor(DistributedGPUExecutor):
def __init__(
self,
model_config: ModelConfig,
cache_config: CacheConfig,
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
device_config: DeviceConfig,
load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
prompt_adapter_config: Optional[PromptAdapterConfig],
speculative_config: Optional[SpeculativeConfig],
) -> None:
assert device_config.device_type == "xpu"
assert (not speculative_config
), "Speculative decoding not yet supported for XPU backend"
self.model_config = model_config
self.cache_config = cache_config
self.load_config = load_config
self.lora_config = lora_config
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
self.device_config = device_config
self.multimodal_config = multimodal_config
self.prompt_adapter_config = prompt_adapter_config
placement_group = self.parallel_config.placement_group
# Disable Ray usage stats collection.
ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
if ray_usage != "1":
os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
# Create the parallel GPU workers.
self._init_workers_ray(placement_group)
# Profile the memory usage and initialize the cache.
self.forward_dag = None
if USE_RAY_COMPILED_DAG:
self.forward_dag = self._compiled_ray_dag()
# This is non-None when the execute model loop is running
# in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
# Updated by implementations that require additional args to be passed
# to the _run_workers execute_model call
self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {}
def _init_executor(self) -> None:
pass
def determine_num_available_blocks(self) -> Tuple[int, int]:
"""Determine the number of available KV blocks.
This invokes `determine_num_available_blocks` on each worker and takes
the min of the results, guaranteeing that the selected cache sizes are
compatible with all workers.
Returns:
- Tuple[num_gpu_blocks, num_cpu_blocks]
"""
# Get the maximum number of blocks that can be allocated on GPU and CPU.
num_blocks = self._run_workers("determine_num_available_blocks", )
# Since we use a shared centralized controller, we take the minimum
# number of blocks across all workers to make sure all the memory
# operators can be applied to all workers.
num_gpu_blocks = min(b[0] for b in num_blocks)
num_cpu_blocks = min(b[1] for b in num_blocks)
return num_gpu_blocks, num_cpu_blocks
def _init_workers_ray(self, placement_group: "PlacementGroup",
**ray_remote_kwargs):
if self.parallel_config.tensor_parallel_size == 1:
# For single GPU case, we use a ray worker with constrained memory.
num_gpus = self.cache_config.gpu_memory_utilization
else:
# Otherwise, the ray workers are allocated with a full GPU.
num_gpus = 1
# The driver dummy worker does not actually use any resources.
# It holds the resource for the driver worker.
self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
# The remaining workers are the actual ray actors.
self.workers: List[RayWorkerWrapper] = []
# Create the workers.
driver_ip = get_ip()
for bundle_id, bundle in enumerate(placement_group.bundle_specs):
if not bundle.get("GPU", 0):
continue
scheduling_strategy = PlacementGroupSchedulingStrategy(
placement_group=placement_group,
placement_group_capture_child_tasks=True,
placement_group_bundle_index=bundle_id,
)
worker = ray.remote(
num_cpus=0,
num_gpus=num_gpus,
scheduling_strategy=scheduling_strategy,
**ray_remote_kwargs,
)(RayWorkerWrapper).remote(
worker_module_name="vllm.worker.xpu_worker",
worker_class_name="XPUWorker",
trust_remote_code=self.model_config.trust_remote_code,
)
worker_ip = ray.get(worker.get_node_ip.remote())
if worker_ip == driver_ip and self.driver_dummy_worker is None:
# If the worker is on the same node as the driver, we use it
# as the resource holder for the driver process.
self.driver_dummy_worker = worker
self.driver_worker = RayWorkerWrapper(
worker_module_name="vllm.worker.xpu_worker",
worker_class_name="XPUWorker",
trust_remote_code=self.model_config.trust_remote_code,
)
else:
# Else, added to the list of workers.
self.workers.append(worker)
if self.driver_dummy_worker is None:
raise ValueError(
"Ray does not allocate any GPUs on the driver node. Consider "
"adjusting the Ray placement group or running the driver on a "
"GPU node.")
# Get the set of GPU IDs used on each node.
worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
use_dummy_driver=True)
node_workers = defaultdict(list)
node_gpus = defaultdict(list)
for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
node_workers[node_id].append(i)
node_gpus[node_id].extend(gpu_ids)
for node_id, gpu_ids in node_gpus.items():
node_gpus[node_id] = sorted(gpu_ids)
# TODO: add env var for xpu
distributed_init_method = get_distributed_init_method(
driver_ip, get_open_port())
def collect_arg_helper_func(**kwargs):
# avoid writing `{"name": value}` manually
return kwargs
init_worker_all_kwargs = []
# Initialize the actual workers inside worker wrapper.
for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids, ):
local_rank = node_workers[node_id].index(rank)
init_worker_all_kwargs.append(
collect_arg_helper_func(
model_config=self.model_config,
parallel_config=self.parallel_config,
scheduler_config=self.scheduler_config,
device_config=self.device_config,
cache_config=self.cache_config,
load_config=self.load_config,
local_rank=local_rank,
rank=rank,
distributed_init_method=distributed_init_method,
lora_config=self.lora_config,
multimodal_config=self.multimodal_config,
is_driver_worker=rank == 0,
))
self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
self._run_workers("init_device")
self._run_workers(
"load_model",
max_concurrent_workers=self.parallel_config.
max_parallel_loading_workers,
)
def initialize_cache(self, num_gpu_blocks: int,
num_cpu_blocks: int) -> None:
"""Initialize the KV cache in all workers.
"""
# NOTE: We log here to avoid multiple logs when number of workers is
# greater than one. We could log in the engine, but not all executors
# have GPUs.
logger.info("# GPU blocks: %d, "
"# CPU blocks: %d", num_gpu_blocks, num_cpu_blocks)
self.cache_config.num_gpu_blocks = num_gpu_blocks
self.cache_config.num_cpu_blocks = num_cpu_blocks
self._run_workers("initialize_cache",
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks)
def _driver_execute_model(
self,
execute_model_req: Optional[ExecuteModelRequest] = None
) -> List[SamplerOutput]:
"""Run execute_model in the driver worker.
Passing None will cause the driver to stop the model execution
loop running in each of the remote workers.
"""
return self.driver_worker.execute_method("execute_model",
execute_model_req)
def add_lora(self, lora_request: LoRARequest) -> bool:
assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
return self._run_workers(
"add_lora",
lora_request=lora_request,
)
def remove_lora(self, lora_id: int) -> bool:
assert lora_id > 0, "lora_id must be greater than 0."
return self._run_workers(
"remove_lora",
lora_id=lora_id,
)
def list_loras(self) -> Set[int]:
return self._run_workers("list_loras")
def _run_workers(
self,
method: str,
*args,
async_run_remote_workers_only: bool = False,
all_args: Optional[List[Tuple[Any, ...]]] = None,
all_kwargs: Optional[List[Dict[str, Any]]] = None,
use_dummy_driver: bool = False,
max_concurrent_workers: Optional[int] = None,
use_ray_compiled_dag: bool = False,
**kwargs,
) -> Any:
"""Runs the given method on all workers. Can be used in the following
ways:
- args/kwargs: All workers share the same args/kwargs
- args/kwargs and driver_args/driver_kwargs: Driver worker has
different args
- all_args/all_kwargs: args/kwargs for each worker are specified
individually
"""
if max_concurrent_workers:
raise NotImplementedError(
"max_concurrent_workers is not supported yet.")
count = len(self.workers)
all_worker_args = repeat(args, count) if all_args is None \
else islice(all_args, 1, None)
all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
else islice(all_kwargs, 1, None)
if use_ray_compiled_dag:
# Right now, compiled DAG can only accept a single
# input. TODO(sang): Fix it.
assert self.forward_dag is not None
output_channels = self.forward_dag.execute(1)
else:
# Start the ray workers first.
ray_worker_outputs = [
worker.execute_method.remote(method, *worker_args,
**worker_kwargs)
for (worker, worker_args, worker_kwargs
) in zip(self.workers, all_worker_args, all_worker_kwargs)
]
if async_run_remote_workers_only:
# Just return futures
return ray_worker_outputs
driver_args = args if all_args is None else all_args[0]
driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
# Start the driver worker after all the ray workers.
if not use_dummy_driver:
driver_worker_output = self.driver_worker.execute_method(
method, *driver_args, **driver_kwargs)
else:
assert self.driver_dummy_worker is not None
driver_worker_output = ray.get(
self.driver_dummy_worker.execute_method.remote(
method, *driver_args, **driver_kwargs))
# Get the results of the ray workers.
if self.workers:
if use_ray_compiled_dag:
try:
ray_worker_outputs = [
pickle.loads(chan.begin_read())
for chan in output_channels
]
finally:
# Has to call end_read in order to reuse the DAG.
for chan in output_channels:
chan.end_read()
else:
ray_worker_outputs = ray.get(ray_worker_outputs)
return [driver_worker_output] + ray_worker_outputs
def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
"""Wait for futures returned from _run_workers() with
async_run_remote_workers_only to complete."""
ray.get(parallel_worker_tasks)
def _compiled_ray_dag(self):
import pkg_resources
required_version = "2.9"
current_version = pkg_resources.get_distribution("ray").version
if current_version < required_version:
raise ValueError(f"Ray version {required_version} or greater is "
f"required, but found {current_version}")
from ray.dag import InputNode, MultiOutputNode
assert self.parallel_config.worker_use_ray
# Right now, compiled DAG requires at least 1 arg. We send
# a dummy value for now. It will be fixed soon.
with InputNode() as input_data:
forward_dag = MultiOutputNode([
worker.execute_model_compiled_dag_remote.
bind( # type: ignore[attr-defined]
input_data) for worker in self.workers
])
return forward_dag.experimental_compile()
def check_health(self) -> None:
"""Raises an error if engine is unhealthy."""
self._check_if_any_actor_is_dead()
def _check_if_any_actor_is_dead(self):
if not self.workers:
return
dead_actors = []
for actor in self.workers:
actor_state = ray.state.actors(actor._ray_actor_id.hex()) # pylint: disable=protected-access
if actor_state["State"] == "DEAD":
dead_actors.append(actor)
if dead_actors:
raise RuntimeError("At least one Worker is dead. "
f"Dead Workers: {dead_actors}. ")
class RayXPUExecutorAsync(RayXPUExecutor, DistributedGPUExecutorAsync):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.driver_exec_method = make_async(self.driver_worker.execute_method)
async def _driver_execute_model_async(
self,
execute_model_req: Optional[ExecuteModelRequest] = None
) -> List[SamplerOutput]:
return await self.driver_exec_method("execute_model",
execute_model_req)
async def _start_worker_execution_loop(self):
coros = [
worker.execute_method.remote("start_worker_execution_loop")
for worker in self.workers
]
return await asyncio.gather(*coros)
from typing import Any, Dict, List, Optional, Set, Tuple
import torch
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
make_async)
logger = init_logger(__name__)
class TPUExecutor(ExecutorBase):
def _init_executor(self) -> None:
assert not self.scheduler_config.chunked_prefill_enabled, (
"Chunked prefill is not yet supported for TPU backend")
assert not self.speculative_config, (
"Speculative decoding is not yet supported for TPU backend")
if self.model_config.dtype in (torch.float16, torch.float32):
logger.warning(
"The TPU backend currently does not support %s. "
"Using bfloat16 instead.", self.model_config.dtype)
self.model_config.dtype = torch.bfloat16
# Instantiate the worker and load the model to the device.
self.driver_worker = self._create_worker()
self.driver_worker.init_device()
self.driver_worker.load_model()
def _get_worker_kwargs(
self,
local_rank: int = 0,
rank: int = 0,
distributed_init_method: Optional[str] = None,
) -> Dict[str, Any]:
"""Return worker init args for a given rank."""
if distributed_init_method is None:
distributed_init_method = get_distributed_init_method(
get_ip(), get_open_port())
return dict(
model_config=self.model_config,
parallel_config=self.parallel_config,
scheduler_config=self.scheduler_config,
device_config=self.device_config,
cache_config=self.cache_config,
load_config=self.load_config,
local_rank=local_rank,
rank=rank,
distributed_init_method=distributed_init_method,
multimodal_config=self.multimodal_config,
is_driver_worker=rank == 0,
)
def _create_worker(
self,
local_rank: int = 0,
rank: int = 0,
distributed_init_method: Optional[str] = None,
):
from vllm.worker.tpu_worker import TPUWorker
worker = TPUWorker(**self._get_worker_kwargs(local_rank, rank,
distributed_init_method))
return worker
def initialize_cache(
self,
num_gpu_blocks: int,
num_cpu_blocks: int,
) -> None:
"""Initialize the KV cache by invoking the underlying worker."""
# NOTE: This is logged in the executor because there can be >1 worker
# with other executors. We could log in the engine level, but work
# remains to abstract away the device for non-GPU configurations.
logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
num_cpu_blocks)
self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
def determine_num_available_blocks(self) -> Tuple[int, int]:
"""Determine the number of available KV blocks by invoking the
underlying worker."""
return self.driver_worker.determine_num_available_blocks()
def execute_model(
self,
execute_model_req: ExecuteModelRequest,
) -> List[SamplerOutput]:
output = self.driver_worker.execute_model(execute_model_req)
return output
def add_lora(self, lora_request: LoRARequest) -> bool:
raise NotImplementedError(
"LoRA is currently not supported by the TPU backend.")
def remove_lora(self, lora_id: int) -> bool:
raise NotImplementedError(
"LoRA is currently not supported by the TPU backend.")
def pin_lora(self, lora_id: int) -> bool:
raise NotImplementedError(
"LoRA is currently not supported by the TPU backend.")
def list_loras(self) -> Set[int]:
raise NotImplementedError(
"LoRA is currently not supported by the TPU backend.")
def add_prompt_adapter(self, prompt_adapter_request) -> bool:
raise NotImplementedError(
"Soft prompt is currently not supported by the TPU backend.")
def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
raise NotImplementedError(
"Soft prompt is currently not supported by the TPU backend.")
def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
raise NotImplementedError(
"Soft prompt is currently not supported by the TPU backend.")
def list_prompt_adapters(self) -> Set[int]:
raise NotImplementedError(
"Soft prompt is currently not supported by the TPU backend.")
def check_health(self) -> None:
# TPUExecutor will always be healthy as long as it's running.
return
class TPUExecutorAsync(TPUExecutor, ExecutorAsyncBase):
async def execute_model_async(
self,
sexecute_model_req: ExecuteModelRequest,
) -> SamplerOutput:
output = await make_async(self.driver_worker.execute_model
)(sexecute_model_req)
return output
from typing import List, Optional
import torch
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, MultiModalConfig, ParallelConfig,
PromptAdapterConfig, SchedulerConfig,
SpeculativeConfig)
from vllm.executor.executor_base import ExecutorAsyncBase
from vllm.executor.gpu_executor import GPUExecutor
from vllm.logger import init_logger
from vllm.sequence import ExecuteModelRequest, SamplerOutput
from vllm.utils import make_async
from vllm.worker.worker_base import WorkerWrapperBase
logger = init_logger(__name__)
class XPUExecutor(GPUExecutor):
def __init__(
self,
model_config: ModelConfig,
cache_config: CacheConfig,
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
device_config: DeviceConfig,
load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
prompt_adapter_config: Optional[PromptAdapterConfig],
speculative_config: Optional[SpeculativeConfig],
) -> None:
assert device_config.device_type == "xpu"
assert (not speculative_config
), "Speculative decoding not yet supported for XPU backend"
model_config = _verify_and_get_model_config(model_config)
self.model_config = model_config
self.cache_config = cache_config
self.load_config = load_config
self.lora_config = lora_config
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
self.device_config = device_config
self.multimodal_config = multimodal_config
self.prompt_adapter_config = prompt_adapter_config
self.speculative_config = None
# Instantiate the worker and load the model to GPU.
self._init_executor()
def _create_worker(self,
local_rank: int = 0,
rank: int = 0,
distributed_init_method: Optional[str] = None):
if self.speculative_config is None:
worker_module_name = "vllm.worker.xpu_worker"
worker_class_name = "XPUWorker"
else:
raise NotImplementedError(
"XPU does not support speculative decoding")
wrapper = WorkerWrapperBase(
worker_module_name=worker_module_name,
worker_class_name=worker_class_name,
)
wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank,
distributed_init_method))
return wrapper.worker
def execute_model(
self,
execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
output = self.driver_worker.execute_model(execute_model_req)
return output
class XPUExecutorAsync(XPUExecutor, ExecutorAsyncBase):
async def execute_model_async(
self,
execute_model_req: ExecuteModelRequest,
) -> List[SamplerOutput]:
output = await make_async(self.driver_worker.execute_model
)(execute_model_req=execute_model_req)
return output
def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
if config.dtype == torch.bfloat16:
logger.warning(
"bfloat16 is not fully supported on XPU, casting to float16.")
config.dtype = torch.float16
if not config.enforce_eager:
logger.warning(
"CUDA graph is not supported on XPU, fallback to the eager "
"mode.")
config.enforce_eager = True
return config
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment