Merge tag 'v0.5.2' into v0.5.2-dtk24.04.1

705f6a35 · zhuwenwen · af837396 · 4cf256ae · 705f6a35 · 705f6a35
Commit 705f6a35 authored Jul 16, 2024 by zhuwenwen
20 changed files
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -9,7 +9,9 @@ import json
 import ssl
 from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
-from vllm.entrypoints.openai.serving_engine import LoRAModulePath
+from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
+                                                    PromptAdapterPath)
+from vllm.utils import FlexibleArgumentParser
 class LoRAParserAction(argparse.Action):
@@ -22,9 +24,17 @@ class LoRAParserAction(argparse.Action):
        setattr(namespace, self.dest, lora_list)
-def make_arg_parser():
+class PromptAdapterParserAction(argparse.Action):
-    parser = argparse.ArgumentParser(
-        description="vLLM OpenAI-Compatible RESTful API server.")
+    def __call__(self, parser, namespace, values, option_string=None):
+        adapter_list = []
+        for item in values:
+            name, path = item.split('=')
+            adapter_list.append(PromptAdapterPath(name, path))
+        setattr(namespace, self.dest, adapter_list)
+def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
    parser.add_argument("--host",
                        type=nullable_str,
                        default=None,
@@ -64,6 +74,14 @@ def make_arg_parser():
        action=LoRAParserAction,
        help="LoRA module configurations in the format name=path. "
        "Multiple modules can be specified.")
+    parser.add_argument(
+        "--prompt-adapters",
+        type=nullable_str,
+        default=None,
+        nargs='+',
+        action=PromptAdapterParserAction,
+        help="Prompt adapter configurations in the format name=path. "
+        "Multiple adapters can be specified.")
    parser.add_argument("--chat-template",
                        type=nullable_str,
                        default=None,
@@ -113,3 +131,9 @@ def make_arg_parser():
    parser = AsyncEngineArgs.add_cli_args(parser)
    return parser
+def create_parser_for_docs() -> FlexibleArgumentParser:
+    parser_for_docs = FlexibleArgumentParser(
+        prog="-m vllm.entrypoints.openai.api_server")
+    return make_arg_parser(parser_for_docs)
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -103,7 +103,8 @@ class ResponseFormat(OpenAIBaseModel):
 class StreamOptions(OpenAIBaseModel):
-    include_usage: Optional[bool]
+    include_usage: Optional[bool] = True
+    continuous_usage_stats: Optional[bool] = True
 class FunctionDefinition(OpenAIBaseModel):
@@ -190,6 +191,27 @@ class ChatCompletionRequest(OpenAIBaseModel):
            "special tokens so this should be set to False (as is the "
            "default)."),
    )
+    documents: Optional[List[Dict[str, str]]] = Field(
+        default=None,
+        description=
+        ("A list of dicts representing documents that will be accessible to "
+         "the model if it is performing RAG (retrieval-augmented generation)."
+         " If the template does not support RAG, this argument will have no "
+         "effect. We recommend that each document should be a dict containing "
+         "\"title\" and \"text\" keys."),
+    )
+    chat_template: Optional[str] = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "If this is not passed, the model's default chat template will be "
+            "used instead."),
+    )
+    chat_template_kwargs: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description=("Additional kwargs to pass to the template renderer. "
+                     "Will be accessible by the chat template."),
+    )
    include_stop_str_in_output: Optional[bool] = Field(
        default=False,
        description=(
@@ -234,15 +256,22 @@ class ChatCompletionRequest(OpenAIBaseModel):
        logits_processors = None
        if self.logit_bias:
+            logit_bias: Dict[int, float] = {}
+            try:
+                for token_id, bias in self.logit_bias.items():
+                    # Convert token_id to integer before we add to LLMEngine
+                    # Clamp the bias between -100 and 100 per OpenAI API spec
+                    logit_bias[int(token_id)] = min(100, max(-100, bias))
+            except ValueError as exc:
+                raise ValueError(f"Found token_id `{token_id}` in logit_bias "
+                                 f"but token_id must be an integer or string "
+                                 f"representing an integer") from exc
            def logit_bias_logits_processor(
                    token_ids: List[int],
                    logits: torch.Tensor) -> torch.Tensor:
-                assert self.logit_bias is not None
+                for token_id, bias in logit_bias.items():
-                for token_id, bias in self.logit_bias.items():
+                    logits[token_id] += bias
-                    # Clamp the bias between -100 and 100 per OpenAI API spec
-                    bias = min(100, max(-100, bias))
-                    logits[int(token_id)] += bias
                return logits
            logits_processors = [logit_bias_logits_processor]
@@ -419,15 +448,22 @@ class CompletionRequest(OpenAIBaseModel):
        logits_processors = None
        if self.logit_bias:
+            logit_bias: Dict[int, float] = {}
+            try:
+                for token_id, bias in self.logit_bias.items():
+                    # Convert token_id to integer
+                    # Clamp the bias between -100 and 100 per OpenAI API spec
+                    logit_bias[int(token_id)] = min(100, max(-100, bias))
+            except ValueError as exc:
+                raise ValueError(f"Found token_id `{token_id}` in logit_bias "
+                                 f"but token_id must be an integer or string "
+                                 f"representing an integer") from exc
            def logit_bias_logits_processor(
                    token_ids: List[int],
                    logits: torch.Tensor) -> torch.Tensor:
-                assert self.logit_bias is not None
+                for token_id, bias in logit_bias.items():
-                for token_id, bias in self.logit_bias.items():
+                    logits[token_id] += bias
-                    # Clamp the bias between -100 and 100 per OpenAI API spec
-                    bias = min(100, max(-100, bias))
-                    logits[int(token_id)] += bias
                return logits
            logits_processors = [logit_bias_logits_processor]
@@ -566,7 +602,7 @@ class CompletionStreamResponse(OpenAIBaseModel):
 class EmbeddingResponseData(BaseModel):
    index: int
    object: str = "embedding"
-    embedding: List[float]
+    embedding: Union[List[float], str]
 class EmbeddingResponse(BaseModel):
@@ -672,6 +708,17 @@ class BatchRequestInput(OpenAIBaseModel):
    body: Union[ChatCompletionRequest, ]
+class BatchResponseData(OpenAIBaseModel):
+    # HTTP status code of the response.
+    status_code: int = 200
+    # An unique identifier for the API request.
+    request_id: str
+    # The body of the response.
+    body: Optional[ChatCompletionResponse] = None
 class BatchRequestOutput(OpenAIBaseModel):
    """
    The per-line object of the batch output and error files
@@ -683,8 +730,29 @@ class BatchRequestOutput(OpenAIBaseModel):
    # inputs.
    custom_id: str
-    response: Optional[ChatCompletionResponse]
+    response: Optional[BatchResponseData]
    # For requests that failed with a non-HTTP error, this will contain more
    # information on the cause of the failure.
    error: Optional[Any]
+class TokenizeRequest(OpenAIBaseModel):
+    model: str
+    prompt: str
+    add_special_tokens: bool = Field(default=True)
+class TokenizeResponse(OpenAIBaseModel):
+    tokens: List[int]
+    count: int
+    max_model_len: int
+class DetokenizeRequest(OpenAIBaseModel):
+    model: str
+    tokens: List[int]
+class DetokenizeResponse(OpenAIBaseModel):
+    prompt: str
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
-import argparse
 import asyncio
-import sys
 from io import StringIO
+from typing import Awaitable, List
 import aiohttp
-import vllm
 from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import (BatchRequestInput,
                                              BatchRequestOutput,
-                                              ChatCompletionResponse)
+                                              BatchResponseData,
+                                              ChatCompletionResponse,
+                                              ErrorResponse)
 from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import random_uuid
+from vllm.utils import FlexibleArgumentParser, random_uuid
+from vllm.version import __version__ as VLLM_VERSION
 logger = init_logger(__name__)
 def parse_args():
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
        description="vLLM OpenAI-Compatible batch runner.")
    parser.add_argument(
        "-i",
@@ -55,7 +56,7 @@ async def read_file(path_or_url: str) -> str:
                   session.get(path_or_url) as resp:
            return await resp.text()
    else:
-        with open(path_or_url, "r") as f:
+        with open(path_or_url, "r", encoding="utf-8") as f:
            return f.read()
@@ -68,7 +69,7 @@ async def write_file(path_or_url: str, data: str) -> None:
        # We should make this async, but as long as this is always run as a
        # standalone program, blocking the event loop won't effect performance
        # in this particular case.
-        with open(path_or_url, "w") as f:
+        with open(path_or_url, "w", encoding="utf-8") as f:
            f.write(data)
@@ -76,20 +77,27 @@ async def run_request(chat_serving: OpenAIServingChat,
                      request: BatchRequestInput) -> BatchRequestOutput:
    chat_request = request.body
    chat_response = await chat_serving.create_chat_completion(chat_request)
    if isinstance(chat_response, ChatCompletionResponse):
        batch_output = BatchRequestOutput(
            id=f"vllm-{random_uuid()}",
            custom_id=request.custom_id,
-            response=chat_response,
+            response=BatchResponseData(
+                body=chat_response, request_id=f"vllm-batch-{random_uuid()}"),
            error=None,
        )
-    else:
+    elif isinstance(chat_response, ErrorResponse):
        batch_output = BatchRequestOutput(
            id=f"vllm-{random_uuid()}",
            custom_id=request.custom_id,
-            response=None,
+            response=BatchResponseData(
+                status_code=chat_response.code,
+                request_id=f"vllm-batch-{random_uuid()}"),
            error=chat_response,
        )
+    else:
+        raise ValueError("Request must not be sent in stream mode")
    return batch_output
@@ -114,7 +122,7 @@ async def main(args):
    )
    # Submit all requests in the file to the engine "concurrently".
-    response_futures = []
+    response_futures: List[Awaitable[BatchRequestOutput]] = []
    for request_json in (await read_file(args.input_file)).strip().split("\n"):
        request = BatchRequestInput.model_validate_json(request_json)
        response_futures.append(run_request(openai_serving_chat, request))
@@ -128,14 +136,11 @@ async def main(args):
    output_buffer.seek(0)
    await write_file(args.output_file, output_buffer.read().strip())
-    # Temporary workaround for https://github.com/vllm-project/vllm/issues/4789
-    sys.exit(0)
 if __name__ == "__main__":
    args = parse_args()
-    logger.info("vLLM API server version %s", vllm.__version__)
+    logger.info("vLLM API server version %s", VLLM_VERSION)
    logger.info("args: %s", args)
    asyncio.run(main(args))
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
 import codecs
 import time
 from dataclasses import dataclass, field
+from functools import cached_property
 from typing import (AsyncGenerator, AsyncIterator, Awaitable, Dict, Iterable,
                    List, Optional)
 from typing import Sequence as GenericSequence
@@ -10,7 +11,7 @@ from fastapi import Request
 from openai.types.chat import (ChatCompletionContentPartImageParam,
                               ChatCompletionContentPartTextParam)
-from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.config import ModelConfig
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import (
    ChatCompletionContentPartParam, ChatCompletionLogProb,
@@ -26,11 +27,12 @@ from vllm.inputs import PromptInputs
 from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding import (
    get_guided_decoding_logits_processor)
-from vllm.multimodal.image import ImagePixelData
+from vllm.multimodal import MultiModalDataDict
-from vllm.multimodal.utils import (async_get_and_parse_image,
+from vllm.multimodal.utils import async_get_and_parse_image
-                                   get_full_image_text_prompt)
 from vllm.outputs import RequestOutput
 from vllm.sequence import Logprob
+from vllm.tracing import (contains_trace_headers, extract_trace_headers,
+                          log_tracing_disabled_warning)
 from vllm.utils import random_uuid
 logger = init_logger(__name__)
@@ -45,7 +47,7 @@ class ConversationMessage(TypedDict):
 @dataclass(frozen=True)
 class ChatMessageParseResult:
    messages: List[ConversationMessage]
-    image_futures: List[Awaitable[ImagePixelData]] = field(
+    mm_futures: List[Awaitable[MultiModalDataDict]] = field(
        default_factory=list)
@@ -95,82 +97,85 @@ class OpenAIServingChat(OpenAIServing):
            logger.warning(
                "No chat template provided. Chat API will not work.")
+    @cached_property
+    def image_token_str(self) -> Optional[str]:
+        # TODO: Let user specify how to insert image tokens into prompt
+        # (similar to chat template)
+        model_type = self.model_config.hf_config.model_type
+        if model_type == "phi3_v":
+            # Workaround since this token is not defined in the tokenizer
+            return "<|image_1|>"
+        if model_type in ("blip-2", "chatglm", "fuyu", "minicpmv",
+                          "paligemma"):
+            # These models do not use image tokens in the prompt
+            return None
+        if model_type.startswith("llava"):
+            return self.tokenizer.decode(
+                self.model_config.hf_config.image_token_index)
+        else:
+            raise TypeError("Unknown model type: {model_type}")
+    # TODO: Let user specify how to insert image tokens into prompt
+    # (similar to chat template)
+    def _get_full_image_text_prompt(self, image_token_str: str,
+                                    text_prompt: str) -> str:
+        """Combine image and text prompts for vision language model"""
+        # NOTE: For now we assume all model architectures use the same
+        # image + text prompt format. This may change in the future.
+        return f"{image_token_str}\n{text_prompt}"
    def _parse_chat_message_content_parts(
        self,
        role: str,
        parts: Iterable[ChatCompletionContentPartParam],
    ) -> ChatMessageParseResult:
        texts: List[str] = []
-        image_futures: List[Awaitable[ImagePixelData]] = []
+        mm_futures: List[Awaitable[MultiModalDataDict]] = []
-        vlm_config: Optional[VisionLanguageConfig] = getattr(
-            self.engine.engine, "vision_language_config", None)
-        model_config = getattr(self.engine.engine, "model_config", None)
        for part in parts:
            part_type = part["type"]
            if part_type == "text":
                text = cast(ChatCompletionContentPartTextParam, part)["text"]
                texts.append(text)
            elif part_type == "image_url":
-                if vlm_config is None:
+                if len(mm_futures) > 0:
-                    raise ValueError(
-                        "'image_url' input is not supported as the loaded "
-                        "model is not multimodal.")
-                elif len(image_futures) == 0:
-                    assert self.tokenizer is not None
-                    image_url = cast(ChatCompletionContentPartImageParam,
-                                     part)["image_url"]
-                    if image_url.get("detail", "auto") != "auto":
-                        logger.warning(
-                            "'image_url.detail' is currently not supported and "
-                            "will be ignored.")
-                    image_future = async_get_and_parse_image(image_url["url"])
-                    image_futures.append(image_future)
-                else:
                    raise NotImplementedError(
                        "Multiple 'image_url' input is currently not supported."
                    )
+                image_url = cast(ChatCompletionContentPartImageParam,
+                                 part)["image_url"]
+                if image_url.get("detail", "auto") != "auto":
+                    logger.warning(
+                        "'image_url.detail' is currently not supported and "
+                        "will be ignored.")
+                image_future = async_get_and_parse_image(image_url["url"])
+                mm_futures.append(image_future)
            else:
                raise NotImplementedError(f"Unknown part type: {part_type}")
        text_prompt = "\n".join(texts)
-        if vlm_config is not None and len(image_futures):
+        if mm_futures:
+            image_token_str = self.image_token_str
-            (image_token_prompt,
+            if image_token_str is not None:
-             image_token_str) = vlm_config.get_image_token_text(self.tokenizer)
+                if image_token_str in text_prompt:
+                    logger.warning(
-            # NOTE: If image token string (e.g, <image>) is already present
+                        "Detected image token string in the text prompt. "
-            # in the text prompt, we assume it follows the same format required
+                        "Skipping prompt formatting.")
-            # by the engine.
+                else:
-            if image_token_str in text_prompt:
+                    text_prompt = self._get_full_image_text_prompt(
-                logger.warning(
+                        image_token_str=image_token_str,
-                    "Detected image token string in the text prompt. "
+                        text_prompt=text_prompt,
-                    "Skipping prompt formatting.")
+                    )
-                messages = [
-                    ConversationMessage(role=role, content=text_prompt)
-                ]
-            else:
+        messages = [ConversationMessage(role=role, content=text_prompt)]
-                full_prompt = get_full_image_text_prompt(
-                    image_prompt=image_token_prompt,
-                    text_prompt=text_prompt,
-                    config=model_config)
-                messages = [
-                    ConversationMessage(role=role, content=full_prompt)
-                ]
-        else:
-            messages = [ConversationMessage(role=role, content=text_prompt)]
-        return ChatMessageParseResult(messages=messages,
+        return ChatMessageParseResult(messages=messages, mm_futures=mm_futures)
-                                      image_futures=image_futures)
    def _parse_chat_message_content(
        self,
@@ -180,10 +185,10 @@ class OpenAIServingChat(OpenAIServing):
        content = message.get("content")
        if content is None:
-            return ChatMessageParseResult(messages=[], image_futures=[])
+            return ChatMessageParseResult(messages=[], mm_futures=[])
        if isinstance(content, str):
            messages = [ConversationMessage(role=role, content=content)]
-            return ChatMessageParseResult(messages=messages, image_futures=[])
+            return ChatMessageParseResult(messages=messages, mm_futures=[])
        return self._parse_chat_message_content_parts(role, content)
@@ -208,32 +213,41 @@ class OpenAIServingChat(OpenAIServing):
        try:
            conversation: List[ConversationMessage] = []
-            image_futures: List[Awaitable[ImagePixelData]] = []
+            mm_futures: List[Awaitable[MultiModalDataDict]] = []
            for msg in request.messages:
                chat_parsed_result = self._parse_chat_message_content(msg)
                conversation.extend(chat_parsed_result.messages)
-                image_futures.extend(chat_parsed_result.image_futures)
+                mm_futures.extend(chat_parsed_result.mm_futures)
+            tool_dicts = None if request.tools is None else [
+                tool.model_dump() for tool in request.tools
+            ]
            prompt = self.tokenizer.apply_chat_template(
                conversation=conversation,
                tokenize=False,
                add_generation_prompt=request.add_generation_prompt,
+                tools=tool_dicts,
+                documents=request.documents,
+                chat_template=request.chat_template,
+                **(request.chat_template_kwargs or {}),
            )
        except Exception as e:
            logger.error("Error in applying chat template from request: %s", e)
            return self.create_error_response(str(e))
-        # Fetch image data
+        mm_data: Optional[MultiModalDataDict] = None
-        image_data: Optional[ImagePixelData] = None
        try:
-            if len(image_futures):
+            if len(mm_futures):
-                # since we support only single image currently
+                # since we support only single mm data currently
-                assert len(image_futures) == 1
+                assert len(
-                image_data = await image_futures[0]
+                    mm_futures
+                ) == 1, "Multiple 'image_url' input is currently not supported."
+                mm_data = await mm_futures[0]
        except Exception as e:
-            logger.error("Error in loading image data: %s", e)
+            logger.error("Error in loading multi-modal data: %s", e)
            return self.create_error_response(str(e))
        request_id = f"cmpl-{random_uuid()}"
@@ -244,7 +258,7 @@ class OpenAIServingChat(OpenAIServing):
                prompt=prompt,
                add_special_tokens=request.add_special_tokens)
            sampling_params = request.to_sampling_params()
-            lora_request = self._maybe_get_lora(request)
+            _, lora_request = self._maybe_get_adapter(request)
            decoding_config = await self.engine.get_decoding_config()
            guided_decoding_backend = request.guided_decoding_backend \
                or decoding_config.guided_decoding_backend
@@ -264,14 +278,23 @@ class OpenAIServingChat(OpenAIServing):
            "prompt": prompt_text,
            "prompt_token_ids": prompt_ids,
        }
-        if image_data is not None:
+        if mm_data:
-            inputs["multi_modal_data"] = image_data
+            inputs["multi_modal_data"] = mm_data
+        is_tracing_enabled = await self.engine.is_tracing_enabled()
+        trace_headers = None
+        if is_tracing_enabled and raw_request:
+            trace_headers = extract_trace_headers(raw_request.headers)
+        if not is_tracing_enabled and raw_request and contains_trace_headers(
+                raw_request.headers):
+            log_tracing_disabled_warning()
        result_generator = self.engine.generate(
            inputs,
            sampling_params,
            request_id,
            lora_request,
+            trace_headers=trace_headers,
        )
        # Streaming response
        if request.stream:
@@ -487,7 +510,7 @@ class OpenAIServingChat(OpenAIServing):
            final_res = res
        assert final_res is not None
-        choices = []
+        choices: List[ChatCompletionResponseChoice] = []
        role = self.get_chat_request_role(request)
        for output in final_res.outputs:

--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -16,14 +16,21 @@ from vllm.entrypoints.openai.protocol import (CompletionLogProbs,
                                              CompletionResponseChoice,
                                              CompletionResponseStreamChoice,
                                              CompletionStreamResponse,
-                                              UsageInfo)
+                                              DetokenizeRequest,
+                                              DetokenizeResponse,
+                                              TokenizeRequest,
+                                              TokenizeResponse, UsageInfo)
+# yapf: enable
 from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
-                                                    OpenAIServing)
+                                                    OpenAIServing,
+                                                    PromptAdapterPath)
 from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding import (
    get_guided_decoding_logits_processor)
 from vllm.outputs import RequestOutput
 from vllm.sequence import Logprob
+from vllm.tracing import (contains_trace_headers, extract_trace_headers,
+                          log_tracing_disabled_warning)
 from vllm.utils import merge_async_iterators, random_uuid
 logger = init_logger(__name__)
@@ -61,11 +68,13 @@ class OpenAIServingCompletion(OpenAIServing):
    def __init__(self, engine: AsyncLLMEngine, model_config: ModelConfig,
                 served_model_names: List[str],
-                 lora_modules: Optional[List[LoRAModulePath]]):
+                 lora_modules: Optional[List[LoRAModulePath]],
+                 prompt_adapters: Optional[List[PromptAdapterPath]]):
        super().__init__(engine=engine,
                         model_config=model_config,
                         served_model_names=served_model_names,
-                         lora_modules=lora_modules)
+                         lora_modules=lora_modules,
+                         prompt_adapters=prompt_adapters)
    async def create_completion(self, request: CompletionRequest,
                                raw_request: Request):
@@ -95,7 +104,12 @@ class OpenAIServingCompletion(OpenAIServing):
        generators: List[AsyncIterator[RequestOutput]] = []
        try:
            sampling_params = request.to_sampling_params()
-            lora_request = self._maybe_get_lora(request)
+            adapter_type, adapter_request = self._maybe_get_adapter(request)
+            lora_request, prompt_adapter_request = None, None
+            if adapter_type == 'LoRA':
+                lora_request, prompt_adapter_request = adapter_request, None
+            elif adapter_type == 'PromptAdapter':
+                lora_request, prompt_adapter_request = None, adapter_request
            decoding_config = await self.engine.get_decoding_config()
            guided_decoding_backend = request.guided_decoding_backend \
                or decoding_config.guided_decoding_backend
@@ -125,6 +139,14 @@ class OpenAIServingCompletion(OpenAIServing):
                        truncate_prompt_tokens)
                prompt_ids, prompt_text = prompt_formats
+                is_tracing_enabled = await self.engine.is_tracing_enabled()
+                trace_headers = None
+                if is_tracing_enabled:
+                    trace_headers = extract_trace_headers(raw_request.headers)
+                if not is_tracing_enabled and contains_trace_headers(
+                        raw_request.headers):
+                    log_tracing_disabled_warning()
                generator = self.engine.generate(
                    {
                        "prompt": prompt_text,
@@ -133,6 +155,8 @@ class OpenAIServingCompletion(OpenAIServing):
                    sampling_params,
                    f"{request_id}-{i}",
                    lora_request=lora_request,
+                    prompt_adapter_request=prompt_adapter_request,
+                    trace_headers=trace_headers,
                )
                generators.append(generator)
@@ -256,16 +280,6 @@ class OpenAIServingCompletion(OpenAIServing):
                    previous_num_tokens[i] = len(output.token_ids)
                    finish_reason = output.finish_reason
                    stop_reason = output.stop_reason
-                    if output.finish_reason is not None:  # return final usage
-                        prompt_tokens = len(res.prompt_token_ids)
-                        completion_tokens = len(output.token_ids)
-                        final_usage = UsageInfo(
-                            prompt_tokens=prompt_tokens,
-                            completion_tokens=completion_tokens,
-                            total_tokens=prompt_tokens + completion_tokens,
-                        )
-                    else:
-                        final_usage = None
                    chunk = CompletionStreamResponse(
                        id=request_id,
@@ -282,9 +296,21 @@ class OpenAIServingCompletion(OpenAIServing):
                        ])
                    if (request.stream_options
                            and request.stream_options.include_usage):
-                        chunk.usage = None
+                        if (request.stream_options.continuous_usage_stats
+                                or output.finish_reason is not None):
+                            prompt_tokens = len(res.prompt_token_ids)
+                            completion_tokens = len(output.token_ids)
+                            usage = UsageInfo(
+                                prompt_tokens=prompt_tokens,
+                                completion_tokens=completion_tokens,
+                                total_tokens=prompt_tokens + completion_tokens,
+                            )
+                        if request.stream_options.continuous_usage_stats:
+                            chunk.usage = usage
+                        else:
+                            chunk.usage = None
-                    response_json = chunk.model_dump_json(exclude_unset=True)
+                    response_json = chunk.model_dump_json(exclude_unset=False)
                    yield f"data: {response_json}\n\n"
            if (request.stream_options
@@ -294,10 +320,10 @@ class OpenAIServingCompletion(OpenAIServing):
                    created=created_time,
                    model=model_name,
                    choices=[],
-                    usage=final_usage,
+                    usage=usage,
                )
                final_usage_data = (final_usage_chunk.model_dump_json(
-                    exclude_unset=True, exclude_none=True))
+                    exclude_unset=False, exclude_none=True))
                yield f"data: {final_usage_data}\n\n"
        except ValueError as e:
@@ -330,7 +356,7 @@ class OpenAIServingCompletion(OpenAIServing):
                    out_logprobs = prompt_logprobs
                    output_text = prompt_text
                elif request.echo and request.max_tokens > 0:
-                    token_ids = prompt_token_ids + output.token_ids
+                    token_ids = prompt_token_ids + list(output.token_ids)
                    out_logprobs = (prompt_logprobs + output.logprobs
                                    if request.logprobs is not None else None)
                    output_text = prompt_text + output.text
@@ -431,3 +457,29 @@ class OpenAIServingCompletion(OpenAIServing):
            tokens=out_tokens,
            top_logprobs=out_top_logprobs,
        )
+    async def create_tokenize(self,
+                              request: TokenizeRequest) -> TokenizeResponse:
+        error_check_ret = await self._check_model(request)
+        if error_check_ret is not None:
+            return error_check_ret
+        (input_ids, input_text) = self._validate_prompt_and_tokenize(
+            request,
+            prompt=request.prompt,
+            add_special_tokens=request.add_special_tokens)
+        return TokenizeResponse(tokens=input_ids,
+                                count=len(input_ids),
+                                max_model_len=self.max_model_len)
+    async def create_detokenize(
+            self, request: DetokenizeRequest) -> DetokenizeResponse:
+        error_check_ret = await self._check_model(request)
+        if error_check_ret is not None:
+            return error_check_ret
+        (input_ids, input_text) = self._validate_prompt_and_tokenize(
+            request, prompt_ids=request.tokens)
+        return DetokenizeResponse(prompt=input_text)
--- a/vllm/entrypoints/openai/serving_embedding.py
+++ b/vllm/entrypoints/openai/serving_embedding.py
+import base64
 import time
 from typing import AsyncIterator, List, Optional, Tuple
+import numpy as np
 from fastapi import Request
 from vllm.config import ModelConfig
@@ -20,19 +22,18 @@ TypeTokenIDs = List[int]
 def request_output_to_embedding_response(
-    final_res_batch: List[EmbeddingRequestOutput],
+        final_res_batch: List[EmbeddingRequestOutput], request_id: str,
-    request_id: str,
+        created_time: int, model_name: str,
-    created_time: int,
+        encoding_format: str) -> EmbeddingResponse:
-    model_name: str,
+    data: List[EmbeddingResponseData] = []
-) -> EmbeddingResponse:
-    data = []
    num_prompt_tokens = 0
    for idx, final_res in enumerate(final_res_batch):
        assert final_res is not None
        prompt_token_ids = final_res.prompt_token_ids
+        embedding = final_res.outputs.embedding
-        embedding_data = EmbeddingResponseData(
+        if encoding_format == "base64":
-            index=idx, embedding=final_res.outputs.embedding)
+            embedding = base64.b64encode(np.array(embedding))
+        embedding_data = EmbeddingResponseData(index=idx, embedding=embedding)
        data.append(embedding_data)
        num_prompt_tokens += len(prompt_token_ids)
@@ -72,10 +73,8 @@ class OpenAIServingEmbedding(OpenAIServing):
        if error_check_ret is not None:
            return error_check_ret
-        # Return error for unsupported features.
+        encoding_format = (request.encoding_format
-        if request.encoding_format == "base64":
+                           if request.encoding_format else "float")
-            return self.create_error_response(
-                "base64 encoding is not currently supported")
        if request.dimensions is not None:
            return self.create_error_response(
                "dimensions is currently not supported")
@@ -129,7 +128,8 @@ class OpenAIServingEmbedding(OpenAIServing):
                    return self.create_error_response("Client disconnected")
                final_res_batch[i] = res
            response = request_output_to_embedding_response(
-                final_res_batch, request_id, created_time, model_name)
+                final_res_batch, request_id, created_time, model_name,
+                encoding_format)
        except ValueError as e:
            # TODO: Use a vllm-specific Validation Error
            return self.create_error_response(str(e))

--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -10,17 +10,25 @@ from vllm.config import ModelConfig
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              CompletionRequest,
+                                              DetokenizeRequest,
                                              EmbeddingRequest, ErrorResponse,
                                              ModelCard, ModelList,
-                                              ModelPermission)
+                                              ModelPermission, TokenizeRequest)
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
+from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import Logprob
 from vllm.transformers_utils.tokenizer import get_tokenizer
 logger = init_logger(__name__)
+@dataclass
+class PromptAdapterPath:
+    name: str
+    local_path: str
 @dataclass
 class LoRAModulePath:
    name: str
@@ -29,12 +37,18 @@ class LoRAModulePath:
 class OpenAIServing:
-    def __init__(self, engine: AsyncLLMEngine, model_config: ModelConfig,
+    def __init__(
-                 served_model_names: List[str],
+        self,
-                 lora_modules: Optional[List[LoRAModulePath]]):
+        engine: AsyncLLMEngine,
+        model_config: ModelConfig,
+        served_model_names: List[str],
+        lora_modules: Optional[List[LoRAModulePath]],
+        prompt_adapters: Optional[List[PromptAdapterPath]] = None,
+    ):
        super().__init__()
        self.engine = engine
+        self.model_config = model_config
        self.max_model_len = model_config.max_model_len
        # A separate tokenizer to map token IDs to strings.
@@ -47,9 +61,8 @@ class OpenAIServing:
        self.served_model_names = served_model_names
-        if lora_modules is None:
+        self.lora_requests = []
-            self.lora_requests = []
+        if lora_modules is not None:
-        else:
            self.lora_requests = [
                LoRARequest(
                    lora_name=lora.name,
@@ -58,6 +71,20 @@ class OpenAIServing:
                ) for i, lora in enumerate(lora_modules, start=1)
            ]
+        self.prompt_adapter_requests = []
+        if prompt_adapters is not None:
+            for i, prompt_adapter in enumerate(prompt_adapters, start=1):
+                with open(f"./{prompt_adapter.local_path}"
+                          f"/adapter_config.json") as f:
+                    adapter_config = json.load(f)
+                    num_virtual_tokens = adapter_config["num_virtual_tokens"]
+                self.prompt_adapter_requests.append(
+                    PromptAdapterRequest(
+                        prompt_adapter_name=prompt_adapter.name,
+                        prompt_adapter_id=i,
+                        prompt_adapter_local_path=prompt_adapter.local_path,
+                        prompt_adapter_num_virtual_tokens=num_virtual_tokens))
    async def show_available_models(self) -> ModelList:
        """Show available models. Right now we only have one model."""
        model_cards = [
@@ -73,7 +100,14 @@ class OpenAIServing:
                      permission=[ModelPermission()])
            for lora in self.lora_requests
        ]
+        prompt_adapter_cards = [
+            ModelCard(id=prompt_adapter.prompt_adapter_name,
+                      root=self.served_model_names[0],
+                      permission=[ModelPermission()])
+            for prompt_adapter in self.prompt_adapter_requests
+        ]
        model_cards.extend(lora_cards)
+        model_cards.extend(prompt_adapter_cards)
        return ModelList(data=model_cards)
    def create_error_response(
@@ -99,34 +133,45 @@ class OpenAIServing:
        return json_str
    async def _check_model(
-        self, request: Union[CompletionRequest, ChatCompletionRequest,
+        self, request: Union[ChatCompletionRequest, CompletionRequest,
-                             EmbeddingRequest]
+                             DetokenizeRequest, EmbeddingRequest,
+                             TokenizeRequest]
    ) -> Optional[ErrorResponse]:
        if request.model in self.served_model_names:
            return None
        if request.model in [lora.lora_name for lora in self.lora_requests]:
            return None
+        if request.model in [
+                prompt_adapter.prompt_adapter_name
+                for prompt_adapter in self.prompt_adapter_requests
+        ]:
+            return None
        return self.create_error_response(
            message=f"The model `{request.model}` does not exist.",
            err_type="NotFoundError",
            status_code=HTTPStatus.NOT_FOUND)
-    def _maybe_get_lora(
+    def _maybe_get_adapter(
        self, request: Union[CompletionRequest, ChatCompletionRequest,
                             EmbeddingRequest]
-    ) -> Optional[LoRARequest]:
+    ) -> Tuple[Optional[str], Optional[Union[LoRARequest,
+                                             PromptAdapterRequest]]]:
        if request.model in self.served_model_names:
-            return None
+            return None, None
        for lora in self.lora_requests:
            if request.model == lora.lora_name:
-                return lora
+                return 'LoRA', lora
+        for prompt_adapter in self.prompt_adapter_requests:
+            if request.model == prompt_adapter.prompt_adapter_name:
+                return 'PromptAdapter', prompt_adapter
        # if _check_model has been called earlier, this will be unreachable
        raise ValueError(f"The model `{request.model}` does not exist.")
    def _validate_prompt_and_tokenize(
            self,
            request: Union[ChatCompletionRequest, CompletionRequest,
-                           EmbeddingRequest],
+                           DetokenizeRequest, EmbeddingRequest,
+                           TokenizeRequest],
            prompt: Optional[str] = None,
            prompt_ids: Optional[List[int]] = None,
            truncate_prompt_tokens: Optional[Annotated[int,
@@ -174,6 +219,11 @@ class OpenAIServing:
                    f"generation. Please reduce the length of the input.", )
            return input_ids, input_text
+        # Note: TokenizeRequest and DetokenizeRequest doesn't have max_tokens
+        # and does not require model context length validation
+        if isinstance(request, (TokenizeRequest, DetokenizeRequest)):
+            return input_ids, input_text
        if request.max_tokens is None:
            if token_num >= self.max_model_len:
                raise ValueError(

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -5,6 +5,7 @@ if TYPE_CHECKING:
    VLLM_HOST_IP: str = ""
    VLLM_PORT: Optional[int] = None
    VLLM_USE_MODELSCOPE: bool = False
+    VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60
    VLLM_INSTANCE_ID: Optional[str] = None
    VLLM_NCCL_SO_PATH: Optional[str] = None
    LD_LIBRARY_PATH: Optional[str] = None
@@ -27,14 +28,20 @@ if TYPE_CHECKING:
    VLLM_TRACE_FUNCTION: int = 0
    VLLM_ATTENTION_BACKEND: Optional[str] = None
    VLLM_CPU_KVCACHE_SPACE: int = 0
+    VLLM_OPENVINO_KVCACHE_SPACE: int = 0
+    VLLM_OPENVINO_CPU_KV_CACHE_PRECISION: Optional[str] = None
+    VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS: bool = False
+    VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/"
+    VLLM_FUSED_MOE_CHUNK_SIZE: int = 64 * 1024
    VLLM_USE_RAY_COMPILED_DAG: bool = False
-    VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
+    VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
    VLLM_IMAGE_FETCH_TIMEOUT: int = 5
    VLLM_TARGET_DEVICE: str = "cuda"
    MAX_JOBS: Optional[str] = None
    NVCC_THREADS: Optional[str] = None
    VLLM_USE_PRECOMPILED: bool = False
    VLLM_INSTALL_PUNICA_KERNELS: bool = False
+    VLLM_NO_DEPRECATION_WARNING: bool = False
    CMAKE_BUILD_TYPE: Optional[str] = None
    VERBOSE: bool = False
@@ -47,7 +54,8 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    # ================== Installation Time Env Vars ==================
-    # Target device of vLLM, supporting [cuda (by default), rocm, neuron, cpu]
+    # Target device of vLLM, supporting [cuda (by default),
+    # rocm, neuron, cpu, openvino]
    "VLLM_TARGET_DEVICE":
    lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda"),
@@ -113,6 +121,10 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    "VLLM_INSTANCE_ID":
    lambda: os.environ.get("VLLM_INSTANCE_ID", None),
+    # Interval in seconds to log a warning message when the ring buffer is full
+    "VLLM_RINGBUFFER_WARNING_INTERVAL":
+    lambda: int(os.environ.get("VLLM_RINGBUFFER_WARNING_INTERVAL", "60")),
    # path to cudatoolkit home directory, under which should be bin, include,
    # and lib directories.
    "CUDA_HOME":
@@ -194,6 +206,7 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    # - "FLASH_ATTN": use FlashAttention
    # - "XFORMERS": use XFormers
    # - "ROCM_FLASH": use ROCmFlashAttention
+    # - "FLASHINFER": use flashinfer
    "VLLM_ATTENTION_BACKEND":
    lambda: os.getenv("VLLM_ATTENTION_BACKEND", None),
@@ -202,6 +215,22 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    "VLLM_CPU_KVCACHE_SPACE":
    lambda: int(os.getenv("VLLM_CPU_KVCACHE_SPACE", "0")),
+    # OpenVINO key-value cache space
+    # default is 4GB
+    "VLLM_OPENVINO_KVCACHE_SPACE":
+    lambda: int(os.getenv("VLLM_OPENVINO_KVCACHE_SPACE", "0")),
+    # OpenVINO KV cache precision
+    # default is bf16 if natively supported by platform, otherwise f16
+    # To enable KV cache compression, please, explicitly specify u8
+    "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION":
+    lambda: os.getenv("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION", None),
+    # Enables weights compression during model export via HF Optimum
+    # default is False
+    "VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS":
+    lambda: bool(os.getenv("VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS", False)),
    # If the env var is set, it uses the Ray's compiled DAG API
    # which optimizes the control plane overhead.
    # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
@@ -211,12 +240,23 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    # Use dedicated multiprocess context for workers.
    # Both spawn and fork work
    "VLLM_WORKER_MULTIPROC_METHOD":
-    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),
+    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),
    # Timeout for fetching images when serving multimodal models
    # Default is 5 seconds
    "VLLM_IMAGE_FETCH_TIMEOUT":
    lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),
+    # Path to the XLA persistent cache directory.
+    # Only used for XLA devices such as TPUs.
+    "VLLM_XLA_CACHE_PATH":
+    lambda: os.getenv("VLLM_XLA_CACHE_PATH", "~/.vllm/xla_cache/"),
+    "VLLM_FUSED_MOE_CHUNK_SIZE":
+    lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "65536")),
+    # If set, vllm will skip the deprecation warnings.
+    "VLLM_NO_DEPRECATION_WARNING":
+    lambda: bool(int(os.getenv("VLLM_NO_DEPRECATION_WARNING", "0"))),
 }
 # end-env-vars-definition

--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -7,6 +7,7 @@ from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
+from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                        make_async)
@@ -46,8 +47,9 @@ class CPUExecutor(ExecutorBase):
            rank=0,
            distributed_init_method=distributed_init_method,
            lora_config=self.lora_config,
-            vision_language_config=self.vision_language_config,
+            multimodal_config=self.multimodal_config,
            kv_cache_dtype=self.cache_config.cache_dtype,
+            prompt_adapter_config=self.prompt_adapter_config,
            is_driver_worker=True,
        )
        self.driver_worker.init_device()
@@ -84,9 +86,25 @@ class CPUExecutor(ExecutorBase):
    def remove_lora(self, lora_id: int) -> bool:
        return self.driver_worker.remove_lora(lora_id)
+    def pin_lora(self, lora_id: int) -> bool:
+        return self.driver_worker.pin_lora(lora_id)
    def list_loras(self) -> Set[int]:
        return self.driver_worker.list_loras()
+    def add_prompt_adapter(
+            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
+        return self.driver_worker.add_prompt_adapter(prompt_adapter_request)
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        return self.driver_worker.remove_prompt_adapter(prompt_adapter_id)
+    def list_prompt_adapters(self) -> Set[int]:
+        return self.driver_worker.list_prompt_adapters()
+    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        return self.driver_worker.pin_prompt_adapter(prompt_adapter_id)
    def check_health(self) -> None:
        # CPUExecutor will always be healthy as long as
        # it's running.

--- a/vllm/executor/distributed_gpu_executor.py
+++ b/vllm/executor/distributed_gpu_executor.py
@@ -64,12 +64,12 @@ class DistributedGPUExecutor(GPUExecutor):
                          num_cpu_blocks=num_cpu_blocks)
    def execute_model(
-            self,
+        self, execute_model_req: ExecuteModelRequest
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+    ) -> Optional[List[SamplerOutput]]:
        if self.parallel_worker_tasks is None:
            self.parallel_worker_tasks = self._run_workers(
                "start_worker_execution_loop",
-                async_run_remote_workers_only=True,
+                async_run_tensor_parallel_workers_only=True,
                **self.extra_execute_model_run_workers_kwargs)
        # Only the driver worker returns the sampling results.
@@ -79,7 +79,7 @@ class DistributedGPUExecutor(GPUExecutor):
        if self.parallel_worker_tasks is None:
            return
-        self._driver_execute_model()
+        self._driver_execute_model(execute_model_req=None)
        parallel_worker_tasks = self.parallel_worker_tasks
        self.parallel_worker_tasks = None
        # Ensure that workers exit model loop cleanly
@@ -100,6 +100,13 @@ class DistributedGPUExecutor(GPUExecutor):
            lora_id=lora_id,
        )
+    def pin_lora(self, lora_id: int) -> bool:
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return self._run_workers(
+            "pin_lora",
+            lora_id=lora_id,
+        )
    def list_loras(self) -> Set[int]:
        return self._run_workers("list_loras")
@@ -116,13 +123,13 @@ class DistributedGPUExecutor(GPUExecutor):
    @abstractmethod
    def _driver_execute_model(
-        self,
+        self, execute_model_req: Optional[ExecuteModelRequest]
-        execute_model_req: Optional[ExecuteModelRequest] = None
+    ) -> Optional[List[SamplerOutput]]:
-    ) -> List[SamplerOutput]:
        """Run execute_model in the driver worker.
-        Passing None will cause the driver to stop the model execution
+        Passing None will cause the driver to stop the model execution loop
-        loop running in each of the remote workers.
+        running in each of the remote workers. In this case, this method
+        returns None. Otherwise, this method returns the model output.
        """
        raise NotImplementedError
@@ -131,17 +138,17 @@ class DistributedGPUExecutor(GPUExecutor):
        self,
        method: str,
        *args,
-        async_run_remote_workers_only: bool = False,
+        async_run_tensor_parallel_workers_only: bool = False,
        max_concurrent_workers: Optional[int] = None,
        **kwargs,
    ) -> Any:
        """Runs the given method on all workers.
        Args:
-            async_run_remote_workers_only: If True the method will be run only
+            async_run_tensor_parallel_workers_only: If True the method will be
-                in the remote workers, not the driver worker. It will also be
+                run only in the remote TP workers, not the driver worker.
-                run asynchronously and return a list of futures rather than
+                It will also be run asynchronously and return a list of futures
-                blocking on the results.
+                rather than blocking on the results.
        """
        raise NotImplementedError

--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
+import asyncio
 from abc import ABC, abstractmethod
 from typing import List, Optional, Set, Tuple
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
-                         ModelConfig, ParallelConfig, SchedulerConfig,
+                         ModelConfig, MultiModalConfig, ParallelConfig,
-                         SpeculativeConfig, VisionLanguageConfig)
+                         PromptAdapterConfig, SchedulerConfig,
+                         SpeculativeConfig)
 from vllm.lora.request import LoRARequest
+from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
@@ -25,8 +28,9 @@ class ExecutorBase(ABC):
        device_config: DeviceConfig,
        load_config: LoadConfig,
        lora_config: Optional[LoRAConfig],
-        vision_language_config: Optional[VisionLanguageConfig],
+        multimodal_config: Optional[MultiModalConfig],
        speculative_config: Optional[SpeculativeConfig],
+        prompt_adapter_config: Optional[PromptAdapterConfig],
    ) -> None:
        self.model_config = model_config
        self.cache_config = cache_config
@@ -35,8 +39,9 @@ class ExecutorBase(ABC):
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.device_config = device_config
-        self.vision_language_config = vision_language_config
+        self.multimodal_config = multimodal_config
        self.speculative_config = speculative_config
+        self.prompt_adapter_config = prompt_adapter_config
        self._init_executor()
@@ -69,8 +74,8 @@ class ExecutorBase(ABC):
    @abstractmethod
    def execute_model(
-            self,
+        self, execute_model_req: ExecuteModelRequest
-            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+    ) -> Optional[List[SamplerOutput]]:
        """Executes at least one model step on the given sequences."""
        raise NotImplementedError
@@ -86,10 +91,31 @@ class ExecutorBase(ABC):
    def remove_lora(self, lora_id: int) -> bool:
        raise NotImplementedError
+    @abstractmethod
+    def pin_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError  # type: ignore
    @abstractmethod
    def list_loras(self) -> Set[int]:
        raise NotImplementedError
+    @abstractmethod
+    def add_prompt_adapter(
+            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
+        raise NotImplementedError
+    @abstractmethod
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError
+    @abstractmethod
+    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError  # type: ignore
+    @abstractmethod
+    def list_prompt_adapters(self) -> Set[int]:
+        raise NotImplementedError
    @abstractmethod
    def check_health(self) -> None:
        """Checks if the executor is healthy. If not, it should raise an
@@ -106,6 +132,26 @@ class ExecutorBase(ABC):
 class ExecutorAsyncBase(ExecutorBase):
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        cache_config: CacheConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        load_config: LoadConfig,
+        lora_config: Optional[LoRAConfig],
+        multimodal_config: Optional[MultiModalConfig],
+        speculative_config: Optional[SpeculativeConfig],
+        prompt_adapter_config: Optional[PromptAdapterConfig],
+    ) -> None:
+        self.pp_locks: Optional[List[asyncio.Lock]] = None
+        super().__init__(model_config, cache_config, parallel_config,
+                         scheduler_config, device_config, load_config,
+                         lora_config, multimodal_config, speculative_config,
+                         prompt_adapter_config)
    @abstractmethod
    async def execute_model_async(
            self,

--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -3,6 +3,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
+from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import ExecuteModelRequest, PoolerOutput, SamplerOutput
 from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
                        make_async)
@@ -43,9 +44,11 @@ class GPUExecutor(ExecutorBase):
            rank=rank,
            distributed_init_method=distributed_init_method,
            lora_config=self.lora_config,
-            vision_language_config=self.vision_language_config,
+            multimodal_config=self.multimodal_config,
            speculative_config=self.speculative_config,
-            is_driver_worker=rank == 0,
+            prompt_adapter_config=self.prompt_adapter_config,
+            is_driver_worker=(not self.parallel_config)
+            or (rank % self.parallel_config.tensor_parallel_size == 0),
        )
    def _create_worker(self,
@@ -87,7 +90,7 @@ class GPUExecutor(ExecutorBase):
    def execute_model(
        self, execute_model_req: ExecuteModelRequest
-    ) -> List[Union[SamplerOutput, PoolerOutput]]:
+    ) -> Optional[List[Union[SamplerOutput, PoolerOutput]]]:
        output = self.driver_worker.execute_model(execute_model_req)
        return output
@@ -99,9 +102,32 @@ class GPUExecutor(ExecutorBase):
        assert lora_id > 0, "lora_id must be greater than 0."
        return self.driver_worker.remove_lora(lora_id)
+    def pin_lora(self, lora_id: int) -> bool:
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return self.driver_worker.pin_lora(lora_id)
    def list_loras(self) -> Set[int]:
        return self.driver_worker.list_loras()
+    def add_prompt_adapter(
+            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
+        assert prompt_adapter_request.prompt_adapter_id > 0, \
+            "prompt_adapter_id must be greater than 0."
+        return self.driver_worker.add_prompt_adapter(prompt_adapter_request)
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        assert prompt_adapter_id > 0, \
+            "prompt_adapter_id must be greater than 0."
+        return self.driver_worker.remove_prompt_adapter(prompt_adapter_id)
+    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        assert prompt_adapter_id > 0, \
+                "prompt_adapter_id must be greater than 0."
+        return self.driver_worker.pin_prompt_adapter(prompt_adapter_id)
+    def list_prompt_adapters(self) -> Set[int]:
+        return self.driver_worker.list_prompt_adapters()
    def check_health(self) -> None:
        # GPUExecutor will always be healthy as long as
        # it's running.

--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/multiproc_gpu_executor.py
@@ -9,8 +9,12 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
                                                  ResultHandler, WorkerMonitor)
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+from vllm.triton_utils import maybe_set_triton_cache_manager
-                        get_vllm_instance_id, make_async)
+from vllm.utils import (cuda_device_count_stateless,
+                        error_on_invalid_device_count_status,
+                        get_distributed_init_method, get_open_port,
+                        get_vllm_instance_id, make_async,
+                        update_environment_variables)
 logger = init_logger(__name__)
@@ -24,8 +28,9 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
        # Set CUDA_VISIBLE_DEVICES for the driver, inherited by workers
        if "CUDA_VISIBLE_DEVICES" not in os.environ:
-            os.environ["CUDA_VISIBLE_DEVICES"] = (",".join(
+            update_environment_variables({
-                map(str, range(world_size))))
+                "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
+            })
        # Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers
        os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id()
@@ -33,12 +38,25 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
        # Disable torch async compiling which won't work with daemonic processes
        os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
-        from torch.cuda import device_count
+        # Set OMP_NUM_THREADS to 1 if it is not set explicitly, avoids CPU
-        assert world_size <= device_count(), (
+        # contention amongst the shards
+        if "OMP_NUM_THREADS" not in os.environ:
+            os.environ["OMP_NUM_THREADS"] = "1"
+        # workaround for https://github.com/vllm-project/vllm/issues/6103
+        if world_size > 1:
+            maybe_set_triton_cache_manager()
+        assert world_size <= cuda_device_count_stateless(), (
            "please set tensor_parallel_size to less than max local gpu count")
+        error_on_invalid_device_count_status()
+        # Multiprocessing-based executor does not support multi-node setting.
+        # Since it only works for single node, we can use the loopback address
+        # 127.0.0.1 for communication.
        distributed_init_method = get_distributed_init_method(
-            get_ip(), get_open_port())
+            "127.0.0.1", get_open_port())
        if world_size == 1:
            self.workers = []
@@ -73,32 +91,30 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
            worker_monitor.close()
    def _driver_execute_model(
-        self,
+        self, execute_model_req: Optional[ExecuteModelRequest]
-        execute_model_req: Optional[ExecuteModelRequest] = None
+    ) -> Optional[List[SamplerOutput]]:
-    ) -> List[SamplerOutput]:
        """Run execute_model in the driver worker.
        Passing None will cause the driver to stop the model execution
        loop running in each of the remote workers.
        """
-        return self.driver_worker.execute_model(
+        return self.driver_worker.execute_model(execute_model_req)
-            execute_model_req=execute_model_req)
    def _run_workers(
        self,
        method: str,
        *args,
-        async_run_remote_workers_only: bool = False,
+        async_run_tensor_parallel_workers_only: bool = False,
        max_concurrent_workers: Optional[int] = None,
        **kwargs,
    ) -> Any:
        """Runs the given method on all workers.
        Args:
-            async_run_remote_workers_only: If True the method will be run only
+            async_run_tensor_parallel_workers_only: If True the method will be
-                in the remote workers, not the driver worker. It will also be
+                run only in the remote TP workers, not the driver worker.
-                run asynchronously and return a list of futures rather than
+                It will also be run asynchronously and return a list of futures
-                blocking on the results.
+                rather than blocking on the results.
        """
        if max_concurrent_workers:
@@ -111,7 +127,7 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
            for worker in self.workers
        ]
-        if async_run_remote_workers_only:
+        if async_run_tensor_parallel_workers_only:
            # Just return futures
            return worker_outputs

--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -48,15 +48,14 @@ class NeuronExecutor(ExecutorBase):
    def execute_model(
            self,
            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
-        assert (execute_model_req.blocks_to_swap_in == {}
+        assert (not execute_model_req.blocks_to_swap_in
-                and execute_model_req.blocks_to_swap_out == {}
+                and not execute_model_req.blocks_to_swap_out
-                and execute_model_req.blocks_to_copy == {}), (
+                and not execute_model_req.blocks_to_copy), (
                    "Cache operations are not supported for Neuron backend.")
        assert execute_model_req.num_lookahead_slots == 0, (
            "lookahead not supported for Neuron backend.")
-        output = self.driver_worker.execute_model(
+        output = self.driver_worker.execute_model(execute_model_req)
-            execute_model_req.seq_group_metadata_list)
        return output
    def add_lora(self, lora_request: LoRARequest) -> bool:
@@ -65,9 +64,28 @@ class NeuronExecutor(ExecutorBase):
    def remove_lora(self, lora_id: int) -> bool:
        return self.driver_worker.remove_lora(lora_id)
+    def pin_lora(self, lora_id: int) -> bool:
+        return self.driver_worker.pin_lora(lora_id)
    def list_loras(self) -> Set[int]:
        return self.driver_worker.list_loras()
+    def add_prompt_adapter(self, prompt_adapter_request) -> bool:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the Neuron backend.")
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the Neuron backend.")
+    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the Neuron backend.")
+    def list_prompt_adapters(self) -> Set[int]:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the Neuron backend.")
    def check_health(self) -> None:
        # NeuronExecutor will always be healthy as long as
        # it's running.

--- a/vllm/executor/openvino_executor.py
+++ b/vllm/executor/openvino_executor.py
+from typing import List, Set, Tuple
+import openvino as ov
+import openvino.properties.hint as hints
+import torch
+import vllm.envs as envs
+from vllm.config import CacheConfig, ModelConfig
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        make_async)
+logger = init_logger(__name__)
+class OpenVINOExecutor(ExecutorBase):
+    def _init_executor(self) -> None:
+        assert self.device_config.device_type == "openvino"
+        assert self.lora_config is None, "OpenVINO backend doesn't support LoRA"
+        self.model_config = _verify_and_get_model_config(self.model_config)
+        self.cache_config = _verify_and_get_cache_config(self.cache_config)
+        # Instantiate the worker and load the model to CPU.
+        self._init_worker()
+    def _init_worker(self):
+        from vllm.worker.openvino_worker import OpenVINOWorker
+        assert (
+            self.parallel_config.world_size == 1
+        ), "OpenVINOExecutor only supports single CPU socket currently."
+        distributed_init_method = get_distributed_init_method(
+            get_ip(), get_open_port())
+        self.driver_worker = OpenVINOWorker(
+            model_config=self.model_config,
+            parallel_config=self.parallel_config,
+            scheduler_config=self.scheduler_config,
+            device_config=self.device_config,
+            cache_config=self.cache_config,
+            load_config=self.load_config,
+            local_rank=0,
+            rank=0,
+            distributed_init_method=distributed_init_method,
+            lora_config=self.lora_config,
+            multimodal_config=self.multimodal_config,
+            kv_cache_dtype=self.cache_config.cache_dtype,
+            is_driver_worker=True,
+        )
+        self.driver_worker.init_device()
+        self.driver_worker.load_model()
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Determine the number of available KV blocks by invoking the
+        underlying worker.
+        """
+        return self.driver_worker.determine_num_available_blocks()
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        """Initialize the KV cache by invoking the underlying worker."""
+        # NOTE: We log here to avoid multiple logs when number of workers is
+        # greater than one. We could log in the engine, but not all executors
+        # have GPUs.
+        # NOTE: `cpu block` for OpenVINO backend is located on CPU memory but is
+        # referred as `gpu block`. Because we want to reuse the existing block
+        # management procedure.
+        logger.info("# CPU blocks: %d", num_gpu_blocks)
+        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
+    def execute_model(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        output = self.driver_worker.execute_model(execute_model_req)
+        return output
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        return self.driver_worker.add_lora(lora_request)
+    def remove_lora(self, lora_id: int) -> bool:
+        return self.driver_worker.remove_lora(lora_id)
+    def pin_lora(self, lora_id: int) -> bool:
+        return self.driver_worker.pin_lora(lora_id)
+    def list_loras(self) -> Set[int]:
+        return self.driver_worker.list_loras()
+    def add_prompt_adapter(self, prompt_adapter_request) -> bool:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the OPENVINO backend.")
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the OPENVINO backend.")
+    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the OPENVINO backend.")
+    def list_prompt_adapters(self) -> Set[int]:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the OPENVINO backend.")
+    def check_health(self) -> None:
+        # OpenVINOExecutor will always be healthy as long as
+        # it's running.
+        return
+class OpenVINOExecutorAsync(OpenVINOExecutor, ExecutorAsyncBase):
+    async def execute_model_async(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        output = await make_async(self.driver_worker.execute_model
+                                  )(execute_model_req=execute_model_req, )
+        return output
+    async def check_health_async(self) -> None:
+        # OpenVINOExecutor will always be healthy as long as
+        # it's running.
+        return
+def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
+    if config.dtype != torch.float32:
+        logger.warning(
+            f"Only float32 dtype is supported on OpenVINO, casting from {config.dtype}."  # noqa: G004, E501
+        )
+        config.dtype = torch.float32
+    if not config.enforce_eager:
+        logger.warning(
+            "CUDA graph is not supported on OpenVINO backend, fallback to the "
+            "eager mode.")
+        config.enforce_eager = True
+    return config
+def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig:
+    if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8":
+        logger.info("KV cache type is overried to u8 via "
+                    "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.")
+        config.cache_dtype = ov.Type.u8
+    else:
+        core = ov.Core()
+        inference_precision = core.get_property("CPU",
+                                                hints.inference_precision)
+        if inference_precision == ov.Type.bf16:
+            config.cache_dtype = ov.Type.bf16
+        else:
+            config.cache_dtype = ov.Type.f16
+    if config.block_size != 32:
+        logger.info(
+            f"OpenVINO optimal block size is 32, overriding currently set {config.block_size}"  # noqa: G004, E501
+        )
+        config.block_size = 32
+    kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE
+    if kv_cache_space >= 0:
+        _GB = 1 << 30
+        if kv_cache_space == 0:
+            config.openvino_kvcache_space_bytes = 4 * _GB  # type: ignore
+            logger.warning(
+                "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) "
+                "for OpenVINO backend is not set, using 4 by default.")
+        else:
+            config.openvino_kvcache_space_bytes = kv_cache_space * _GB  # type: ignore
+    else:
+        raise RuntimeError(
+            "Invalid environment variable VLLM_OPENVINO_KVCACHE_SPACE"
+            f" {kv_cache_space}, expect a positive integer value.")
+    return config
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -11,7 +11,8 @@ from vllm.executor.distributed_gpu_executor import (  # yapf: disable
 from vllm.executor.ray_utils import RayWorkerWrapper, ray
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+from vllm.utils import (error_on_invalid_device_count_status,
+                        get_distributed_init_method, get_ip, get_open_port,
                        get_vllm_instance_id, make_async)
 if ray is not None:
@@ -62,7 +63,8 @@ class RayGPUExecutor(DistributedGPUExecutor):
    def _init_workers_ray(self, placement_group: "PlacementGroup",
                          **ray_remote_kwargs):
-        if self.parallel_config.tensor_parallel_size == 1:
+        if (self.parallel_config.tensor_parallel_size == 1
+                and self.parallel_config.pipeline_parallel_size == 1):
            # For single GPU case, we use a ray worker with constrained memory.
            num_gpus = self.cache_config.gpu_memory_utilization
        else:
@@ -132,11 +134,38 @@ class RayGPUExecutor(DistributedGPUExecutor):
        worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
                                                    use_dummy_driver=True)
-        node_workers = defaultdict(list)
+        # the order in `worker_node_and_gpu_ids` does not necessarily match
-        node_gpus = defaultdict(list)
+        # the machine boundaries. We need to make sure that workers in the
+        # same node are assigned consecutive ranks.
-        for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
+        # examples:
-            node_workers[node_id].append(i)
+        # [('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [0]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [0]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [1]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [2]), ('dfaad7adfdae57a694cc74490db45bd112c9f31243523e43ddc2e7f0', [3]), ('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [1]), ('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [2]), ('852a09a13c7503ef126d7c828454c741494b1be33a8627a5206604d9', [3])] # noqa
+        # initialize worker ranks with -1 (unassigned)
+        worker_ranks = [-1 for x in worker_node_and_gpu_ids]
+        current_rank = 0
+        while -1 in worker_ranks:
+            # whenever we find an unassigned worker, find the node
+            index = worker_ranks.index(-1)
+            current_node_id = worker_node_and_gpu_ids[index][0]
+            # assign ranks to all workers in the same node
+            for i, (node_id, _) in enumerate(worker_node_and_gpu_ids):
+                if node_id == current_node_id:
+                    worker_ranks[i] = current_rank
+                    current_rank += 1
+        # with the above example, worker_ranks will be [0, 4, 5, 6, 7, 1, 2, 3]
+        node_workers = defaultdict(list)  # node id -> list of worker ranks
+        node_gpus = defaultdict(list)  # node id -> list of gpu ids
+        for worker_rank, (node_id, gpu_ids) in zip(worker_ranks,
+                                                   worker_node_and_gpu_ids):
+            node_workers[node_id].append(worker_rank)
+            # `gpu_ids` can be a list of strings or integers.
+            # convert them to integers for consistency.
+            # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs),
+            # string sorting is not sufficient.
+            # see https://github.com/vllm-project/vllm/issues/5590
+            gpu_ids = [int(x) for x in gpu_ids]
            node_gpus[node_id].extend(gpu_ids)
        for node_id, gpu_ids in node_gpus.items():
            node_gpus[node_id] = sorted(gpu_ids)
@@ -155,16 +184,29 @@ class RayGPUExecutor(DistributedGPUExecutor):
        self._run_workers("update_environment_variables",
                          all_args=all_args_to_update_environment_variables)
+        if len(node_gpus) == 1:
+            # in single node case, we don't need to get the IP address.
+            # the loopback address is sufficient
+            # NOTE: a node may have several IP addresses, one for each
+            # network interface. `get_ip()` might return any of them,
+            # while they might not work for communication inside the node
+            # if the network setup is complicated. Using the loopback address
+            # solves this issue, as it always works for communication inside
+            # the node.
+            driver_ip = "127.0.0.1"
        distributed_init_method = get_distributed_init_method(
            driver_ip, get_open_port())
+        error_on_invalid_device_count_status()
        # Initialize the actual workers inside worker wrapper.
        init_worker_all_kwargs = [
            self._get_worker_kwargs(
                local_rank=node_workers[node_id].index(rank),
                rank=rank,
                distributed_init_method=distributed_init_method,
-            ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids)
+            ) for rank, (node_id,
+                         _) in zip(worker_ranks, worker_node_and_gpu_ids)
        ]
        self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
@@ -173,10 +215,26 @@ class RayGPUExecutor(DistributedGPUExecutor):
                          max_concurrent_workers=self.parallel_config.
                          max_parallel_loading_workers)
+        # This is the list of workers that are rank 0 of each TP group EXCEPT
+        # global rank 0. These are the workers that will broadcast to the
+        # rest of the workers.
+        self.tp_driver_workers: List[RayWorkerWrapper] = []
+        # This is the list of workers that are not drivers and not the first
+        # worker in a TP group. These are the workers that will be
+        # broadcasted to.
+        self.non_driver_workers: List[RayWorkerWrapper] = []
+        for idx, rank in enumerate(worker_ranks[1:]):
+            # We need to skip the driver worker, which we
+            # do by skipping worker_ranks[0] which is always 0.
+            if rank % self.parallel_config.tensor_parallel_size == 0:
+                self.tp_driver_workers.append(self.workers[idx])
+            else:
+                self.non_driver_workers.append(self.workers[idx])
    def _driver_execute_model(
-        self,
+        self, execute_model_req: Optional[ExecuteModelRequest]
-        execute_model_req: Optional[ExecuteModelRequest] = None
+    ) -> Optional[List[SamplerOutput]]:
-    ) -> List[SamplerOutput]:
        """Run execute_model in the driver worker.
        Passing None will cause the driver to stop the model execution
@@ -189,7 +247,7 @@ class RayGPUExecutor(DistributedGPUExecutor):
        self,
        method: str,
        *args,
-        async_run_remote_workers_only: bool = False,
+        async_run_tensor_parallel_workers_only: bool = False,
        all_args: Optional[List[Tuple[Any, ...]]] = None,
        all_kwargs: Optional[List[Dict[str, Any]]] = None,
        use_dummy_driver: bool = False,
@@ -200,10 +258,11 @@ class RayGPUExecutor(DistributedGPUExecutor):
        """Runs the given method on all workers. Can be used in the following
        ways:
-        - async_run_remote_workers_only: If True the method will be run only
+        Args:
-          in the remote workers, not the driver worker. It will also be
+        - async_run_tensor_parallel_workers_only: If True the method will be
-          run asynchronously and return a list of futures rather than blocking
+          run only in the remote TP workers, not the driver worker.
-          on the results.
+          It will also be run asynchronously and return a list of futures
+          rather than blocking on the results.
        - args/kwargs: All workers share the same args/kwargs
        - all_args/all_kwargs: args/kwargs for each worker are specified
          individually
@@ -213,7 +272,9 @@ class RayGPUExecutor(DistributedGPUExecutor):
            raise NotImplementedError(
                "max_concurrent_workers is not supported yet.")
-        count = len(self.workers)
+        count = len(self.workers) if not \
+            async_run_tensor_parallel_workers_only \
+            else len(self.non_driver_workers)
        all_worker_args = repeat(args, count) if all_args is None \
            else islice(all_args, 1, None)
        all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
@@ -227,14 +288,17 @@ class RayGPUExecutor(DistributedGPUExecutor):
            ray_worker_outputs = []
        else:
            # Start the ray workers first.
+            ray_workers = self.workers
+            if async_run_tensor_parallel_workers_only:
+                ray_workers = self.non_driver_workers
            ray_worker_outputs = [
                worker.execute_method.remote(method, *worker_args,
                                             **worker_kwargs)
                for (worker, worker_args, worker_kwargs
-                     ) in zip(self.workers, all_worker_args, all_worker_kwargs)
+                     ) in zip(ray_workers, all_worker_args, all_worker_kwargs)
            ]
-        if async_run_remote_workers_only:
+        if async_run_tensor_parallel_workers_only:
            # Just return futures
            return ray_worker_outputs
@@ -304,12 +368,41 @@ class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync):
        self,
        execute_model_req: Optional[ExecuteModelRequest] = None
    ) -> List[SamplerOutput]:
-        return await self.driver_exec_method("execute_model",
+        if self.pp_locks is None:
-                                             execute_model_req)
+            # This locks each pipeline parallel stage so multiple virtual
+            # engines can't execute on the same stage at the same time
+            # We create the locks here to avoid creating them in the constructor
+            # which uses a different asyncio loop.
+            self.pp_locks = [
+                asyncio.Lock()
+                for _ in range(self.parallel_config.pipeline_parallel_size)
+            ]
+        async def _run_task_with_lock(task, lock, *args, **kwargs):
+            async with lock:
+                return await task(*args, **kwargs)
+        tasks = []
+        tasks.append(
+            asyncio.create_task(
+                _run_task_with_lock(self.driver_exec_method, self.pp_locks[0],
+                                    "execute_model", execute_model_req)))
+        for pp_rank, driver_worker in enumerate(self.tp_driver_workers,
+                                                start=1):
+            tasks.append(
+                asyncio.create_task(
+                    _run_task_with_lock(driver_worker.execute_method.remote,
+                                        self.pp_locks[pp_rank],
+                                        "execute_model", execute_model_req)))
+        results = await asyncio.gather(*tasks)
+        # Only the last PP stage has the final results.
+        return results[-1]
    async def _start_worker_execution_loop(self):
        coros = [
            worker.execute_method.remote("start_worker_execution_loop")
-            for worker in self.workers
+            for worker in self.non_driver_workers
        ]
        return await asyncio.gather(*coros)
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -3,7 +3,7 @@ from typing import List, Optional, Tuple
 from vllm.config import ParallelConfig
 from vllm.logger import init_logger
-from vllm.utils import get_ip, is_hip
+from vllm.utils import get_ip, is_hip, is_xpu
 from vllm.worker.worker_base import WorkerWrapperBase
 logger = init_logger(__name__)
@@ -42,14 +42,26 @@ try:
            output = pickle.dumps(output)
            return output
+    ray_import_err = None
 except ImportError as e:
-    logger.warning(
-        "Failed to import Ray with %r. For multi-node inference, "
-        "please install Ray with `pip install ray`.", e)
    ray = None  # type: ignore
+    ray_import_err = e
    RayWorkerWrapper = None  # type: ignore
+def ray_is_available() -> bool:
+    """Returns True if Ray is available."""
+    return ray is not None
+def assert_ray_available():
+    """Raise an exception if Ray is not available."""
+    if ray is None:
+        raise ValueError("Failed to import Ray, please install Ray with "
+                         "`pip install ray`.") from ray_import_err
 def initialize_ray_cluster(
    parallel_config: ParallelConfig,
    ray_address: Optional[str] = None,
@@ -65,13 +77,10 @@ def initialize_ray_cluster(
        ray_address: The address of the Ray cluster. If None, uses
            the default Ray cluster address.
    """
-    if ray is None:
+    assert_ray_available()
-        raise ImportError(
-            "Ray is not installed. Please install Ray to use multi-node "
-            "serving.")
    # Connect to a ray cluster.
-    if is_hip():
+    if is_hip() or is_xpu():
        ray.init(address=ray_address,
                 ignore_reinit_error=True,
                 num_gpus=parallel_config.world_size)

--- a/vllm/executor/ray_xpu_executor.py
+++ b/vllm/executor/ray_xpu_executor.py
+import asyncio
+import os
+import pickle
+from collections import defaultdict
+from itertools import islice, repeat
+from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Set,
+                    Tuple, Union)
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+                         ModelConfig, MultiModalConfig, ParallelConfig,
+                         PromptAdapterConfig, SchedulerConfig,
+                         SpeculativeConfig)
+from vllm.executor.distributed_gpu_executor import (  # yapf: disable
+    DistributedGPUExecutor, DistributedGPUExecutorAsync)
+from vllm.executor.ray_utils import RayWorkerWrapper, ray
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        make_async)
+if ray is not None:
+    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+if TYPE_CHECKING:
+    from ray.util.placement_group import PlacementGroup
+logger = init_logger(__name__)
+# If the env var is set, it uses the Ray's compiled DAG API
+# which optimizes the control plane overhead.
+# Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it.
+USE_RAY_COMPILED_DAG = bool(os.getenv("VLLM_USE_RAY_COMPILED_DAG", 0))
+class RayXPUExecutor(DistributedGPUExecutor):
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        cache_config: CacheConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        load_config: LoadConfig,
+        lora_config: Optional[LoRAConfig],
+        multimodal_config: Optional[MultiModalConfig],
+        prompt_adapter_config: Optional[PromptAdapterConfig],
+        speculative_config: Optional[SpeculativeConfig],
+    ) -> None:
+        assert device_config.device_type == "xpu"
+        assert (not speculative_config
+                ), "Speculative decoding not yet supported for XPU backend"
+        self.model_config = model_config
+        self.cache_config = cache_config
+        self.load_config = load_config
+        self.lora_config = lora_config
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+        self.device_config = device_config
+        self.multimodal_config = multimodal_config
+        self.prompt_adapter_config = prompt_adapter_config
+        placement_group = self.parallel_config.placement_group
+        # Disable Ray usage stats collection.
+        ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
+        if ray_usage != "1":
+            os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
+        # Create the parallel GPU workers.
+        self._init_workers_ray(placement_group)
+        # Profile the memory usage and initialize the cache.
+        self.forward_dag = None
+        if USE_RAY_COMPILED_DAG:
+            self.forward_dag = self._compiled_ray_dag()
+        # This is non-None when the execute model loop is running
+        # in the parallel workers. It's a coroutine in the AsyncLLMEngine case.
+        self.parallel_worker_tasks: Optional[Union[Any, Awaitable[Any]]] = None
+        # Updated by implementations that require additional args to be passed
+        # to the _run_workers execute_model call
+        self.extra_execute_model_run_workers_kwargs: Dict[str, Any] = {}
+    def _init_executor(self) -> None:
+        pass
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Determine the number of available KV blocks.
+        This invokes `determine_num_available_blocks` on each worker and takes
+        the min of the results, guaranteeing that the selected cache sizes are
+        compatible with all workers.
+        Returns:
+            - Tuple[num_gpu_blocks, num_cpu_blocks]
+        """
+        # Get the maximum number of blocks that can be allocated on GPU and CPU.
+        num_blocks = self._run_workers("determine_num_available_blocks", )
+        # Since we use a shared centralized controller, we take the minimum
+        # number of blocks across all workers to make sure all the memory
+        # operators can be applied to all workers.
+        num_gpu_blocks = min(b[0] for b in num_blocks)
+        num_cpu_blocks = min(b[1] for b in num_blocks)
+        return num_gpu_blocks, num_cpu_blocks
+    def _init_workers_ray(self, placement_group: "PlacementGroup",
+                          **ray_remote_kwargs):
+        if self.parallel_config.tensor_parallel_size == 1:
+            # For single GPU case, we use a ray worker with constrained memory.
+            num_gpus = self.cache_config.gpu_memory_utilization
+        else:
+            # Otherwise, the ray workers are allocated with a full GPU.
+            num_gpus = 1
+        # The driver dummy worker does not actually use any resources.
+        # It holds the resource for the driver worker.
+        self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
+        # The remaining workers are the actual ray actors.
+        self.workers: List[RayWorkerWrapper] = []
+        # Create the workers.
+        driver_ip = get_ip()
+        for bundle_id, bundle in enumerate(placement_group.bundle_specs):
+            if not bundle.get("GPU", 0):
+                continue
+            scheduling_strategy = PlacementGroupSchedulingStrategy(
+                placement_group=placement_group,
+                placement_group_capture_child_tasks=True,
+                placement_group_bundle_index=bundle_id,
+            )
+            worker = ray.remote(
+                num_cpus=0,
+                num_gpus=num_gpus,
+                scheduling_strategy=scheduling_strategy,
+                **ray_remote_kwargs,
+            )(RayWorkerWrapper).remote(
+                worker_module_name="vllm.worker.xpu_worker",
+                worker_class_name="XPUWorker",
+                trust_remote_code=self.model_config.trust_remote_code,
+            )
+            worker_ip = ray.get(worker.get_node_ip.remote())
+            if worker_ip == driver_ip and self.driver_dummy_worker is None:
+                # If the worker is on the same node as the driver, we use it
+                # as the resource holder for the driver process.
+                self.driver_dummy_worker = worker
+                self.driver_worker = RayWorkerWrapper(
+                    worker_module_name="vllm.worker.xpu_worker",
+                    worker_class_name="XPUWorker",
+                    trust_remote_code=self.model_config.trust_remote_code,
+                )
+            else:
+                # Else, added to the list of workers.
+                self.workers.append(worker)
+        if self.driver_dummy_worker is None:
+            raise ValueError(
+                "Ray does not allocate any GPUs on the driver node. Consider "
+                "adjusting the Ray placement group or running the driver on a "
+                "GPU node.")
+        # Get the set of GPU IDs used on each node.
+        worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
+                                                    use_dummy_driver=True)
+        node_workers = defaultdict(list)
+        node_gpus = defaultdict(list)
+        for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
+            node_workers[node_id].append(i)
+            node_gpus[node_id].extend(gpu_ids)
+        for node_id, gpu_ids in node_gpus.items():
+            node_gpus[node_id] = sorted(gpu_ids)
+        # TODO: add env var for xpu
+        distributed_init_method = get_distributed_init_method(
+            driver_ip, get_open_port())
+        def collect_arg_helper_func(**kwargs):
+            # avoid writing `{"name": value}` manually
+            return kwargs
+        init_worker_all_kwargs = []
+        # Initialize the actual workers inside worker wrapper.
+        for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids, ):
+            local_rank = node_workers[node_id].index(rank)
+            init_worker_all_kwargs.append(
+                collect_arg_helper_func(
+                    model_config=self.model_config,
+                    parallel_config=self.parallel_config,
+                    scheduler_config=self.scheduler_config,
+                    device_config=self.device_config,
+                    cache_config=self.cache_config,
+                    load_config=self.load_config,
+                    local_rank=local_rank,
+                    rank=rank,
+                    distributed_init_method=distributed_init_method,
+                    lora_config=self.lora_config,
+                    multimodal_config=self.multimodal_config,
+                    is_driver_worker=rank == 0,
+                ))
+        self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
+        self._run_workers("init_device")
+        self._run_workers(
+            "load_model",
+            max_concurrent_workers=self.parallel_config.
+            max_parallel_loading_workers,
+        )
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        """Initialize the KV cache in all workers.
+        """
+        # NOTE: We log here to avoid multiple logs when number of workers is
+        # greater than one. We could log in the engine, but not all executors
+        # have GPUs.
+        logger.info("# GPU blocks: %d, "
+                    "# CPU blocks: %d", num_gpu_blocks, num_cpu_blocks)
+        self.cache_config.num_gpu_blocks = num_gpu_blocks
+        self.cache_config.num_cpu_blocks = num_cpu_blocks
+        self._run_workers("initialize_cache",
+                          num_gpu_blocks=num_gpu_blocks,
+                          num_cpu_blocks=num_cpu_blocks)
+    def _driver_execute_model(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None
+    ) -> List[SamplerOutput]:
+        """Run execute_model in the driver worker.
+        Passing None will cause the driver to stop the model execution
+        loop running in each of the remote workers.
+        """
+        return self.driver_worker.execute_method("execute_model",
+                                                 execute_model_req)
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
+        return self._run_workers(
+            "add_lora",
+            lora_request=lora_request,
+        )
+    def remove_lora(self, lora_id: int) -> bool:
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return self._run_workers(
+            "remove_lora",
+            lora_id=lora_id,
+        )
+    def list_loras(self) -> Set[int]:
+        return self._run_workers("list_loras")
+    def _run_workers(
+        self,
+        method: str,
+        *args,
+        async_run_remote_workers_only: bool = False,
+        all_args: Optional[List[Tuple[Any, ...]]] = None,
+        all_kwargs: Optional[List[Dict[str, Any]]] = None,
+        use_dummy_driver: bool = False,
+        max_concurrent_workers: Optional[int] = None,
+        use_ray_compiled_dag: bool = False,
+        **kwargs,
+    ) -> Any:
+        """Runs the given method on all workers. Can be used in the following
+        ways:
+        - args/kwargs: All workers share the same args/kwargs
+        - args/kwargs and driver_args/driver_kwargs: Driver worker has
+          different args
+        - all_args/all_kwargs: args/kwargs for each worker are specified
+          individually
+        """
+        if max_concurrent_workers:
+            raise NotImplementedError(
+                "max_concurrent_workers is not supported yet.")
+        count = len(self.workers)
+        all_worker_args = repeat(args, count) if all_args is None \
+            else islice(all_args, 1, None)
+        all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
+            else islice(all_kwargs, 1, None)
+        if use_ray_compiled_dag:
+            # Right now, compiled DAG can only accept a single
+            # input. TODO(sang): Fix it.
+            assert self.forward_dag is not None
+            output_channels = self.forward_dag.execute(1)
+        else:
+            # Start the ray workers first.
+            ray_worker_outputs = [
+                worker.execute_method.remote(method, *worker_args,
+                                             **worker_kwargs)
+                for (worker, worker_args, worker_kwargs
+                     ) in zip(self.workers, all_worker_args, all_worker_kwargs)
+            ]
+        if async_run_remote_workers_only:
+            # Just return futures
+            return ray_worker_outputs
+        driver_args = args if all_args is None else all_args[0]
+        driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
+        # Start the driver worker after all the ray workers.
+        if not use_dummy_driver:
+            driver_worker_output = self.driver_worker.execute_method(
+                method, *driver_args, **driver_kwargs)
+        else:
+            assert self.driver_dummy_worker is not None
+            driver_worker_output = ray.get(
+                self.driver_dummy_worker.execute_method.remote(
+                    method, *driver_args, **driver_kwargs))
+        # Get the results of the ray workers.
+        if self.workers:
+            if use_ray_compiled_dag:
+                try:
+                    ray_worker_outputs = [
+                        pickle.loads(chan.begin_read())
+                        for chan in output_channels
+                    ]
+                finally:
+                    # Has to call end_read in order to reuse the DAG.
+                    for chan in output_channels:
+                        chan.end_read()
+            else:
+                ray_worker_outputs = ray.get(ray_worker_outputs)
+        return [driver_worker_output] + ray_worker_outputs
+    def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
+        """Wait for futures returned from _run_workers() with
+        async_run_remote_workers_only to complete."""
+        ray.get(parallel_worker_tasks)
+    def _compiled_ray_dag(self):
+        import pkg_resources
+        required_version = "2.9"
+        current_version = pkg_resources.get_distribution("ray").version
+        if current_version < required_version:
+            raise ValueError(f"Ray version {required_version} or greater is "
+                             f"required, but found {current_version}")
+        from ray.dag import InputNode, MultiOutputNode
+        assert self.parallel_config.worker_use_ray
+        # Right now, compiled DAG requires at least 1 arg. We send
+        # a dummy value for now. It will be fixed soon.
+        with InputNode() as input_data:
+            forward_dag = MultiOutputNode([
+                worker.execute_model_compiled_dag_remote.
+                bind(  # type: ignore[attr-defined]
+                    input_data) for worker in self.workers
+            ])
+        return forward_dag.experimental_compile()
+    def check_health(self) -> None:
+        """Raises an error if engine is unhealthy."""
+        self._check_if_any_actor_is_dead()
+    def _check_if_any_actor_is_dead(self):
+        if not self.workers:
+            return
+        dead_actors = []
+        for actor in self.workers:
+            actor_state = ray.state.actors(actor._ray_actor_id.hex())  # pylint: disable=protected-access
+            if actor_state["State"] == "DEAD":
+                dead_actors.append(actor)
+        if dead_actors:
+            raise RuntimeError("At least one Worker is dead. "
+                               f"Dead Workers: {dead_actors}. ")
+class RayXPUExecutorAsync(RayXPUExecutor, DistributedGPUExecutorAsync):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.driver_exec_method = make_async(self.driver_worker.execute_method)
+    async def _driver_execute_model_async(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None
+    ) -> List[SamplerOutput]:
+        return await self.driver_exec_method("execute_model",
+                                             execute_model_req)
+    async def _start_worker_execution_loop(self):
+        coros = [
+            worker.execute_method.remote("start_worker_execution_loop")
+            for worker in self.workers
+        ]
+        return await asyncio.gather(*coros)
--- a/vllm/executor/tpu_executor.py
+++ b/vllm/executor/tpu_executor.py
+from typing import Any, Dict, List, Optional, Set, Tuple
+import torch
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        make_async)
+logger = init_logger(__name__)
+class TPUExecutor(ExecutorBase):
+    def _init_executor(self) -> None:
+        assert not self.scheduler_config.chunked_prefill_enabled, (
+            "Chunked prefill is not yet supported for TPU backend")
+        assert not self.speculative_config, (
+            "Speculative decoding is not yet supported for TPU backend")
+        if self.model_config.dtype in (torch.float16, torch.float32):
+            logger.warning(
+                "The TPU backend currently does not support %s. "
+                "Using bfloat16 instead.", self.model_config.dtype)
+            self.model_config.dtype = torch.bfloat16
+        # Instantiate the worker and load the model to the device.
+        self.driver_worker = self._create_worker()
+        self.driver_worker.init_device()
+        self.driver_worker.load_model()
+    def _get_worker_kwargs(
+        self,
+        local_rank: int = 0,
+        rank: int = 0,
+        distributed_init_method: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        """Return worker init args for a given rank."""
+        if distributed_init_method is None:
+            distributed_init_method = get_distributed_init_method(
+                get_ip(), get_open_port())
+        return dict(
+            model_config=self.model_config,
+            parallel_config=self.parallel_config,
+            scheduler_config=self.scheduler_config,
+            device_config=self.device_config,
+            cache_config=self.cache_config,
+            load_config=self.load_config,
+            local_rank=local_rank,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            multimodal_config=self.multimodal_config,
+            is_driver_worker=rank == 0,
+        )
+    def _create_worker(
+        self,
+        local_rank: int = 0,
+        rank: int = 0,
+        distributed_init_method: Optional[str] = None,
+    ):
+        from vllm.worker.tpu_worker import TPUWorker
+        worker = TPUWorker(**self._get_worker_kwargs(local_rank, rank,
+                                                     distributed_init_method))
+        return worker
+    def initialize_cache(
+        self,
+        num_gpu_blocks: int,
+        num_cpu_blocks: int,
+    ) -> None:
+        """Initialize the KV cache by invoking the underlying worker."""
+        # NOTE: This is logged in the executor because there can be >1 worker
+        # with other executors. We could log in the engine level, but work
+        # remains to abstract away the device for non-GPU configurations.
+        logger.info("# TPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
+                    num_cpu_blocks)
+        self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Determine the number of available KV blocks by invoking the
+        underlying worker."""
+        return self.driver_worker.determine_num_available_blocks()
+    def execute_model(
+        self,
+        execute_model_req: ExecuteModelRequest,
+    ) -> List[SamplerOutput]:
+        output = self.driver_worker.execute_model(execute_model_req)
+        return output
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        raise NotImplementedError(
+            "LoRA is currently not supported by the TPU backend.")
+    def remove_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError(
+            "LoRA is currently not supported by the TPU backend.")
+    def pin_lora(self, lora_id: int) -> bool:
+        raise NotImplementedError(
+            "LoRA is currently not supported by the TPU backend.")
+    def list_loras(self) -> Set[int]:
+        raise NotImplementedError(
+            "LoRA is currently not supported by the TPU backend.")
+    def add_prompt_adapter(self, prompt_adapter_request) -> bool:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the TPU backend.")
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the TPU backend.")
+    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the TPU backend.")
+    def list_prompt_adapters(self) -> Set[int]:
+        raise NotImplementedError(
+            "Soft prompt is currently not supported by the TPU backend.")
+    def check_health(self) -> None:
+        # TPUExecutor will always be healthy as long as it's running.
+        return
+class TPUExecutorAsync(TPUExecutor, ExecutorAsyncBase):
+    async def execute_model_async(
+        self,
+        sexecute_model_req: ExecuteModelRequest,
+    ) -> SamplerOutput:
+        output = await make_async(self.driver_worker.execute_model
+                                  )(sexecute_model_req)
+        return output
--- a/vllm/executor/xpu_executor.py
+++ b/vllm/executor/xpu_executor.py
+from typing import List, Optional
+import torch
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+                         ModelConfig, MultiModalConfig, ParallelConfig,
+                         PromptAdapterConfig, SchedulerConfig,
+                         SpeculativeConfig)
+from vllm.executor.executor_base import ExecutorAsyncBase
+from vllm.executor.gpu_executor import GPUExecutor
+from vllm.logger import init_logger
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.utils import make_async
+from vllm.worker.worker_base import WorkerWrapperBase
+logger = init_logger(__name__)
+class XPUExecutor(GPUExecutor):
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        cache_config: CacheConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+        load_config: LoadConfig,
+        lora_config: Optional[LoRAConfig],
+        multimodal_config: Optional[MultiModalConfig],
+        prompt_adapter_config: Optional[PromptAdapterConfig],
+        speculative_config: Optional[SpeculativeConfig],
+    ) -> None:
+        assert device_config.device_type == "xpu"
+        assert (not speculative_config
+                ), "Speculative decoding not yet supported for XPU backend"
+        model_config = _verify_and_get_model_config(model_config)
+        self.model_config = model_config
+        self.cache_config = cache_config
+        self.load_config = load_config
+        self.lora_config = lora_config
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+        self.device_config = device_config
+        self.multimodal_config = multimodal_config
+        self.prompt_adapter_config = prompt_adapter_config
+        self.speculative_config = None
+        # Instantiate the worker and load the model to GPU.
+        self._init_executor()
+    def _create_worker(self,
+                       local_rank: int = 0,
+                       rank: int = 0,
+                       distributed_init_method: Optional[str] = None):
+        if self.speculative_config is None:
+            worker_module_name = "vllm.worker.xpu_worker"
+            worker_class_name = "XPUWorker"
+        else:
+            raise NotImplementedError(
+                "XPU does not support speculative decoding")
+        wrapper = WorkerWrapperBase(
+            worker_module_name=worker_module_name,
+            worker_class_name=worker_class_name,
+        )
+        wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank,
+                                                      distributed_init_method))
+        return wrapper.worker
+    def execute_model(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        output = self.driver_worker.execute_model(execute_model_req)
+        return output
+class XPUExecutorAsync(XPUExecutor, ExecutorAsyncBase):
+    async def execute_model_async(
+        self,
+        execute_model_req: ExecuteModelRequest,
+    ) -> List[SamplerOutput]:
+        output = await make_async(self.driver_worker.execute_model
+                                  )(execute_model_req=execute_model_req)
+        return output
+def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
+    if config.dtype == torch.bfloat16:
+        logger.warning(
+            "bfloat16 is not fully supported on XPU, casting to float16.")
+        config.dtype = torch.float16
+    if not config.enforce_eager:
+        logger.warning(
+            "CUDA graph is not supported on XPU, fallback to the eager "
+            "mode.")
+        config.enforce_eager = True
+    return config