merge v0.4.0

7c4f76e3 · zhuwenwen · 2da0dd3e · 51c31bc1 · 7c4f76e3 · 7c4f76e3
Commit 7c4f76e3 authored Apr 15, 2024 by zhuwenwen
12 changed files
--- a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py
+from abc import ABC, abstractmethod
+from typing import List, Optional
+
+from transformers import PreTrainedTokenizer
+
+from vllm.lora.request import LoRARequest
+
+
+class BaseTokenizerGroup(ABC):
+    """A group of tokenizers that can be used for LoRA adapters."""
+
+    @abstractmethod
+    def ping(self) -> bool:
+        """Check if the tokenizer group is alive."""
+        pass
+
+    @abstractmethod
+    def get_max_input_len(self,
+                          lora_request: Optional[LoRARequest] = None
+                          ) -> Optional[int]:
+        """Get the maximum input length for the LoRA request."""
+        pass
+
+    @abstractmethod
+    def encode(self,
+               prompt: str,
+               request_id: Optional[str] = None,
+               lora_request: Optional[LoRARequest] = None) -> List[int]:
+        """Encode a prompt using the tokenizer group."""
+        pass
+
+    @abstractmethod
+    async def encode_async(
+            self,
+            prompt: str,
+            request_id: Optional[str] = None,
+            lora_request: Optional[LoRARequest] = None) -> List[int]:
+        """Encode a prompt using the tokenizer group."""
+        pass
+
+    @abstractmethod
+    def get_lora_tokenizer(
+            self,
+            lora_request: Optional[LoRARequest] = None
+    ) -> "PreTrainedTokenizer":
+        """Get a tokenizer for a LoRA request."""
+        pass
+
+    @abstractmethod
+    async def get_lora_tokenizer_async(
+            self,
+            lora_request: Optional[LoRARequest] = None
+    ) -> "PreTrainedTokenizer":
+        """Get a tokenizer for a LoRA request."""
+        pass
--- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
+import asyncio
+import os
+from typing import List, Optional
+
+from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
+from transformers import PreTrainedTokenizer
+
+from vllm.config import TokenizerPoolConfig
+from vllm.engine.ray_utils import ray
+from vllm.lora.request import LoRARequest
+from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
+    BaseTokenizerGroup)
+from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
+    TokenizerGroup)
+
+
+class RayTokenizerGroupPool(BaseTokenizerGroup):
+    """A Ray-based pool of TokenizerGroups for async tokenization."""
+
+    # Class to use for workers making up the pool.
+    _worker_cls = TokenizerGroup
+
+    @classmethod
+    def from_config(cls, tokenizer_pool_config: TokenizerPoolConfig,
+                    **init_kwargs) -> "RayTokenizerGroupPool":
+        ray_actor_options = (tokenizer_pool_config.extra_config or {
+            "num_cpus": 0
+        })
+        ray_actor_options.setdefault(
+            "scheduling_strategy",
+            NodeAffinitySchedulingStrategy(
+                node_id=ray.get_runtime_context().get_node_id(), soft=True))
+
+        # Carry over the env vars to the actors.
+        # This is necessary for API keys and such.
+        ray_actor_options.setdefault("runtime_env", {})
+        _carry_over_env_vars_to_runtime_env(ray_actor_options["runtime_env"])
+
+        init_kwargs["num_actors"] = tokenizer_pool_config.pool_size
+        init_kwargs["ray_actor_options"] = ray_actor_options
+
+        return cls(**init_kwargs)
+
+    def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
+                 max_input_length: Optional[int], num_actors: int,
+                 ray_actor_options: dict, **tokenizer_config):
+        # Store a local copy of the TokenizerGroup for quick access
+        # to underlying HF tokenizers.
+        self._local_tokenizer_group = self._worker_cls(
+            tokenizer_id=tokenizer_id,
+            enable_lora=enable_lora,
+            max_num_seqs=max_num_seqs,
+            max_input_length=max_input_length,
+        )
+
+        ray_tokenizer_group_cls = ray.remote(
+            self._worker_cls).options(**ray_actor_options)
+        self.tokenizer_actors = [
+            ray_tokenizer_group_cls.remote(tokenizer_id, enable_lora,
+                                           max_num_seqs, max_input_length,
+                                           **tokenizer_config)
+            for _ in range(num_actors)
+        ]
+        self._idle_actors: Optional[asyncio.Queue] = None
+
+    @property
+    def pool_size(self) -> int:
+        return len(self.tokenizer_actors)
+
+    def ping(self):
+        return ray.get(
+            [actor.ping.remote() for actor in self.tokenizer_actors])
+
+    def _ensure_queue_initialized(self):
+        if self._idle_actors is None:
+            self._idle_actors = asyncio.Queue()
+            for actor in self.tokenizer_actors:
+                self._idle_actors.put_nowait(actor)
+
+    def encode(self,
+               prompt: str,
+               request_id: Optional[str] = None,
+               lora_request: Optional[LoRARequest] = None) -> List[int]:
+        """Encode a prompt using the tokenizer group.
+
+        We pick an idle actor and use it to encode the prompt.
+        The actor is then put back in the queue for future use.
+        This is blocking.
+        """
+        self._ensure_queue_initialized()
+
+        if self._idle_actors.empty():
+            raise RuntimeError("No idle actors available.")
+        actor = self._idle_actors.get_nowait()
+        try:
+            ret = ray.get(
+                actor.encode.remote(request_id=request_id,
+                                    prompt=prompt,
+                                    lora_request=lora_request))
+        finally:
+            # Put the actor back in the queue.
+            # This is done in a finally block to ensure that the actor is
+            # always put back in the queue, even if an exception/cancellation
+            # is raised.
+            self._idle_actors.put_nowait(actor)
+        return ret
+
+    async def encode_async(
+            self,
+            prompt: str,
+            request_id: Optional[str] = None,
+            lora_request: Optional[LoRARequest] = None) -> List[int]:
+        """Encode a prompt using the tokenizer group.
+
+        We pick an idle actor and use it to encode the prompt.
+        If there are no idle actors, we wait until one becomes
+        available.
+        The actor is then put back in the queue for future use.
+        This is non-blocking.
+        """
+        self._ensure_queue_initialized()
+
+        actor = await self._idle_actors.get()
+        try:
+            ret = await actor.encode.remote(request_id=request_id,
+                                            prompt=prompt,
+                                            lora_request=lora_request)
+        finally:
+            # Put the actor back in the queue.
+            # This is done in a finally block to ensure that the actor is
+            # always put back in the queue, even if an exception/cancellation
+            # is raised.
+            self._idle_actors.put_nowait(actor)
+        return ret
+
+    def get_max_input_len(self,
+                          lora_request: Optional[LoRARequest] = None
+                          ) -> Optional[int]:
+        """Get the maximum input length for the LoRA request."""
+        return self._local_tokenizer_group.get_max_input_len(lora_request)
+
+    def get_lora_tokenizer(
+            self,
+            lora_request: Optional[LoRARequest] = None
+    ) -> "PreTrainedTokenizer":
+        return self._local_tokenizer_group.get_lora_tokenizer(lora_request)
+
+    async def get_lora_tokenizer_async(
+            self,
+            lora_request: Optional[LoRARequest] = None
+    ) -> "PreTrainedTokenizer":
+        return await self._local_tokenizer_group.get_lora_tokenizer_async(
+            lora_request)
+
+
+def _carry_over_env_vars_to_runtime_env(runtime_env: dict) -> None:
+    """Copy over all current process environment variables to the runtime_env.
+
+    The variables in runtime_env will take precedence over the current process
+    environment variables.
+
+    runtime_env will be modified in place."""
+    env_vars = os.environ.copy()
+    runtime_env.setdefault("env_vars", {})
+    env_vars.update(runtime_env["env_vars"])
+    runtime_env["env_vars"] = env_vars
--- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
+from typing import List, Optional
+
+from transformers import PreTrainedTokenizer
+
+from vllm.lora.request import LoRARequest
+from vllm.transformers_utils.tokenizer import (get_lora_tokenizer,
+                                               get_lora_tokenizer_async,
+                                               get_tokenizer)
+from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
+    BaseTokenizerGroup)
+from vllm.utils import LRUCache
+
+
+class TokenizerGroup(BaseTokenizerGroup):
+    """A group of tokenizers that can be used for LoRA adapters."""
+
+    def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
+                 max_input_length: Optional[int], **tokenizer_config):
+        self.tokenizer_id = tokenizer_id
+        self.tokenizer_config = tokenizer_config
+        self.enable_lora = enable_lora
+        self.max_input_length = max_input_length
+        self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
+        self.lora_tokenizers = LRUCache[PreTrainedTokenizer](
+            capacity=max_num_seqs) if enable_lora else None
+
+    def ping(self) -> bool:
+        """Check if the tokenizer group is alive."""
+        return True
+
+    def get_max_input_len(self,
+                          lora_request: Optional[LoRARequest] = None
+                          ) -> Optional[int]:
+        """Get the maximum input length for the LoRA request."""
+        return self.max_input_length
+
+    def encode(self,
+               prompt: str,
+               request_id: Optional[str] = None,
+               lora_request: Optional[LoRARequest] = None) -> List[int]:
+        tokenizer = self.get_lora_tokenizer(lora_request)
+        return tokenizer.encode(prompt)
+
+    async def encode_async(
+            self,
+            prompt: str,
+            request_id: Optional[str] = None,
+            lora_request: Optional[LoRARequest] = None) -> List[int]:
+        tokenizer = await self.get_lora_tokenizer_async(lora_request)
+        return tokenizer.encode(prompt)
+
+    def get_lora_tokenizer(
+            self,
+            lora_request: Optional[LoRARequest] = None
+    ) -> "PreTrainedTokenizer":
+        if not lora_request or not self.enable_lora:
+            return self.tokenizer
+        if lora_request.lora_int_id not in self.lora_tokenizers:
+            tokenizer = (get_lora_tokenizer(
+                lora_request, **self.tokenizer_config) or self.tokenizer)
+            self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
+            return tokenizer
+        else:
+            return self.lora_tokenizers.get(lora_request.lora_int_id)
+
+    async def get_lora_tokenizer_async(
+            self,
+            lora_request: Optional[LoRARequest] = None
+    ) -> "PreTrainedTokenizer":
+        if not lora_request or not self.enable_lora:
+            return self.tokenizer
+        if lora_request.lora_int_id not in self.lora_tokenizers:
+            tokenizer = (await get_lora_tokenizer_async(
+                lora_request, **self.tokenizer_config) or self.tokenizer)
+            self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
+            return tokenizer
+        else:
+            return self.lora_tokenizers.get(lora_request.lora_int_id)
--- a/vllm/transformers_utils/tokenizers/baichuan.py
+++ b/vllm/transformers_utils/tokenizers/baichuan.py
-# yapf: disable
 # Adapted from
 # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py
 # This includes a fix suggested in
@@ -13,7 +12,6 @@ import sentencepiece as spm
 from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
 from transformers.utils import logging

-
 logger = logging.get_logger(__name__)

 VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
@@ -52,27 +50,16 @@ class BaichuanTokenizer(PreTrainedTokenizer):
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
-        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
-        bos_token = (
-            AddedToken(bos_token, lstrip=False, rstrip=False)
-            if isinstance(bos_token, str)
-            else bos_token
-        )
-        eos_token = (
-            AddedToken(eos_token, lstrip=False, rstrip=False)
-            if isinstance(eos_token, str)
-            else eos_token
-        )
-        unk_token = (
-            AddedToken(unk_token, lstrip=False, rstrip=False)
-            if isinstance(unk_token, str)
-            else unk_token
-        )
-        pad_token = (
-            AddedToken(pad_token, lstrip=False, rstrip=False)
-            if isinstance(pad_token, str)
-            else pad_token
-        )
+        self.sp_model_kwargs = ({} if sp_model_kwargs is None else
+                                sp_model_kwargs)
+        bos_token = (AddedToken(bos_token, lstrip=False, rstrip=False)
+                     if isinstance(bos_token, str) else bos_token)
+        eos_token = (AddedToken(eos_token, lstrip=False, rstrip=False)
+                     if isinstance(eos_token, str) else eos_token)
+        unk_token = (AddedToken(unk_token, lstrip=False, rstrip=False)
+                     if isinstance(unk_token, str) else unk_token)
+        pad_token = (AddedToken(pad_token, lstrip=False, rstrip=False)
+                     if isinstance(pad_token, str) else pad_token)
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
@@ -107,7 +94,10 @@ class BaichuanTokenizer(PreTrainedTokenizer):

    def get_vocab(self):
        """Returns vocab as a dict"""
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab = {
+            self.convert_ids_to_tokens(i): i
+            for i in range(self.vocab_size)
+        }
        vocab.update(self.added_tokens_encoder)
        return vocab

@@ -130,7 +120,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
        out_string = ""
        prev_is_special = False
        for i, token in enumerate(tokens):
-            # make sure that special tokens are not decoded using sentencepiece model
+            # make sure that special tokens are not decoded using
+            # sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special and i != 0:
                    out_string += " "
@@ -143,9 +134,9 @@ class BaichuanTokenizer(PreTrainedTokenizer):
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string

-    def save_vocabulary(
-        self, save_directory, filename_prefix: Optional[str] = None
-    ) -> Tuple[str]:
+    def save_vocabulary(self,
+                        save_directory,
+                        filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the vocabulary and special tokens file to a directory.

@@ -157,24 +148,24 @@ class BaichuanTokenizer(PreTrainedTokenizer):
            `Tuple(str)`: Paths to the files saved.
        """
        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            logger.error(f"Vocabulary path ({save_directory}) "
+                         "should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory,
-            (filename_prefix + "-" if filename_prefix else "")
-            + VOCAB_FILES_NAMES["vocab_file"],
+            (filename_prefix + "-" if filename_prefix else "") +
+            VOCAB_FILES_NAMES["vocab_file"],
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(
-            out_vocab_file
-        ) and os.path.isfile(self.vocab_file):
+                out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

-        return (out_vocab_file,)
+        return (out_vocab_file, )

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
@@ -194,7 +185,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """
-        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        Retrieve sequence ids from a token list that has no special tokens
+        added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
@@ -202,11 +194,14 @@ class BaichuanTokenizer(PreTrainedTokenizer):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
-                Whether or not the token list is already formatted with special tokens for the model.
+            already_has_special_tokens (`bool`, *optional*, defaults to
+            `False`):
+                Whether or not the token list is already formatted with
+                special tokens for the model.

        Returns:
-            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]:
+            1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
@@ -220,20 +215,16 @@ class BaichuanTokenizer(PreTrainedTokenizer):

        if token_ids_1 is None:
            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
-        return (
-            bos_token_id
-            + ([0] * len(token_ids_0))
-            + eos_token_id
-            + bos_token_id
-            + ([0] * len(token_ids_1))
-            + eos_token_id
-        )
+        return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
+                bos_token_id + ([0] * len(token_ids_1)) + eos_token_id)

    def create_token_type_ids_from_sequences(
-        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
-    ) -> List[int]:
+            self,
+            token_ids_0: List[int],
+            token_ids_1: Optional[List[int]] = None) -> List[int]:
        """
-        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
+        Creates a mask from the two sequences passed to be used in a
+        sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
@@ -250,7 +241,8 @@ class BaichuanTokenizer(PreTrainedTokenizer):
                Optional second list of IDs for sequence pairs.

        Returns:
-            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids)
+            according to the given sequence(s).
        """
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

--- a/vllm/usage/__init__.py
+++ b/vllm/usage/__init__.py
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
+import datetime
+import json
+import logging
+import os
+import platform
+import time
+from enum import Enum
+from pathlib import Path
+from threading import Thread
+from typing import Dict, Optional
+from uuid import uuid4
+
+import cpuinfo
+import psutil
+import requests
+import torch
+
+_config_home = os.getenv("XDG_CONFIG_HOME", os.path.expanduser("~/.config"))
+_USAGE_STATS_JSON_PATH = os.path.join(_config_home, "vllm/usage_stats.json")
+_USAGE_STATS_DO_NOT_TRACK_PATH = os.path.join(_config_home,
+                                              "vllm/do_not_track")
+_USAGE_STATS_ENABLED = None
+_USAGE_STATS_SERVER = os.environ.get("VLLM_USAGE_STATS_SERVER",
+                                     "https://stats.vllm.ai")
+
+
+def is_usage_stats_enabled():
+    """Determine whether or not we can send usage stats to the server.
+    The logic is as follows:
+    - By default, it should be enabled.
+    - Two environment variables can disable it:
+        - DO_NOT_TRACK=1
+        - VLLM_NO_USAGE_STATS=1
+    - A file in the home directory can disable it if it exists:
+        - $HOME/.config/vllm/do_not_track
+    """
+    global _USAGE_STATS_ENABLED
+    if _USAGE_STATS_ENABLED is None:
+        do_not_track = os.environ.get("DO_NOT_TRACK", "0") == "1"
+        no_usage_stats = os.environ.get("VLLM_NO_USAGE_STATS", "0") == "1"
+        do_not_track_file = os.path.exists(_USAGE_STATS_DO_NOT_TRACK_PATH)
+
+        _USAGE_STATS_ENABLED = not (do_not_track or no_usage_stats
+                                    or do_not_track_file)
+    return _USAGE_STATS_ENABLED
+
+
+def _get_current_timestamp_ns() -> int:
+    return int(datetime.datetime.now(datetime.timezone.utc).timestamp() * 1e9)
+
+
+def _detect_cloud_provider() -> str:
+    # Try detecting through vendor file
+    vendor_files = [
+        "/sys/class/dmi/id/product_version", "/sys/class/dmi/id/bios_vendor",
+        "/sys/class/dmi/id/product_name",
+        "/sys/class/dmi/id/chassis_asset_tag", "/sys/class/dmi/id/sys_vendor"
+    ]
+    # Mapping of identifiable strings to cloud providers
+    cloud_identifiers = {
+        "amazon": "AWS",
+        "microsoft corporation": "AZURE",
+        "google": "GCP",
+        "oraclecloud": "OCI",
+    }
+
+    for vendor_file in vendor_files:
+        path = Path(vendor_file)
+        if path.is_file():
+            file_content = path.read_text().lower()
+            for identifier, provider in cloud_identifiers.items():
+                if identifier in file_content:
+                    return provider
+
+    # Try detecting through environment variables
+    env_to_cloud_provider = {
+        "RUNPOD_DC_ID": "RUNPOD",
+    }
+    for env_var, provider in env_to_cloud_provider.items():
+        if os.environ.get(env_var):
+            return provider
+
+    return "UNKNOWN"
+
+
+class UsageContext(str, Enum):
+    UNKNOWN_CONTEXT = "UNKNOWN_CONTEXT"
+    LLM_CLASS = "LLM_CLASS"
+    API_SERVER = "API_SERVER"
+    OPENAI_API_SERVER = "OPENAI_API_SERVER"
+    ENGINE_CONTEXT = "ENGINE_CONTEXT"
+
+
+class UsageMessage:
+    """Collect platform information and send it to the usage stats server."""
+
+    def __init__(self) -> None:
+        # NOTE: vLLM's server _only_ support flat KV pair.
+        # Do not use nested fields.
+
+        self.uuid = str(uuid4())
+
+        # Environment Information
+        self.provider: Optional[str] = None
+        self.num_cpu: Optional[int] = None
+        self.cpu_type: Optional[str] = None
+        self.cpu_family_model_stepping: Optional[str] = None
+        self.total_memory: Optional[int] = None
+        self.architecture: Optional[str] = None
+        self.platform: Optional[str] = None
+        self.gpu_count: Optional[int] = None
+        self.gpu_type: Optional[str] = None
+        self.gpu_memory_per_device: Optional[int] = None
+
+        # vLLM Information
+        self.model_architecture: Optional[str] = None
+        self.vllm_version: Optional[str] = None
+        self.context: Optional[str] = None
+
+        # Metadata
+        self.log_time: Optional[int] = None
+        self.source: Optional[str] = None
+
+    def report_usage(self,
+                     model_architecture: str,
+                     usage_context: UsageContext,
+                     extra_kvs: Dict[str, any] = None) -> None:
+        t = Thread(target=self._report_usage_worker,
+                   args=(model_architecture, usage_context, extra_kvs or {}),
+                   daemon=True)
+        t.start()
+
+    def _report_usage_worker(self, model_architecture: str,
+                             usage_context: UsageContext,
+                             extra_kvs: Dict[str, any]) -> None:
+        self._report_usage_once(model_architecture, usage_context, extra_kvs)
+        self._report_continous_usage()
+
+    def _report_usage_once(self, model_architecture: str,
+                           usage_context: UsageContext,
+                           extra_kvs: Dict[str, any]) -> None:
+        # Platform information
+        if torch.cuda.is_available():
+            device_property = torch.cuda.get_device_properties(0)
+            self.gpu_count = torch.cuda.device_count()
+            self.gpu_type = device_property.name
+            self.gpu_memory_per_device = device_property.total_memory
+        self.provider = _detect_cloud_provider()
+        self.architecture = platform.machine()
+        self.platform = platform.platform()
+        self.total_memory = psutil.virtual_memory().total
+
+        info = cpuinfo.get_cpu_info()
+        self.num_cpu = info.get("count", None)
+        self.cpu_type = info.get("brand_raw", "")
+        self.cpu_family_model_stepping = ",".join([
+            str(info.get("family", "")),
+            str(info.get("model", "")),
+            str(info.get("stepping", ""))
+        ])
+
+        # vLLM information
+        import vllm  # delayed import to prevent circular import
+        self.context = usage_context.value
+        self.vllm_version = vllm.__version__
+        self.model_architecture = model_architecture
+
+        # Metadata
+        self.log_time = _get_current_timestamp_ns()
+        self.source = os.environ.get("VLLM_USAGE_SOURCE", "production")
+
+        data = vars(self)
+        if extra_kvs:
+            data.update(extra_kvs)
+
+        self._write_to_file(data)
+        self._send_to_server(data)
+
+    def _report_continous_usage(self):
+        """Report usage every 10 minutes.
+
+        This helps us to collect more data points for uptime of vLLM usages.
+        This function can also help send over performance metrics over time.
+        """
+        while True:
+            time.sleep(600)
+            data = {"uuid": self.uuid, "log_time": _get_current_timestamp_ns()}
+
+            self._write_to_file(data)
+            self._send_to_server(data)
+
+    def _send_to_server(self, data):
+        try:
+            requests.post(_USAGE_STATS_SERVER, json=data)
+        except requests.exceptions.RequestException:
+            # silently ignore unless we are using debug log
+            logging.debug("Failed to send usage data to server")
+
+    def _write_to_file(self, data):
+        os.makedirs(os.path.dirname(_USAGE_STATS_JSON_PATH), exist_ok=True)
+        Path(_USAGE_STATS_JSON_PATH).touch(exist_ok=True)
+        with open(_USAGE_STATS_JSON_PATH, "a") as f:
+            json.dump(data, f)
+            f.write("\n")
+
+
+usage_message = UsageMessage()
--- a/vllm/utils.py
+++ b/vllm/utils.py
+import asyncio
 import enum
+import gc
 import os
 import socket
 import subprocess
 import uuid
+import warnings
+from collections import OrderedDict
+from functools import lru_cache, partial
 from platform import uname
-from typing import List, Tuple, Union
-from packaging.version import parse, Version
+from typing import (Any, Awaitable, Callable, Generic, Hashable, List,
+                    Optional, Tuple, TypeVar, Union)

 import psutil
 import torch
-import asyncio
-from functools import partial
-from typing import (
-    Awaitable,
-    Callable,
-    TypeVar,
-)
-from collections import OrderedDict
-from typing import Any, Hashable, Optional
+from packaging.version import Version, parse

 from vllm.logger import init_logger

@@ -51,10 +48,10 @@ class Counter:
        self.counter = 0


-class LRUCache:
+class LRUCache(Generic[T]):

    def __init__(self, capacity: int):
-        self.cache = OrderedDict()
+        self.cache = OrderedDict[Hashable, T]()
        self.capacity = capacity

    def __contains__(self, key: Hashable) -> bool:
@@ -63,10 +60,10 @@ class LRUCache:
    def __len__(self) -> int:
        return len(self.cache)

-    def __getitem__(self, key: Hashable) -> Any:
+    def __getitem__(self, key: Hashable) -> T:
        return self.get(key)

-    def __setitem__(self, key: Hashable, value: Any) -> None:
+    def __setitem__(self, key: Hashable, value: T) -> None:
        self.put(key, value)

    def __delitem__(self, key: Hashable) -> None:
@@ -75,7 +72,9 @@ class LRUCache:
    def touch(self, key: Hashable) -> None:
        self.cache.move_to_end(key)

-    def get(self, key: Hashable, default_value: Optional[Any] = None) -> int:
+    def get(self,
+            key: Hashable,
+            default_value: Optional[T] = None) -> Optional[T]:
        if key in self.cache:
            value = self.cache[key]
            self.cache.move_to_end(key)
@@ -83,12 +82,12 @@ class LRUCache:
            value = default_value
        return value

-    def put(self, key: Hashable, value: Any) -> None:
+    def put(self, key: Hashable, value: T) -> None:
        self.cache[key] = value
        self.cache.move_to_end(key)
        self._remove_old_if_needed()

-    def _on_remove(self, key: Hashable, value: Any):
+    def _on_remove(self, key: Hashable, value: T):
        pass

    def remove_oldest(self):
@@ -101,7 +100,7 @@ class LRUCache:
        while len(self.cache) > self.capacity:
            self.remove_oldest()

-    def pop(self, key: int, default_value: Optional[Any] = None) -> Any:
+    def pop(self, key: Hashable, default_value: Optional[Any] = None) -> T:
        run_on_remove = key in self.cache
        value = self.cache.pop(key, default_value)
        if run_on_remove:
@@ -118,6 +117,7 @@ def is_hip() -> bool:
    return torch.version.hip is not None


+@lru_cache(maxsize=None)
 def is_neuron() -> bool:
    try:
        import transformers_neuronx
@@ -126,15 +126,17 @@ def is_neuron() -> bool:
    return transformers_neuronx is not None


+@lru_cache(maxsize=None)
 def get_max_shared_memory_bytes(gpu: int = 0) -> int:
    """Returns the maximum shared memory per thread block in bytes."""
    # NOTE: This import statement should be executed lazily since
    # the Neuron-X backend does not have the `cuda_utils` module.
    from vllm._C import cuda_utils

-    max_shared_mem = cuda_utils.get_max_shared_memory_per_block_device_attribute(
-        gpu)
-    # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py will fail
+    max_shared_mem = (
+        cuda_utils.get_max_shared_memory_per_block_device_attribute(gpu))
+    # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py
+    # will fail
    assert max_shared_mem > 0, "max_shared_mem can not be zero"
    return int(max_shared_mem)

@@ -148,6 +150,7 @@ def random_uuid() -> str:
    return str(uuid.uuid4().hex)


+@lru_cache(maxsize=None)
 def in_wsl() -> bool:
    # Reference: https://github.com/microsoft/WSL/issues/4071
    return "microsoft" in " ".join(uname()).lower()
@@ -170,20 +173,41 @@ def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]:


 def get_ip() -> str:
+    host_ip = os.environ.get("HOST_IP")
+    if host_ip:
+        return host_ip
+
+    # IP is not set, try to get it from the network interface
+
    # try ipv4
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
-        s.connect(("dns.google", 80))  # Doesn't need to be reachable
+        s.connect(("8.8.8.8", 80))  # Doesn't need to be reachable
        return s.getsockname()[0]
-    except OSError:
-        # try ipv6
+    except Exception:
+        pass
+
+    # try ipv6
+    try:
        s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
-        s.connect(("dns.google", 80))
+        # Google's public DNS server, see
+        # https://developers.google.com/speed/public-dns/docs/using#addresses
+        s.connect(("2001:4860:4860::8888", 80))  # Doesn't need to be reachable
        return s.getsockname()[0]
+    except Exception:
+        pass
+
+    warnings.warn(
+        "Failed to get the IP address, using 0.0.0.0 by default."
+        "The value can be set by the environment variable HOST_IP.",
+        stacklevel=2)
+    return "0.0.0.0"


 def get_distributed_init_method(ip: str, port: int) -> str:
-    return f"tcp://{ip}:{port}"
+    # Brackets are not permitted in ipv4 addresses,
+    # see https://github.com/python/cpython/issues/103848
+    return f"tcp://[{ip}]:{port}" if ":" in ip else f"tcp://{ip}:{port}"


 def get_open_port() -> int:
@@ -203,14 +227,24 @@ def set_cuda_visible_devices(device_ids: List[int]) -> None:
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, device_ids))


+def chunk_list(lst, chunk_size):
+    """Yield successive chunk_size chunks from lst."""
+    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
+
+
+def cdiv(a: int, b: int) -> int:
+    """Ceiling division."""
+    return -(a // -b)
+
+
+@lru_cache(maxsize=None)
 def get_nvcc_cuda_version() -> Optional[Version]:
    cuda_home = os.environ.get('CUDA_HOME')
    if not cuda_home:
        cuda_home = '/usr/local/cuda'
        if os.path.isfile(cuda_home + '/bin/nvcc'):
-            logger.info(
-                f'CUDA_HOME is not found in the environment. Using {cuda_home} as CUDA_HOME.'
-            )
+            logger.info(f'CUDA_HOME is not found in the environment. '
+                        f'Using {cuda_home} as CUDA_HOME.')
        else:
            logger.warning(
                f'Not found nvcc in {cuda_home}. Skip cuda version check!')
@@ -309,3 +343,98 @@ def create_kv_caches_with_random(
                f"Does not support value cache of type {cache_dtype}")
        value_caches.append(value_cache)
    return key_caches, value_caches
+
+
+@lru_cache
+def print_warning_once(msg: str) -> None:
+    logger.warning(msg)
+
+
+@lru_cache(maxsize=None)
+def is_pin_memory_available() -> bool:
+
+    if in_wsl():
+        # Pinning memory in WSL is not supported.
+        # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
+        print_warning_once("Using 'pin_memory=False' as WSL is detected. "
+                           "This may slow down the performance.")
+        return False
+    elif is_neuron():
+        print_warning_once("Pin memory is not supported on Neuron.")
+        return False
+    return True
+
+
+class CudaMemoryProfiler:
+
+    def __init__(self, device=None):
+        self.device = device
+
+    def current_memory_usage(self) -> float:
+        # Return the memory usage in bytes.
+        torch.cuda.reset_peak_memory_stats(self.device)
+        mem = torch.cuda.max_memory_allocated(self.device)
+        return mem
+
+    def __enter__(self):
+        self.initial_memory = self.current_memory_usage()
+        # This allows us to call methods of the context manager if needed
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.final_memory = self.current_memory_usage()
+        self.consumed_memory = self.final_memory - self.initial_memory
+
+        # Force garbage collection
+        gc.collect()
+
+
+def str_to_int_tuple(s: str) -> Tuple[int]:
+    """Convert a string to a tuple of integers."""
+    try:
+        return tuple(map(int, s.split(",")))
+    except ValueError as e:
+        raise ValueError(
+            "String must be a series of integers separated by commas "
+            f"(e.g., 1, 2, 3). Given input: {s}") from e
+
+
+def pad_to_max_length(x: List[int], max_len: int, pad: int) -> List[int]:
+    assert len(x) <= max_len
+    return x + [pad] * (max_len - len(x))
+
+
+def make_tensor_with_pad(
+    x: List[List[int]],
+    max_len: int,
+    pad: int,
+    dtype: torch.dtype,
+    device: Optional[Union[str, torch.device]],
+) -> torch.Tensor:
+    """Make a padded tensor of a 2D inputs.
+
+    The padding is applied to the end of each inner list until it reaches
+    `max_len`.
+    """
+    padded_x = [pad_to_max_length(x_i, max_len, pad) for x_i in x]
+    return torch.tensor(padded_x, dtype=dtype, device=device)
+
+
+def async_tensor_h2d(
+    data: list,
+    dtype: torch.dtype,
+    target_device: Union[str, torch.device],
+    pin_memory: bool,
+) -> torch.Tensor:
+    """Asynchronously create a tensor and copy it from host to device."""
+    t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory, device="cpu")
+    return t.to(device=target_device, non_blocking=True)
+
+
+def maybe_expand_dim(tensor: torch.Tensor,
+                     target_dims: int,
+                     size: int = 1) -> torch.Tensor:
+    """Expand the tensor to the target_dims."""
+    if tensor.ndim < target_dims:
+        tensor = tensor.view(-1, *([size] * (target_dims - tensor.ndim)))
+    return tensor
--- a/vllm/worker/cache_engine.py
+++ b/vllm/worker/cache_engine.py
 """CacheEngine class for managing the KV cache."""
-from typing import Dict, List, Tuple
+from typing import Dict, List

 import torch

+from vllm.attention import get_attn_backend
 from vllm.config import CacheConfig, ModelConfig, ParallelConfig
 from vllm.logger import init_logger
-from vllm.utils import in_wsl, is_neuron, STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, is_pin_memory_available

 logger = init_logger(__name__)

-KVCache = Tuple[torch.Tensor, torch.Tensor]
-

 class CacheEngine:
    """Manages the KV cache.
@@ -38,119 +37,48 @@ class CacheEngine:
        self.num_gpu_blocks = cache_config.num_gpu_blocks
        self.num_cpu_blocks = cache_config.num_cpu_blocks

-        # Skip initializing CUDA stream and buffer for Neuron backend.
-        if is_neuron():
-            return
-
        if cache_config.cache_dtype == "auto":
            self.dtype = model_config.dtype
        else:
            self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype]

+        # Get attention backend.
+        self.attn_backend = get_attn_backend(model_config.dtype)
+
        # Initialize the cache.
-        self.gpu_cache = self.allocate_gpu_cache()
-        self.cpu_cache = self.allocate_cpu_cache()
-
-        # Initialize the stream for caching operations.
-        self.cache_stream = torch.cuda.Stream()
-        assert self.cache_stream != torch.cuda.current_stream()
-        # Initialize the events for stream synchronization.
-        self.events = [torch.cuda.Event() for _ in range(self.num_layers)]
-
-    def get_key_block_shape(self) -> Tuple[int, int, int, int]:
-        element_size = torch.tensor([], dtype=self.dtype).element_size()
-        x = 16 // element_size
-        return (
-            self.num_heads,
-            self.head_size // x,
-            self.block_size,
-            x,
-        )
-
-    def get_value_block_shape(self) -> Tuple[int, int, int]:
-        return (
-            self.num_heads,
-            self.head_size,
-            self.block_size,
-        )
-
-    def allocate_gpu_cache(self) -> List[KVCache]:
-        gpu_cache: List[KVCache] = []
-        key_block_shape = self.get_key_block_shape()
-        value_block_shape = self.get_value_block_shape()
-        for _ in range(self.num_layers):
-            key_blocks = torch.empty(
-                size=(self.num_gpu_blocks, *key_block_shape),
-                dtype=self.dtype,
-                device="cuda",
-            )
-            value_blocks = torch.empty(
-                size=(self.num_gpu_blocks, *value_block_shape),
-                dtype=self.dtype,
-                device="cuda",
-            )
-            gpu_cache.append((key_blocks, value_blocks))
-        return gpu_cache
-
-    def allocate_cpu_cache(self) -> List[KVCache]:
-        cpu_cache: List[KVCache] = []
-        key_block_shape = self.get_key_block_shape()
-        value_block_shape = self.get_value_block_shape()
-        pin_memory = not in_wsl()
-        if not pin_memory:
-            # Pinning memory in WSL is not supported.
-            # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
-            logger.warning("Using 'pin_memory=False' as WSL is detected. "
-                           "This may slow down the performance.")
-        for _ in range(self.num_layers):
-            key_blocks = torch.empty(
-                size=(self.num_cpu_blocks, *key_block_shape),
-                dtype=self.dtype,
-                pin_memory=pin_memory,
-                device="cpu",
-            )
-            value_blocks = torch.empty(
-                size=(self.num_cpu_blocks, *value_block_shape),
-                dtype=self.dtype,
-                pin_memory=pin_memory,
-                device="cpu",
-            )
-            cpu_cache.append((key_blocks, value_blocks))
-        return cpu_cache
-
-    def _swap(
+        self.gpu_cache = self._allocate_kv_cache(self.num_gpu_blocks, "cuda")
+        self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks, "cpu")
+
+    def _allocate_kv_cache(
        self,
-        src: List[KVCache],
-        dst: List[KVCache],
-        src_to_dst: Dict[int, int],
-    ) -> None:
-        from vllm._C import cache_ops
-
-        with torch.cuda.stream(self.cache_stream):
-            for i in range(self.num_layers):
-                src_key_cache, src_value_cache = src[i]
-                dst_key_cache, dst_value_cache = dst[i]
-                # Copy the key blocks.
-                cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst)
-                # Copy the value blocks.
-                cache_ops.swap_blocks(src_value_cache, dst_value_cache,
-                                      src_to_dst)
-                event = self.events[i]
-                event.record(stream=self.cache_stream)
+        num_blocks: int,
+        device: str,
+    ) -> List[torch.Tensor]:
+        """Allocates KV cache on the specified device."""
+        kv_cache_shape = self.attn_backend.get_kv_cache_shape(
+            num_blocks, self.block_size, self.num_heads, self.head_size)
+        pin_memory = is_pin_memory_available() if device == "cpu" else False
+        kv_cache: List[torch.Tensor] = []
+        for _ in range(self.num_layers):
+            kv_cache.append(
+                torch.empty(kv_cache_shape,
+                            dtype=self.dtype,
+                            pin_memory=pin_memory,
+                            device=device))
+        return kv_cache

    def swap_in(self, src_to_dst: Dict[int, int]) -> None:
-        self._swap(self.cpu_cache, self.gpu_cache, src_to_dst)
+        for i in range(self.num_layers):
+            self.attn_backend.swap_blocks(self.cpu_cache[i], self.gpu_cache[i],
+                                          src_to_dst)

    def swap_out(self, src_to_dst: Dict[int, int]) -> None:
-        self._swap(self.gpu_cache, self.cpu_cache, src_to_dst)
+        for i in range(self.num_layers):
+            self.attn_backend.swap_blocks(self.gpu_cache[i], self.cpu_cache[i],
+                                          src_to_dst)

    def copy(self, src_to_dsts: Dict[int, List[int]]) -> None:
-        from vllm._C import cache_ops
-
-        key_caches = [key_cache for key_cache, _ in self.gpu_cache]
-        value_caches = [value_cache for _, value_cache in self.gpu_cache]
-        # NOTE(woosuk): This operation implicitly synchronizes the CPU and GPU.
-        cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts)
+        self.attn_backend.copy_blocks(self.gpu_cache, src_to_dsts)

    @staticmethod
    def get_cache_block_size(

--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
 import contextlib
 import time
-from typing import Dict, List, Optional, Tuple, Set, Union
+from typing import Dict, List, Optional, Set, Tuple

 import numpy as np
 import torch
 import torch.nn as nn

-from vllm.config import (DeviceConfig, ModelConfig, LoRAConfig, ParallelConfig,
-                         SchedulerConfig)
+from vllm.attention import AttentionMetadata, get_attn_backend
+from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig, VisionLanguageConfig)
 from vllm.logger import init_logger
-from vllm.model_executor import get_model, InputMetadata, SamplingMetadata
-from vllm.model_executor.parallel_utils import cupy_utils
+from vllm.lora.layers import LoRAMapping
+from vllm.lora.request import LoRARequest
+from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
+from vllm.model_executor import SamplingMetadata
+from vllm.model_executor.model_loader import get_model
+from vllm.model_executor.parallel_utils import custom_all_reduce, pynccl_utils
 from vllm.model_executor.parallel_utils.communication_op import (
    broadcast_tensor_dict)
 from vllm.model_executor.parallel_utils.parallel_state import (
-    with_cupy_nccl_for_all_reduce)
-from vllm.model_executor.parallel_utils import custom_all_reduce
+    with_pynccl_for_all_reduce)
 from vllm.sampling_params import SamplingParams, SamplingType
-from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
-from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
-from vllm.lora.layers import LoRAMapping
-from vllm.lora.request import LoRARequest
-from vllm.utils import in_wsl
+from vllm.sequence import (MultiModalData, SamplerOutput, SequenceData,
+                           SequenceGroupMetadata)
+from vllm.utils import (CudaMemoryProfiler, async_tensor_h2d,
+                        is_pin_memory_available, make_tensor_with_pad,
+                        maybe_expand_dim)

 logger = init_logger(__name__)

-KVCache = Tuple[torch.Tensor, torch.Tensor]
 _PAD_SLOT_ID = -1
 LORA_WARMUP_RANK = 8
-# Capture graphs for batch size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256.
+_BATCH_SIZE_ALIGNMENT = 8
+# Capture graphs for token size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256.
 # NOTE: _get_graph_batch_size needs to be updated if this list is changed.
-_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)]
+_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [
+    _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33)
+]


 class ModelRunner:
@@ -44,6 +50,7 @@ class ModelRunner:
        lora_config: Optional[LoRAConfig],
        kv_cache_dtype: Optional[str] = "auto",
        is_driver_worker: bool = False,
+        vision_language_config: Optional[VisionLanguageConfig] = None,
    ):
        self.model_config = model_config
        self.parallel_config = parallel_config
@@ -76,27 +83,31 @@ class ModelRunner:
        # The shape of the cached block table will be
        # (max batch size to capture, max context len to capture / block size).
        self.graph_block_tables = None  # Set after initial profiling.
-        # cache in_wsl result
-        self.in_wsl = in_wsl()
+        self.pin_memory = is_pin_memory_available()
        self.kv_cache_dtype = kv_cache_dtype
+        self.vision_language_config = vision_language_config

-        # Set enforce_eager to True for Neuron backend, to avoid capturing graph
-        if self.device_config.is_neuron:
-            self.model_config.enforce_eager = True
+        self.attn_backend = get_attn_backend(
+            self.model_config.dtype if model_config is not None else None)

    def load_model(self) -> None:
-        self.model = get_model(self.model_config,
-                               self.device_config,
-                               lora_config=self.lora_config,
-                               parallel_config=self.parallel_config,
-                               scheduler_config=self.scheduler_config)
-
-        vocab_size = self.model.config.vocab_size
+        with CudaMemoryProfiler() as m:
+            self.model = get_model(
+                self.model_config,
+                self.device_config,
+                lora_config=self.lora_config,
+                vision_language_config=self.vision_language_config,
+                parallel_config=self.parallel_config,
+                scheduler_config=self.scheduler_config)
+
+        self.model_memory_usage = m.consumed_memory
+        logger.info(f"Loading model weights took "
+                    f"{self.model_memory_usage / float(2**30):.4f} GB")

        if self.lora_config:
-            assert hasattr(
-                self.model, "supported_lora_modules"
-            ) and self.model.supported_lora_modules, "Model does not support LoRA"
+            assert hasattr(self.model, "supported_lora_modules"
+                           ) and self.model.supported_lora_modules, (
+                               "Model does not support LoRA")
            assert hasattr(
                self.model,
                "embedding_modules"), "Model does not have embedding_modules"
@@ -104,8 +115,7 @@ class ModelRunner:
                           ), "Model does not have embedding_padding_modules"
            self.lora_manager = LRUCacheWorkerLoRAManager(
                self.scheduler_config.max_num_seqs,
-                self.scheduler_config.max_num_batched_tokens +
-                self.scheduler_config.max_paddings, vocab_size,
+                self.scheduler_config.max_num_batched_tokens, self.vocab_size,
                self.lora_config, self.device, self.model.embedding_modules,
                self.model.embedding_padding_modules)
            self.model = self.lora_manager.create_lora_manager(self.model)
@@ -113,20 +123,24 @@ class ModelRunner:
    def set_block_size(self, block_size: int) -> None:
        self.block_size = block_size

-        max_num_blocks = (self.max_context_len_to_capture + block_size -
-                          1) // block_size
        self.graph_block_tables = np.zeros(
-            (max(_BATCH_SIZES_TO_CAPTURE), max_num_blocks), dtype=np.int32)
+            (max(_BATCH_SIZES_TO_CAPTURE), self.get_max_block_per_batch()),
+            dtype=np.int32)
+
+    def get_max_block_per_batch(self) -> int:
+        block_size = self.block_size
+        return (self.max_context_len_to_capture + block_size - 1) // block_size

    def _prepare_prompt(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata, List[int], List[int],
-               List[int], List[int], Set[LoRARequest]]:
+    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int],
+               List[int], List[int], List[int], Set[LoRARequest],
+               torch.Tensor]:
        assert len(seq_group_metadata_list) > 0
-        input_tokens: List[List[int]] = []
-        input_positions: List[List[int]] = []
-        slot_mapping: List[List[int]] = []
+        input_tokens: List[int] = []
+        input_positions: List[int] = []
+        slot_mapping: List[int] = []
        lora_index_mapping: List[int] = []
        lora_prompt_mapping: List[int] = []
        lora_requests: Set[LoRARequest] = set()
@@ -135,53 +149,82 @@ class ModelRunner:
        context_lens: List[int] = []
        subquery_lens: List[int] = []
        prefix_block_tables: List[List[int]] = []
+        multi_modal_input_list: List[torch.Tensor] = []
+
        for seq_group_metadata in seq_group_metadata_list:
            assert seq_group_metadata.is_prompt
            seq_ids = list(seq_group_metadata.seq_data.keys())
            assert len(seq_ids) == 1
            seq_id = seq_ids[0]

+            computed_block_nums = seq_group_metadata.computed_block_nums
+            if (self.scheduler_config is not None
+                    and self.scheduler_config.chunked_prefill_enabled
+                    and computed_block_nums is not None):
+                raise RuntimeError(
+                    "chunked prefill cannot be used with prefix caching "
+                    "now.")
+
+            token_chunk_size = seq_group_metadata.token_chunk_size
            seq_data = seq_group_metadata.seq_data[seq_id]
-            prompt_tokens = seq_data.get_token_ids()
+            computed_len = seq_data.get_num_computed_tokens()
+            # We should use get_len here because in case of preemption
+            # it contains output tokens.
+            prefill_end = min(seq_data.get_len(),
+                              computed_len + token_chunk_size)
+            # TODO(sang): Rename it after chunked prefill is introduced.
+            prompt_tokens = seq_data.get_token_ids()[computed_len:prefill_end]
            prompt_len = len(prompt_tokens)
+            # Right now, the prefill_end is always same as the length of
+            # sequence. However, once chunked prefill is introduced, this
+            # assumption can be changed.
+            assert prefill_end == seq_data.get_len()
            prompt_lens.append(prompt_len)
-            prefix_len = 0
-            prefix = seq_group_metadata.prefix
-            if prefix is not None and prefix.computed:
-                prefix_len = prefix.get_length()
-                prompt_tokens = prompt_tokens[prefix_len:]
-                prefix_block_tables.append(prefix.get_block_numbers())
+
+            # NOTE: This only works for oooooooxxx style attention.
+            if computed_block_nums is not None and len(
+                    computed_block_nums) > 0 and self.sliding_window is None:
+                # Prefix is not supported with sliding_window
+                computed_len = len(computed_block_nums) * self.block_size
+                prompt_tokens = prompt_tokens[computed_len:]
+                prefix_block_tables.append(computed_block_nums)
            else:
                prefix_block_tables.append([])
+                # Right now, prefill start is always 0. However, this
+                # assumption can be changed once chunked prefill is introduced.
+                assert computed_len == 0
+
            # actual prompt lens
-            context_lens.append(prefix_len)
-            subquery_lens.append(prompt_len - prefix_len)
+            context_lens.append(computed_len)
+            subquery_lens.append(prompt_len - computed_len)

-            input_tokens.append(prompt_tokens)
+            input_tokens.extend(prompt_tokens)
            # NOTE(woosuk): Here we assume that the first token in the prompt
            # is always the first token in the sequence.
-            input_positions.append(
-                list(range(prefix_len, prefix_len + len(prompt_tokens))))
+            input_positions.extend(list(range(computed_len, prefill_end)))

            lora_id = seq_group_metadata.lora_int_id

            if lora_id > 0:
                lora_requests.add(seq_group_metadata.lora_request)

-            lora_index_mapping.append([lora_id] * (prompt_len - prefix_len))
+            lora_index_mapping += [lora_id] * (prompt_len - computed_len)
            lora_prompt_mapping.extend(
                [lora_id] *
-                (prompt_len - prefix_len
+                (prompt_len - computed_len
                 if seq_group_metadata.sampling_params.prompt_logprobs else 1))

+            if seq_group_metadata.multi_modal_data:
+                multi_modal_input_list.append(
+                    seq_group_metadata.multi_modal_data.data)
+
            if seq_group_metadata.block_tables is None:
                # During memory profiling, the block tables are not initialized
                # yet. In this case, we just use a dummy slot mapping.
-                slot_mapping.append([_PAD_SLOT_ID] * prompt_len)
+                slot_mapping.extend([_PAD_SLOT_ID] * prompt_len)
                continue

            # Compute the slot mapping.
-            slot_mapping.append([])
            block_table = seq_group_metadata.block_tables[seq_id]
            # Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID,
            # where start_idx is max(0, prompt_len - sliding_window).
@@ -190,86 +233,116 @@ class ModelRunner:
            # mapping will be [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1].
            start_idx = 0
            if self.sliding_window is not None:
-                assert prefix_len == 0, (
+                assert computed_len == 0, (
                    "Prefix caching is currently not supported with "
                    "sliding window attention")
                start_idx = max(0, prompt_len - self.sliding_window)
-            for i in range(prefix_len, prompt_len):
+
+            for i in range(computed_len, prefill_end):
                if i < start_idx:
-                    slot_mapping[-1].append(_PAD_SLOT_ID)
+                    slot_mapping.append(_PAD_SLOT_ID)
                    continue

                block_number = block_table[i // self.block_size]
                block_offset = i % self.block_size
                slot = block_number * self.block_size + block_offset
-                slot_mapping[-1].append(slot)
-
-        max_prompt_len = max(subquery_lens)
-        input_tokens = _make_tensor_with_pad(input_tokens,
-                                             max_prompt_len,
-                                             pad=0,
-                                             dtype=torch.long,
-                                             device=self.device)
-        input_positions = _make_tensor_with_pad(input_positions,
-                                                max_prompt_len,
-                                                pad=0,
-                                                dtype=torch.long,
-                                                device=self.device)
-        slot_mapping = _make_tensor_with_pad(slot_mapping,
-                                             max_prompt_len,
-                                             pad=_PAD_SLOT_ID,
-                                             dtype=torch.long,
-                                             device=self.device)
-        lora_index_mapping = [
-            _pad_to_max(mapping, max_prompt_len, pad=0)
-            for mapping in lora_index_mapping
-        ]
+                slot_mapping.append(slot)
+
+        max_subquery_len = max(subquery_lens)
+        max_prompt_len = max(prompt_lens)
+        num_prompt_tokens = len(input_tokens)
+        assert max_subquery_len > 0
+
+        input_tokens = torch.tensor(input_tokens,
+                                    dtype=torch.long,
+                                    device=self.device)
+        input_positions = torch.tensor(input_positions,
+                                       dtype=torch.long,
+                                       device=self.device)
+        slot_mapping = torch.tensor(slot_mapping,
+                                    dtype=torch.long,
+                                    device=self.device)
+        lora_index_mapping = lora_index_mapping
+
        context_lens_tensor = torch.tensor(context_lens,
                                           dtype=torch.int,
                                           device=self.device)
+
+        if multi_modal_input_list:
+            assert self.vision_language_config, (
+                "Multi-modal inputs are only supported by "
+                "vision language models.")
+            multi_modal_input = torch.cat(multi_modal_input_list,
+                                          dim=0).to(self.device)
+        else:
+            multi_modal_input = None
+
        # Prepare prefix block tables
        max_prompt_block_table_len = max(len(t) for t in prefix_block_tables)
-        block_tables = _make_tensor_with_pad(
+        block_tables = make_tensor_with_pad(
            prefix_block_tables,
            max_len=max_prompt_block_table_len,
            pad=0,
            dtype=torch.int,
            device=self.device,
        )
-        start_loc_tensor = torch.arange(0,
-                                        len(prompt_lens) * max_prompt_len,
-                                        max_prompt_len,
-                                        dtype=torch.long,
-                                        device=self.device)
+
+        # Query length can be shorter than key (i.e., prompt) when prefill
+        # is chunked or prefix cached.
+        subquery_lens_tensor = torch.tensor(subquery_lens,
+                                            dtype=torch.long,
+                                            device=self.device)
+        subquery_start_loc = torch.zeros(subquery_lens_tensor.shape[0] + 1,
+                                         dtype=torch.int32,
+                                         device=self.device)
+
        prompt_lens_tensor = torch.tensor(prompt_lens,
                                          dtype=torch.long,
                                          device=self.device)
+        seq_start_loc = torch.zeros(prompt_lens_tensor.shape[0] + 1,
+                                    dtype=torch.int32,
+                                    device=self.device)
+
+        torch.cumsum(subquery_lens_tensor,
+                     dim=0,
+                     dtype=subquery_start_loc.dtype,
+                     out=subquery_start_loc[1:])

-        input_metadata = InputMetadata(
+        torch.cumsum(prompt_lens_tensor,
+                     dim=0,
+                     dtype=seq_start_loc.dtype,
+                     out=seq_start_loc[1:])
+
+        attn_metadata = self.attn_backend.make_metadata(
            is_prompt=True,
            slot_mapping=slot_mapping,
-            prompt_lens=prompt_lens_tensor,
-            max_seq_len=max_prompt_len,
-            start_loc=start_loc_tensor,
+            prompt_lens=prompt_lens,
+            prompt_lens_tensor=prompt_lens_tensor,
+            num_prompt_tokens=num_prompt_tokens,
+            num_generation_tokens=0,
+            max_subquery_len=max_subquery_len,
            max_context_len=None,
+            max_prompt_len=max_prompt_len,
+            subquery_start_loc=subquery_start_loc,
+            seq_start_loc=seq_start_loc,
            context_lens=context_lens_tensor,
            block_tables=block_tables,
            use_cuda_graph=False,
            kv_cache_dtype=self.kv_cache_dtype,
        )
-        return (input_tokens, input_positions, input_metadata, prompt_lens,
+        return (input_tokens, input_positions, attn_metadata, prompt_lens,
                subquery_lens, lora_index_mapping, lora_prompt_mapping,
-                lora_requests)
+                lora_requests, multi_modal_input)

    def _prepare_decode(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata, List[int], List[int],
-               Set[LoRARequest]]:
+    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int],
+               List[int], Set[LoRARequest]]:
        assert len(seq_group_metadata_list) > 0
-        input_tokens: List[List[int]] = []
-        input_positions: List[List[int]] = []
-        slot_mapping: List[List[int]] = []
+        input_tokens: List[int] = []
+        input_positions: List[int] = []
+        slot_mapping: List[int] = []
        context_lens: List[int] = []
        block_tables: List[List[int]] = []
        lora_index_mapping: List[int] = []
@@ -278,6 +351,7 @@ class ModelRunner:

        for seq_group_metadata in seq_group_metadata_list:
            assert not seq_group_metadata.is_prompt
+            assert seq_group_metadata.token_chunk_size == 1

            seq_ids = list(seq_group_metadata.seq_data.keys())
            lora_id = seq_group_metadata.lora_int_id
@@ -288,11 +362,11 @@ class ModelRunner:
            for seq_id in seq_ids:
                seq_data = seq_group_metadata.seq_data[seq_id]
                generation_token = seq_data.get_last_token_id()
-                input_tokens.append([generation_token])
+                input_tokens.append(generation_token)

                seq_len = seq_data.get_len()
                position = seq_len - 1
-                input_positions.append([position])
+                input_positions.append(position)

                context_len = seq_len if self.sliding_window is None else min(
                    seq_len, self.sliding_window)
@@ -302,8 +376,8 @@ class ModelRunner:
                block_number = block_table[position // self.block_size]
                block_offset = position % self.block_size
                slot = block_number * self.block_size + block_offset
-                slot_mapping.append([slot])
-                lora_index_mapping.append([lora_id])
+                slot_mapping.append(slot)
+                lora_index_mapping.append(lora_id)
                lora_prompt_mapping.append(lora_id)

                if self.sliding_window is not None:
@@ -312,6 +386,9 @@ class ModelRunner:
                    block_table = block_table[-sliding_window_blocks:]
                block_tables.append(block_table)

+        # vLLM uses cuda graph only for decoding requests.
+        # See `capture_model` API for more details.
+        # For decoding requests, batch_size == input_tokens.
        batch_size = len(input_tokens)
        max_context_len = max(context_lens)
        use_captured_graph = (
@@ -319,38 +396,37 @@ class ModelRunner:
            and batch_size <= _BATCH_SIZES_TO_CAPTURE[-1]
            and max_context_len <= self.max_context_len_to_capture)
        if use_captured_graph:
-            # Pad the input tokens, positions, and slot mapping to match the
-            # batch size of the captured graph.
            graph_batch_size = _get_graph_batch_size(batch_size)
            assert graph_batch_size >= batch_size
            for _ in range(graph_batch_size - batch_size):
-                input_tokens.append([])
-                input_positions.append([])
-                slot_mapping.append([])
+                input_tokens.append(0)
+                input_positions.append(0)
+                slot_mapping.append(_PAD_SLOT_ID)
                context_lens.append(1)
                block_tables.append([])
+                lora_index_mapping.append(0)
            batch_size = graph_batch_size

-        input_tokens = _make_tensor_with_pad(input_tokens,
-                                             max_len=1,
-                                             pad=0,
-                                             dtype=torch.long,
-                                             device=self.device)
-        input_positions = _make_tensor_with_pad(input_positions,
-                                                max_len=1,
-                                                pad=0,
-                                                dtype=torch.long,
-                                                device=self.device)
-        slot_mapping = _make_tensor_with_pad(slot_mapping,
-                                             max_len=1,
-                                             pad=_PAD_SLOT_ID,
-                                             dtype=torch.long,
-                                             device=self.device)
+        input_tokens = torch.tensor(input_tokens,
+                                    dtype=torch.long,
+                                    device=self.device)
+        input_positions = torch.tensor(input_positions,
+                                       dtype=torch.long,
+                                       device=self.device)
+        slot_mapping = torch.tensor(slot_mapping,
+                                    dtype=torch.long,
+                                    device=self.device)
        context_lens = torch.tensor(context_lens,
                                    dtype=torch.int,
                                    device=self.device)

        if use_captured_graph:
+            # When using cuda-graph all these tensors should be
+            # padded.
+            assert context_lens.shape[0] == input_tokens.shape[0]
+            assert context_lens.shape[0] == input_positions.shape[0]
+            assert context_lens.shape[0] == slot_mapping.shape[0]
+
            # The shape of graph_block_tables is
            # [max batch size, max context len // block size].
            input_block_tables = self.graph_block_tables[:batch_size]
@@ -361,7 +437,7 @@ class ModelRunner:
        else:
            max_block_table_len = max(
                len(block_table) for block_table in block_tables)
-            block_tables = _make_tensor_with_pad(
+            block_tables = make_tensor_with_pad(
                block_tables,
                max_len=max_block_table_len,
                pad=0,
@@ -369,23 +445,24 @@ class ModelRunner:
                device=self.device,
            )

-        lora_index_mapping = [
-            _pad_to_max(mapping, 1, pad=0) for mapping in lora_index_mapping
-        ]
-
-        input_metadata = InputMetadata(
+        attn_metadata = self.attn_backend.make_metadata(
            is_prompt=False,
            slot_mapping=slot_mapping,
            prompt_lens=None,
-            max_seq_len=None,
-            start_loc=None,
+            prompt_lens_tensor=None,
+            num_prompt_tokens=0,
+            num_generation_tokens=len(input_tokens),
+            max_subquery_len=None,
            max_context_len=max_context_len,
+            max_prompt_len=None,
+            subquery_start_loc=None,
+            seq_start_loc=None,
            context_lens=context_lens,
            block_tables=block_tables,
            use_cuda_graph=use_captured_graph,
            kv_cache_dtype=self.kv_cache_dtype,
        )
-        return (input_tokens, input_positions, input_metadata,
+        return (input_tokens, input_positions, attn_metadata,
                lora_index_mapping, lora_prompt_mapping, lora_requests)

    def _prepare_sample(
@@ -400,9 +477,8 @@ class ModelRunner:
        selected_token_start_idx = 0
        categorized_sample_indices = {t: [] for t in SamplingType}
        categorized_sample_indices_start_idx = 0
-        pin_memory = not self.in_wsl and not self.device_config.is_neuron
+        categorized_sampled_token_indices_start_idx = 0

-        max_subquery_len = max(subquery_lens) if subquery_lens else 1
        for i, seq_group_metadata in enumerate(seq_group_metadata_list):
            seq_ids = list(seq_group_metadata.seq_data.keys())
            sampling_params = seq_group_metadata.sampling_params
@@ -417,9 +493,12 @@ class ModelRunner:
                    categorized_sample_indices_start_idx += subquery_len - 1

                categorized_sample_indices[
-                    sampling_params.sampling_type].append(
-                        categorized_sample_indices_start_idx)
+                    sampling_params.sampling_type].append([
+                        categorized_sample_indices_start_idx,
+                        categorized_sampled_token_indices_start_idx
+                    ])
                categorized_sample_indices_start_idx += 1
+                categorized_sampled_token_indices_start_idx += 1

                if sampling_params.prompt_logprobs is not None:
                    selected_token_indices.extend(
@@ -427,11 +506,11 @@ class ModelRunner:
                              selected_token_start_idx + subquery_len - 1))
                selected_token_indices.append(selected_token_start_idx +
                                              subquery_len - 1)
-                selected_token_start_idx += max_subquery_len
+                selected_token_start_idx += subquery_len

                if sampling_params.seed is not None:
                    seq_group_metadata.state.generator = torch.Generator(
-                        device="cuda").manual_seed(sampling_params.seed)
+                        device=self.device).manual_seed(sampling_params.seed)
            else:
                num_seqs = len(seq_ids)
                selected_token_indices.extend(
@@ -441,22 +520,32 @@ class ModelRunner:

                categorized_sample_indices[
                    sampling_params.sampling_type].extend(
-                        range(categorized_sample_indices_start_idx,
-                              categorized_sample_indices_start_idx + num_seqs))
+                        zip(
+                            range(
+                                categorized_sample_indices_start_idx,
+                                categorized_sample_indices_start_idx +
+                                num_seqs),
+                            range(
+                                categorized_sampled_token_indices_start_idx,
+                                categorized_sampled_token_indices_start_idx +
+                                num_seqs)))
                categorized_sample_indices_start_idx += num_seqs
+                categorized_sampled_token_indices_start_idx += num_seqs

            if sampling_params.seed is not None:
                generators.append(seq_group_metadata.state.generator)

-        selected_token_indices = _async_h2d(selected_token_indices,
-                                            dtype=torch.long,
-                                            target_device=self.device,
-                                            pin_memory=pin_memory)
+        selected_token_indices = async_tensor_h2d(selected_token_indices,
+                                                  dtype=torch.long,
+                                                  target_device=self.device,
+                                                  pin_memory=self.pin_memory)
+
        categorized_sample_indices = {
-            t: _async_h2d(seq_ids,
-                          dtype=torch.int,
-                          target_device=self.device,
-                          pin_memory=pin_memory)
+            t: maybe_expand_dim(
+                async_tensor_h2d(seq_ids,
+                                 dtype=torch.int,
+                                 target_device=self.device,
+                                 pin_memory=self.pin_memory), 2, 2)
            for t, seq_ids in categorized_sample_indices.items()
        }

@@ -477,33 +566,32 @@ class ModelRunner:
    def prepare_input_tensors(
        self,
        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
-    ) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata, SamplingMetadata,
-               Set[int], LoRAMapping]:
+    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata,
+               Set[int], LoRAMapping, torch.Tensor]:
        if self.is_driver_worker:
            # NOTE: We assume that all sequences in the group are all prompts or
            # all decodes.
            is_prompt = seq_group_metadata_list[0].is_prompt
            # Prepare input tensors.
            if is_prompt:
-                (input_tokens, input_positions, input_metadata, prompt_lens,
+                (input_tokens, input_positions, attn_metadata, prompt_lens,
                 subquery_lens, lora_index_mapping, lora_prompt_mapping,
-                 lora_requests) = self._prepare_prompt(seq_group_metadata_list)
+                 lora_requests, multi_modal_input
+                 ) = self._prepare_prompt(seq_group_metadata_list)
            else:
-                (input_tokens, input_positions, input_metadata,
+                (input_tokens, input_positions, attn_metadata,
                 lora_index_mapping, lora_prompt_mapping,
                 lora_requests) = self._prepare_decode(seq_group_metadata_list)
                prompt_lens = []
                subquery_lens = None
+                multi_modal_input = None
            sampling_metadata = self._prepare_sample(seq_group_metadata_list,
                                                     prompt_lens,
                                                     subquery_lens)

            if self.lora_config:
-                flat_lora_index_mapping = [
-                    item for sublist in lora_index_mapping for item in sublist
-                ]
                lora_mapping = LoRAMapping(
-                    flat_lora_index_mapping,
+                    lora_index_mapping,
                    lora_prompt_mapping,
                )
            else:
@@ -513,82 +601,77 @@ class ModelRunner:
            metadata_dict = {
                "input_tokens": input_tokens,
                "input_positions": input_positions,
-                "is_prompt": input_metadata.is_prompt,
-                "slot_mapping": input_metadata.slot_mapping,
-                "prompt_lens": input_metadata.prompt_lens,
-                "max_seq_len": input_metadata.max_seq_len,
-                "start_loc": input_metadata.start_loc,
-                "max_context_len": input_metadata.max_context_len,
-                "context_lens": input_metadata.context_lens,
-                "block_tables": input_metadata.block_tables,
-                "use_cuda_graph": input_metadata.use_cuda_graph,
-                "kv_cache_dtype": input_metadata.kv_cache_dtype,
                "selected_token_indices":
                sampling_metadata.selected_token_indices,
                "lora_requests": lora_requests,
                "lora_mapping": lora_mapping,
+                "multi_modal_input": multi_modal_input,
            }
+            metadata_dict.update(attn_metadata.asdict_zerocopy())
            broadcast_tensor_dict(metadata_dict, src=0)
        else:
            metadata_dict = broadcast_tensor_dict(src=0)
-            input_tokens = metadata_dict["input_tokens"]
-            input_positions = metadata_dict["input_positions"]
-            lora_mapping = metadata_dict["lora_mapping"]
-            lora_requests = metadata_dict["lora_requests"]
-            input_metadata = InputMetadata(
-                is_prompt=metadata_dict["is_prompt"],
-                slot_mapping=metadata_dict["slot_mapping"],
-                prompt_lens=metadata_dict["prompt_lens"],
-                max_seq_len=metadata_dict["max_seq_len"],
-                start_loc=metadata_dict["start_loc"],
-                max_context_len=metadata_dict["max_context_len"],
-                context_lens=metadata_dict["context_lens"],
-                block_tables=metadata_dict["block_tables"],
-                use_cuda_graph=metadata_dict["use_cuda_graph"],
-                kv_cache_dtype=metadata_dict["kv_cache_dtype"],
-            )
+            input_tokens = metadata_dict.pop("input_tokens")
+            input_positions = metadata_dict.pop("input_positions")
+            selected_token_indices = metadata_dict.pop(
+                "selected_token_indices")
+            lora_mapping = metadata_dict.pop("lora_mapping")
+            lora_requests = metadata_dict.pop("lora_requests")
+            multi_modal_input = metadata_dict.pop("multi_modal_input")
+            attn_metadata = self.attn_backend.make_metadata(**metadata_dict)
            sampling_metadata = SamplingMetadata(
                seq_groups=None,
                seq_data=None,
                prompt_lens=None,
-                selected_token_indices=metadata_dict["selected_token_indices"],
+                selected_token_indices=selected_token_indices,
                categorized_sample_indices=None,
                generators=None,
                perform_sampling=False,
            )

-        return (input_tokens, input_positions, input_metadata,
-                sampling_metadata, lora_requests, lora_mapping)
+        return (input_tokens, input_positions, attn_metadata,
+                sampling_metadata, lora_requests, lora_mapping,
+                multi_modal_input)

    @torch.inference_mode()
    def execute_model(
        self,
        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
-        kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
+        kv_caches: List[torch.Tensor],
    ) -> Optional[SamplerOutput]:
-        (input_tokens, input_positions, input_metadata, sampling_metadata,
-         lora_requests,
-         lora_mapping) = self.prepare_input_tensors(seq_group_metadata_list)
+        (input_tokens, input_positions, attn_metadata, sampling_metadata,
+         lora_requests, lora_mapping, multi_modal_input
+         ) = self.prepare_input_tensors(seq_group_metadata_list)

        if self.lora_config:
            self.set_active_loras(lora_requests, lora_mapping)

        # Execute the model.
-        if input_metadata.use_cuda_graph:
+        if attn_metadata.use_cuda_graph:
            graph_batch_size = input_tokens.shape[0]
            model_executable = self.graph_runners[graph_batch_size]
        else:
            model_executable = self.model
-        hidden_states = model_executable(
-            input_ids=input_tokens,
-            positions=input_positions,
-            kv_caches=kv_caches,
-            input_metadata=input_metadata,
-        )
+        execute_model_kwargs = {
+            "input_ids": input_tokens,
+            "positions": input_positions,
+            "kv_caches": kv_caches,
+            "attn_metadata": attn_metadata,
+        }
+        if self.vision_language_config:
+            execute_model_kwargs.update({"image_input": multi_modal_input})
+        hidden_states = model_executable(**execute_model_kwargs)
+
+        # Compute the logits.
+        logits = self.model.compute_logits(hidden_states, sampling_metadata)
+
+        # Only perform sampling in the driver worker.
+        if not sampling_metadata.perform_sampling:
+            return None

        # Sample the next token.
        output = self.model.sample(
-            hidden_states=hidden_states,
+            logits=logits,
            sampling_metadata=sampling_metadata,
        )
        return output
@@ -596,8 +679,7 @@ class ModelRunner:
    @torch.inference_mode()
    def profile_run(self) -> None:
        # Enable top-k sampling to reflect the accurate memory usage.
-        vocab_size = self.model_config.get_vocab_size()
-        sampling_params = SamplingParams(top_p=0.99, top_k=vocab_size - 1)
+        sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1)
        max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
        max_num_seqs = self.scheduler_config.max_num_seqs

@@ -626,10 +708,22 @@ class ModelRunner:
        # Profile memory usage with max_num_sequences sequences and the total
        # number of tokens equal to max_num_batched_tokens.
        seqs: List[SequenceGroupMetadata] = []
+        # Additional GPU memory may be needed for vision encoding, which needs
+        # to be accounted for when calculating the GPU blocks for
+        # vLLM blocker manager.
+        # To exercise the worst scenario for GPU memory consumption,
+        # the number of seqs (batch_size) is chosen to maximize the number
+        # of images processed.
+        if self.vision_language_config:
+            max_num_seqs = min(
+                max_num_seqs,
+                int(max_num_batched_tokens /
+                    self.vision_language_config.image_feature_size))
        for group_id in range(max_num_seqs):
            seq_len = (max_num_batched_tokens // max_num_seqs +
                       (group_id < max_num_batched_tokens % max_num_seqs))
-            seq_data = SequenceData([0] * seq_len)
+            seq_data, fake_multi_modal_input = _prepare_fake_inputs(
+                seq_len, self.vision_language_config)
            seq = SequenceGroupMetadata(
                request_id=str(group_id),
                is_prompt=True,
@@ -638,12 +732,13 @@ class ModelRunner:
                block_tables=None,
                lora_request=dummy_lora_requests_per_seq[group_id]
                if dummy_lora_requests_per_seq else None,
+                multi_modal_data=fake_multi_modal_input,
            )
            seqs.append(seq)

        # Run the model with the dummy inputs.
        num_layers = self.model_config.get_num_layers(self.parallel_config)
-        kv_caches = [(None, None)] * num_layers
+        kv_caches = [None] * num_layers
        self.execute_model(seqs, kv_caches)
        torch.cuda.synchronize()
        return
@@ -675,10 +770,22 @@ class ModelRunner:
        return self.lora_manager.list_loras()

    @torch.inference_mode()
-    def capture_model(self, kv_caches: List[KVCache]) -> None:
+    def capture_model(self, kv_caches: List[torch.Tensor]) -> None:
+        """Cuda graph capture a model.
+
+        Note that CUDA graph's performance gain is negligible if number
+        of batched tokens are larger than 200. And since CUDA graph
+        requires fixed sized tensors, supporting large/variable batch
+        size requires high GPU memory overhead. Thus, vLLM only captures
+        decoding requests. Mixed batch (chunked prefill + decoding) or
+        prefill requests are not captured.
+
+        Since it is used for decoding-only, it assumes there's only 1 token
+        per sequence in the batch.
+        """
        # NOTE(woosuk): This is a hack to ensure that the NCCL backend is never
        # deleted before the CUDA graphs.
-        self.cupy_nccl_backend = cupy_utils.get_nccl_backend()
+        self.pynccl_backend = pynccl_utils.get_nccl_backend()

        assert not self.model_config.enforce_eager
        logger.info("Capturing the model for CUDA graphs. This may lead to "
@@ -694,10 +801,9 @@ class ModelRunner:

        # Prepare dummy inputs. These will be reused for all batch sizes.
        max_batch_size = max(_BATCH_SIZES_TO_CAPTURE)
-        input_tokens = torch.zeros(max_batch_size, 1, dtype=torch.long).cuda()
-        input_positions = torch.zeros(max_batch_size, 1,
-                                      dtype=torch.long).cuda()
-        slot_mapping = torch.empty(max_batch_size, 1, dtype=torch.long).cuda()
+        input_tokens = torch.zeros(max_batch_size, dtype=torch.long).cuda()
+        input_positions = torch.zeros(max_batch_size, dtype=torch.long).cuda()
+        slot_mapping = torch.empty(max_batch_size, dtype=torch.long).cuda()
        slot_mapping.fill_(_PAD_SLOT_ID)
        context_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda()
        block_tables = torch.from_numpy(self.graph_block_tables).cuda()
@@ -709,23 +815,28 @@ class ModelRunner:
        ]

        # NOTE(woosuk): There are 3 backends for all-reduce: custom all-reduce
-        # kernel, CuPy NCCL, and PyTorch NCCL. When using CUDA graph, we use
-        # either custom all-reduce kernel or CuPy NCCL. When not using CUDA
+        # kernel, pynccl, and PyTorch NCCL. When using CUDA graph, we use
+        # either custom all-reduce kernel or pynccl. When not using CUDA
        # graph, we use either custom all-reduce kernel or PyTorch NCCL.
        # We always prioritize using custom all-reduce kernel but fall back
-        # to PyTorch or CuPy NCCL if it is disabled or not supported.
+        # to PyTorch or pynccl if it is disabled or not supported.
        with custom_all_reduce.capture():
            # NOTE: Capturing the largest batch size first may help reduce the
            # memory usage of CUDA graph.
            for batch_size in reversed(batch_size_capture_list):
-                # Create dummy input_metadata.
-                input_metadata = InputMetadata(
+                # Create dummy attn_metadata.
+                attn_metadata = self.attn_backend.make_metadata(
                    is_prompt=False,
                    slot_mapping=slot_mapping[:batch_size],
                    prompt_lens=None,
-                    max_seq_len=None,
-                    start_loc=None,
+                    prompt_lens_tensor=None,
+                    num_prompt_tokens=0,
+                    num_generation_tokens=batch_size,
+                    max_subquery_len=None,
                    max_context_len=self.max_context_len_to_capture,
+                    max_prompt_len=None,
+                    subquery_start_loc=None,
+                    seq_start_loc=None,
                    context_lens=context_lens[:batch_size],
                    block_tables=block_tables[:batch_size],
                    use_cuda_graph=True,
@@ -744,7 +855,7 @@ class ModelRunner:
                    input_tokens[:batch_size],
                    input_positions[:batch_size],
                    kv_caches,
-                    input_metadata,
+                    attn_metadata,
                    memory_pool=self.graph_memory_pool,
                )
                self.graph_memory_pool = graph_runner.graph.pool()
@@ -756,12 +867,18 @@ class ModelRunner:
        logger.info(f"Graph capturing finished in {elapsed_time:.0f} secs.")

    def __del__(self) -> None:
-        # Delete the CUDA graphs before deleting the CuPy NCCL communicator.
+        # Delete the CUDA graphs before deleting the pynccl communicator.
        # NOTE(woosuk): This is necessary because otherwise deadlocks can
        # happen.
        # FIXME(woosuk): This is a bit hacky. Find a more robust solution.
+        # TODO(youkaichao): when we get enough user feedback that pynccl is
+        # more stable than cupy, we can remove this, e.g. in v0.4.1.
        self.graph_runners.clear()
-        self.cupy_nccl_backend = None
+        self.pynccl_backend = None
+
+    @property
+    def vocab_size(self) -> int:
+        return self.model_config.get_vocab_size()


 class CUDAGraphRunner:
@@ -776,20 +893,22 @@ class CUDAGraphRunner:
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
-        kv_caches: List[KVCache],
-        input_metadata: InputMetadata,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
        memory_pool,
+        **kwargs,
    ) -> None:
        assert self.graph is None
        # Run the model once without capturing the graph.
        # This is to make sure that the captured graph does not include the
        # kernel launches for initial benchmarking (e.g., Triton autotune).
-        with _maybe_cupy_nccl():
+        with _maybe_pynccl():
            self.model(
                input_ids,
                positions,
                kv_caches,
-                input_metadata,
+                attn_metadata,
+                **kwargs,
            )
        torch.cuda.synchronize()

@@ -798,12 +917,13 @@ class CUDAGraphRunner:
        # https://stackoverflow.com/questions/31039022/python-multi-line-with-statement
        self.graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(self.graph, pool=memory_pool):  # noqa: SIM117
-            with _maybe_cupy_nccl():
+            with _maybe_pynccl():
                hidden_states = self.model(
                    input_ids,
                    positions,
                    kv_caches,
-                    input_metadata,
+                    attn_metadata,
+                    **kwargs,
                )
        torch.cuda.synchronize()

@@ -812,9 +932,9 @@ class CUDAGraphRunner:
            "input_ids": input_ids,
            "positions": positions,
            "kv_caches": kv_caches,
-            "slot_mapping": input_metadata.slot_mapping,
-            "context_lens": input_metadata.context_lens,
-            "block_tables": input_metadata.block_tables,
+            "slot_mapping": attn_metadata.slot_mapping,
+            "context_lens": attn_metadata.context_lens,
+            "block_tables": attn_metadata.block_tables,
        }
        self.output_buffers = {"hidden_states": hidden_states}
        return
@@ -823,8 +943,9 @@ class CUDAGraphRunner:
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
-        kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
-        input_metadata: InputMetadata,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        **kwargs,
    ) -> torch.Tensor:
        # KV caches are fixed tensors, so we don't need to copy them.
        del kv_caches
@@ -832,13 +953,12 @@ class CUDAGraphRunner:
        # Copy the input tensors to the input buffers.
        self.input_buffers["input_ids"].copy_(input_ids, non_blocking=True)
        self.input_buffers["positions"].copy_(positions, non_blocking=True)
-        self.input_buffers["slot_mapping"].copy_(input_metadata.slot_mapping,
+        self.input_buffers["slot_mapping"].copy_(attn_metadata.slot_mapping,
                                                 non_blocking=True)
-        self.input_buffers["context_lens"].copy_(input_metadata.context_lens,
+        self.input_buffers["context_lens"].copy_(attn_metadata.context_lens,
                                                 non_blocking=True)
-        self.input_buffers["block_tables"].copy_(input_metadata.block_tables,
+        self.input_buffers["block_tables"].copy_(attn_metadata.block_tables,
                                                 non_blocking=True)
-
        # Run the graph.
        self.graph.replay()

@@ -850,44 +970,43 @@ class CUDAGraphRunner:


 @contextlib.contextmanager
-def _maybe_cupy_nccl():
-    if cupy_utils.is_initialized() and not custom_all_reduce.is_initialized():
-        with with_cupy_nccl_for_all_reduce():
+def _maybe_pynccl():
+    if pynccl_utils.is_initialized(
+    ) and not custom_all_reduce.is_initialized():
+        with with_pynccl_for_all_reduce():
            yield
    else:
        yield


-def _pad_to_max(x: List[int], max_len: int, pad: int) -> List[int]:
-    assert len(x) <= max_len
-    return x + [pad] * (max_len - len(x))
-
-
-def _make_tensor_with_pad(
-    x: List[List[int]],
-    max_len: int,
-    pad: int,
-    dtype: torch.dtype,
-    device: Optional[Union[str, torch.device]],
-) -> torch.Tensor:
-    padded_x = [_pad_to_max(x_i, max_len, pad) for x_i in x]
-    return torch.tensor(padded_x, dtype=dtype, device=device)
-
-
 def _get_graph_batch_size(batch_size: int) -> int:
+    """Returns the padded batch size given actual batch size.
+
+    Batch sizes are 1, 2, 4, _BATCH_SIZE_ALIGNMENT,
+    2*_BATCH_SIZE_ALIGNMENT, 3*_BATCH_SIZE_ALIGNMENT...
+    """
    if batch_size <= 2:
        return batch_size
    elif batch_size <= 4:
        return 4
    else:
-        return (batch_size + 7) // 8 * 8
-
-
-def _async_h2d(
-    data: list,
-    dtype: torch.dtype,
-    target_device: Union[str, torch.device],
-    pin_memory: bool,
-) -> torch.Tensor:
-    t = torch.tensor(data, dtype=dtype, pin_memory=pin_memory, device="cpu")
-    return t.to(device=target_device, non_blocking=True)
+        return ((batch_size + _BATCH_SIZE_ALIGNMENT - 1) //
+                _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT)
+
+
+def _prepare_fake_inputs(
+        seq_len: int, vision_language_config: Optional[VisionLanguageConfig]):
+    """Prepare fake inputs for profile run."""
+    if vision_language_config:
+        prompt_tokens = [
+            vision_language_config.image_token_id
+        ] * vision_language_config.image_feature_size + [0] * (
+            seq_len - vision_language_config.image_feature_size)
+        fake_image_input = MultiModalData(
+            type=MultiModalData.Type.IMAGE,
+            data=torch.zeros(vision_language_config.image_input_shape,
+                             dtype=torch.float16))
+    else:
+        prompt_tokens = [0] * seq_len
+        fake_image_input = None
+    return SequenceData(prompt_tokens), fake_image_input
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
+from typing import Dict, List, Optional, Tuple
+
+import torch
+
+from vllm.config import (DeviceConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig)
+from vllm.logger import init_logger
+from vllm.model_executor import SamplingMetadata
+from vllm.model_executor.neuron_model_loader import get_neuron_model
+from vllm.sampling_params import SamplingParams, SamplingType
+from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
+from vllm.utils import (async_tensor_h2d, is_pin_memory_available,
+                        make_tensor_with_pad, maybe_expand_dim)
+
+logger = init_logger(__name__)
+
+
+class NeuronModelRunner:
+
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        parallel_config: ParallelConfig,
+        scheduler_config: SchedulerConfig,
+        device_config: DeviceConfig,
+    ):
+        self.model_config = model_config
+        self.parallel_config = parallel_config
+        self.scheduler_config = scheduler_config
+
+        if model_config is not None and model_config.get_sliding_window():
+            logger.warning("Sliding window is not supported on Neuron. "
+                           "The model will run without sliding window.")
+        self.device_config = (device_config
+                              if device_config is not None else DeviceConfig())
+        self.device = self.device_config.device
+        self.model = None
+        self.pin_memory = is_pin_memory_available()
+
+    def load_model(self) -> None:
+        self.model = get_neuron_model(self.model_config,
+                                      parallel_config=self.parallel_config,
+                                      scheduler_config=self.scheduler_config)
+
+    def _prepare_prompt(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[int]]:
+        assert len(seq_group_metadata_list) > 0
+        input_tokens: List[List[int]] = []
+        input_positions: List[List[int]] = []
+        input_block_ids: List[int] = []
+
+        prompt_lens: List[int] = []
+        for seq_group_metadata in seq_group_metadata_list:
+            assert seq_group_metadata.is_prompt
+            seq_ids = list(seq_group_metadata.seq_data.keys())
+            assert len(seq_ids) == 1
+            seq_id = seq_ids[0]
+
+            seq_data = seq_group_metadata.seq_data[seq_id]
+            prompt_tokens = seq_data.get_token_ids()
+            prompt_len = len(prompt_tokens)
+            prompt_lens.append(prompt_len)
+
+            input_tokens.append(prompt_tokens)
+            input_positions.append(list(range(prompt_len)))
+
+            assert seq_group_metadata.block_tables is not None
+            block_table = seq_group_metadata.block_tables[seq_id]
+            assert len(block_table) == 1
+            input_block_ids.append(block_table[0])
+
+        max_prompt_len = max(prompt_lens)
+        assert max_prompt_len > 0
+        input_tokens = make_tensor_with_pad(input_tokens,
+                                            max_prompt_len,
+                                            pad=0,
+                                            dtype=torch.long,
+                                            device=self.device)
+        input_positions = make_tensor_with_pad(input_positions,
+                                               max_prompt_len,
+                                               pad=0,
+                                               dtype=torch.long,
+                                               device=self.device)
+        input_block_ids = torch.tensor(input_block_ids,
+                                       dtype=torch.long,
+                                       device=self.device)
+
+        return input_tokens, input_positions, input_block_ids, prompt_lens
+
+    def _prepare_decode(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        assert len(seq_group_metadata_list) > 0
+        input_tokens: List[List[int]] = []
+        input_positions: List[List[int]] = []
+        input_block_ids: List[int] = []
+        context_lens: List[int] = []
+
+        for seq_group_metadata in seq_group_metadata_list:
+            assert not seq_group_metadata.is_prompt
+
+            seq_ids = list(seq_group_metadata.seq_data.keys())
+
+            for seq_id in seq_ids:
+                seq_data = seq_group_metadata.seq_data[seq_id]
+                generation_token = seq_data.get_last_token_id()
+                input_tokens.append([generation_token])
+
+                seq_len = seq_data.get_len()
+                position = seq_len - 1
+                input_positions.append([position])
+                context_lens.append(seq_len)
+
+                assert seq_group_metadata.block_tables is not None
+                block_table = seq_group_metadata.block_tables[seq_id]
+                assert len(block_table) == 1
+                input_block_ids.append(block_table[0])
+
+        input_tokens = make_tensor_with_pad(input_tokens,
+                                            max_len=1,
+                                            pad=0,
+                                            dtype=torch.long,
+                                            device=self.device)
+        input_positions = make_tensor_with_pad(input_positions,
+                                               max_len=1,
+                                               pad=0,
+                                               dtype=torch.long,
+                                               device=self.device)
+        context_lens = torch.tensor(context_lens,
+                                    dtype=torch.int,
+                                    device=self.device)
+        input_block_ids = torch.tensor(input_block_ids,
+                                       dtype=torch.long,
+                                       device=self.device)
+
+        return input_tokens, input_positions, input_block_ids
+
+    def _prepare_sample(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        prompt_lens: List[int],
+    ) -> SamplingMetadata:
+        seq_groups: List[Tuple[List[int], SamplingParams]] = []
+        selected_token_indices: List[int] = []
+        generators: List[torch.Generator] = []
+        selected_token_start_idx = 0
+        categorized_sample_indices = {t: [] for t in SamplingType}
+        categorized_sample_indices_start_idx = 0
+        categorized_sampled_token_indices_start_idx = 0
+
+        for i, seq_group_metadata in enumerate(seq_group_metadata_list):
+            seq_ids = list(seq_group_metadata.seq_data.keys())
+            sampling_params = seq_group_metadata.sampling_params
+            seq_groups.append((seq_ids, sampling_params))
+
+            if seq_group_metadata.is_prompt:
+                assert len(seq_ids) == 1
+                assert prompt_lens is not None
+                prompt_len = prompt_lens[i]
+                if sampling_params.prompt_logprobs is not None:
+                    # NOTE: prompt token positions do not need sample, skip
+                    categorized_sample_indices_start_idx += prompt_len - 1
+
+                categorized_sample_indices[
+                    sampling_params.sampling_type].append([
+                        categorized_sample_indices_start_idx,
+                        categorized_sampled_token_indices_start_idx
+                    ])
+                categorized_sample_indices_start_idx += 1
+                categorized_sampled_token_indices_start_idx += 1
+
+                if sampling_params.prompt_logprobs is not None:
+                    selected_token_indices.extend(
+                        range(selected_token_start_idx,
+                              selected_token_start_idx + prompt_len - 1))
+                selected_token_indices.append(selected_token_start_idx +
+                                              prompt_len - 1)
+                selected_token_start_idx += prompt_len
+
+                if sampling_params.seed is not None:
+                    seq_group_metadata.state.generator = torch.Generator(
+                        device=self.device).manual_seed(sampling_params.seed)
+            else:
+                num_seqs = len(seq_ids)
+                selected_token_indices.extend(
+                    range(selected_token_start_idx,
+                          selected_token_start_idx + num_seqs))
+                selected_token_start_idx += num_seqs
+
+                categorized_sample_indices[
+                    sampling_params.sampling_type].extend(
+                        zip(
+                            range(
+                                categorized_sample_indices_start_idx,
+                                categorized_sample_indices_start_idx +
+                                num_seqs),
+                            range(
+                                categorized_sampled_token_indices_start_idx,
+                                categorized_sampled_token_indices_start_idx +
+                                num_seqs)))
+                categorized_sample_indices_start_idx += num_seqs
+                categorized_sampled_token_indices_start_idx += num_seqs
+
+            if sampling_params.seed is not None:
+                generators.append(seq_group_metadata.state.generator)
+
+        selected_token_indices = async_tensor_h2d(selected_token_indices,
+                                                  dtype=torch.long,
+                                                  target_device=self.device,
+                                                  pin_memory=self.pin_memory)
+
+        categorized_sample_indices = {
+            t: maybe_expand_dim(
+                async_tensor_h2d(seq_ids,
+                                 dtype=torch.int,
+                                 target_device=self.device,
+                                 pin_memory=self.pin_memory), 2, 2)
+            for t, seq_ids in categorized_sample_indices.items()
+        }
+
+        seq_data: Dict[int, SequenceData] = {}
+        for seq_group_metadata in seq_group_metadata_list:
+            seq_data.update(seq_group_metadata.seq_data)
+
+        sampling_metadata = SamplingMetadata(
+            seq_groups=seq_groups,
+            seq_data=seq_data,
+            prompt_lens=prompt_lens,
+            selected_token_indices=selected_token_indices,
+            categorized_sample_indices=categorized_sample_indices,
+            generators=generators,
+        )
+        return sampling_metadata
+
+    def prepare_input_tensors(
+        self,
+        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, SamplingMetadata]:
+        # NOTE: We assume that all sequences in the group are all prompts or
+        # all decodes.
+        is_prompt = seq_group_metadata_list[0].is_prompt
+        # Prepare input tensors.
+        if is_prompt:
+            (input_tokens, input_positions, input_block_ids,
+             prompt_lens) = self._prepare_prompt(seq_group_metadata_list)
+        else:
+            (input_tokens, input_positions,
+             input_block_ids) = self._prepare_decode(seq_group_metadata_list)
+            prompt_lens = []
+        sampling_metadata = self._prepare_sample(seq_group_metadata_list,
+                                                 prompt_lens)
+
+        return (input_tokens, input_positions, input_block_ids,
+                sampling_metadata)
+
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+    ) -> Optional[SamplerOutput]:
+        (input_tokens, input_positions, input_block_ids, sampling_metadata
+         ) = self.prepare_input_tensors(seq_group_metadata_list)
+
+        hidden_states = self.model(
+            input_ids=input_tokens,
+            positions=input_positions,
+            input_block_ids=input_block_ids,
+        )
+
+        # Compute the logits.
+        logits = self.model.compute_logits(hidden_states, sampling_metadata)
+
+        # Sample the next token.
+        output = self.model.sample(
+            logits=logits,
+            sampling_metadata=sampling_metadata,
+        )
+        return output
+
+    @property
+    def vocab_size(self) -> int:
+        return self.model_config.get_vocab_size()
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
 """A Neuron worker class."""
-from typing import Dict, List, Optional, Tuple
+from typing import List, Optional

 import torch
 import torch.distributed

-from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, LoRAConfig)
+from vllm.config import (DeviceConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig)
 from vllm.model_executor import set_random_seed
-from vllm.model_executor.parallel_utils.communication_op import (
-    broadcast_tensor_dict)
-from vllm.model_executor.parallel_utils.parallel_state import (
-    ensure_model_parallel_initialized)
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
-from vllm.worker.cache_engine import CacheEngine
-from vllm.worker.model_runner import ModelRunner
+from vllm.worker.neuron_model_runner import NeuronModelRunner


-class Worker:
+class NeuronWorker:
    """A worker class that executes the model on a group of neuron cores.
    """

@@ -26,166 +21,32 @@ class Worker:
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
-        local_rank: int,
-        rank: int,
-        distributed_init_method: str,
-        lora_config: Optional[LoRAConfig] = None,
-        kv_cache_dtype: Optional[str] = "auto",
-        is_driver_worker: bool = False,
    ) -> None:
        self.model_config = model_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.device_config = device_config
-        self.local_rank = local_rank
-        self.rank = rank
-        self.distributed_init_method = distributed_init_method
-        self.lora_config = lora_config
-        self.is_driver_worker = is_driver_worker
-        if self.is_driver_worker:
-            assert self.rank == 0, "The driver worker must have rank 0."

-        self.model_runner = ModelRunner(model_config,
-                                        parallel_config,
-                                        scheduler_config,
-                                        device_config,
-                                        lora_config=self.lora_config,
-                                        is_driver_worker=is_driver_worker)
-        # Uninitialized cache engine. Will be initialized by
-        # self.init_cache_engine().
-        self.cache_config = None
-        self.cache_engine = None
-        self.cache_events = None
-        self.gpu_cache = None
+        self.model_runner = NeuronModelRunner(model_config, parallel_config,
+                                              scheduler_config, device_config)

-    def init_model(self) -> None:
-        # Initialize the distributed environment.
-        _init_distributed_environment(self.parallel_config,
-                                      self.rank,
-                                      self.distributed_init_method,
-                                      distributed_backend="gloo")
-
-        # Initialize the model.
+    def init_device(self) -> None:
+        # Set random seed.
        set_random_seed(self.model_config.seed)

    def load_model(self):
        self.model_runner.load_model()

-    @torch.inference_mode()
-    def profile_num_available_blocks(
-        self,
-        block_size: int = 128,
-        gpu_memory_utilization: float = 0.9,
-        cpu_swap_space: int = 0,
-        cache_dtype: str = "float16",
-    ) -> Tuple[int, int]:
-        """Simply returns max_num_seqs as num_gpu_blocks, 0 as num_cpu_blocks."""
-        num_gpu_blocks = self.scheduler_config.max_num_seqs
-        num_cpu_blocks = 0
-        return num_gpu_blocks, num_cpu_blocks
-
-    def init_cache_engine(self, cache_config: CacheConfig) -> None:
-        self.cache_config = cache_config
-        self.cache_engine = CacheEngine(self.cache_config, self.model_config,
-                                        self.parallel_config)
-        self.model_runner.set_block_size(self.cache_engine.block_size)
-
-    def warm_up_model(self) -> None:
-        # Warm up is maintained in transformers-neuronx
-        pass
-
-    def cache_swap(
-        self,
-        blocks_to_swap_in: Dict[int, int],
-        blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, List[int]],
-    ) -> None:
-        # Issue cache operations.
-        issued_cache_op = False
-        if blocks_to_swap_in:
-            self.cache_engine.swap_in(blocks_to_swap_in)
-            issued_cache_op = True
-        if blocks_to_swap_out:
-            self.cache_engine.swap_out(blocks_to_swap_out)
-            issued_cache_op = True
-        if blocks_to_copy:
-            self.cache_engine.copy(blocks_to_copy)
-            issued_cache_op = True
-
-        cache_events = self.cache_events if issued_cache_op else None
-
-        # Wait for cache operations to finish.
-        if cache_events is not None:
-            raise NotImplementedError(
-                "cache operations are not implemented for neuron backend.")
-
    @torch.inference_mode()
    def execute_model(
        self,
-        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]] = None,
-        blocks_to_swap_in: Optional[Dict[int, int]] = None,
-        blocks_to_swap_out: Optional[Dict[int, int]] = None,
-        blocks_to_copy: Optional[Dict[int, List[int]]] = None,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
    ) -> Optional[SamplerOutput]:
-        if self.is_driver_worker:
-            assert seq_group_metadata_list is not None
-            num_seq_groups = len(seq_group_metadata_list)
-            assert blocks_to_swap_in is not None
-            assert blocks_to_swap_out is not None
-            assert blocks_to_copy is not None
-            data = {
-                "num_seq_groups": num_seq_groups,
-                "blocks_to_swap_in": blocks_to_swap_in,
-                "blocks_to_swap_out": blocks_to_swap_out,
-                "blocks_to_copy": blocks_to_copy,
-            }
-            broadcast_tensor_dict(data, src=0)
-        else:
-            data = broadcast_tensor_dict(src=0)
-            num_seq_groups = data["num_seq_groups"]
-            blocks_to_swap_in = data["blocks_to_swap_in"]
-            blocks_to_swap_out = data["blocks_to_swap_out"]
-            blocks_to_copy = data["blocks_to_copy"]
-
-        self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy)
+        num_seq_groups = len(seq_group_metadata_list)

        # If there is no input, we don't need to execute the model.
        if num_seq_groups == 0:
            return {}

-        output = self.model_runner.execute_model(seq_group_metadata_list,
-                                                 self.gpu_cache)
+        output = self.model_runner.execute_model(seq_group_metadata_list)
        return output
-
-
-def _init_distributed_environment(
-    parallel_config: ParallelConfig,
-    rank: int,
-    distributed_init_method: Optional[str] = None,
-    distributed_backend: Optional[str] = None,
-) -> None:
-    """Initialize the distributed environment."""
-    if torch.distributed.is_initialized():
-        torch_world_size = torch.distributed.get_world_size()
-        if torch_world_size != parallel_config.world_size:
-            raise RuntimeError(
-                "torch.distributed is already initialized but the torch world "
-                "size does not match parallel_config.world_size "
-                f"({torch_world_size} vs. {parallel_config.world_size}).")
-    elif not distributed_init_method:
-        raise ValueError(
-            "distributed_init_method must be set if torch.distributed "
-            "is not already initialized")
-    else:
-        distributed_backend = distributed_backend if distributed_backend else "nccl"
-        torch.distributed.init_process_group(
-            backend=distributed_backend,
-            world_size=parallel_config.world_size,
-            rank=rank,
-            init_method=distributed_init_method,
-        )
-
-    # A small all_reduce for warmup.
-    torch.distributed.all_reduce(torch.zeros(1))
-    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
-                                      parallel_config.pipeline_parallel_size)
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
 """A GPU worker class."""
 import gc
 import os
-from typing import Dict, List, Tuple, Set, Optional
+from typing import Dict, List, Optional, Set, Tuple

 import torch
 import torch.distributed

-from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, LoRAConfig)
+from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
+                         ParallelConfig, SchedulerConfig, VisionLanguageConfig)
+from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
-from vllm.model_executor.parallel_utils import cupy_utils
+from vllm.model_executor.parallel_utils import pynccl_utils
 from vllm.model_executor.parallel_utils.communication_op import (
    broadcast_tensor_dict)
 from vllm.model_executor.parallel_utils.custom_all_reduce import init_custom_ar
@@ -18,8 +19,6 @@ from vllm.model_executor.parallel_utils.parallel_state import (
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.worker.cache_engine import CacheEngine
 from vllm.worker.model_runner import ModelRunner
-from vllm.lora.request import LoRARequest
-from vllm.utils import is_hip


 class Worker:
@@ -40,6 +39,7 @@ class Worker:
        rank: int,
        distributed_init_method: str,
        lora_config: Optional[LoRAConfig] = None,
+        vision_language_config: Optional[VisionLanguageConfig] = None,
        kv_cache_dtype: Optional[str] = "auto",
        is_driver_worker: bool = False,
    ) -> None:
@@ -55,21 +55,27 @@ class Worker:
        if self.is_driver_worker:
            assert self.rank == 0, "The driver worker must have rank 0."

-        self.model_runner = ModelRunner(model_config,
-                                        parallel_config,
-                                        scheduler_config,
-                                        device_config,
-                                        lora_config=self.lora_config,
-                                        kv_cache_dtype=kv_cache_dtype,
-                                        is_driver_worker=is_driver_worker)
+        self.vision_language_config = vision_language_config
+        if self.vision_language_config:
+            assert not self.lora_config, (
+                "To be tested: vision language model with LoRA settings.")
+
+        self.model_runner = ModelRunner(
+            model_config,
+            parallel_config,
+            scheduler_config,
+            device_config,
+            lora_config=self.lora_config,
+            kv_cache_dtype=kv_cache_dtype,
+            is_driver_worker=is_driver_worker,
+            vision_language_config=vision_language_config)
        # Uninitialized cache engine. Will be initialized by
        # self.init_cache_engine().
        self.cache_config = None
        self.cache_engine = None
-        self.cache_events = None
        self.gpu_cache = None

-    def init_model(self, cupy_port: Optional[int] = None) -> None:
+    def init_device(self) -> None:
        if self.device_config.device.type == "cuda":
            # torch.distributed.all_reduce does not free the input tensor until
            # the synchronization point. This causes the memory usage to grow
@@ -92,8 +98,9 @@ class Worker:
                f"Not support device type: {self.device_config.device}")
        # Initialize the distributed environment.
        init_distributed_environment(self.parallel_config, self.rank,
-                                     cupy_port, self.distributed_init_method)
-        # Initialize the model.
+                                     self.distributed_init_method,
+                                     self.local_rank)
+        # Set random seed.
        set_random_seed(self.model_config.seed)

    def load_model(self):
@@ -130,9 +137,12 @@ class Worker:
        # NOTE(woosuk): Here we assume that the other processes using the same
        # GPU did not change their memory usage during the profiling.
        peak_memory = self.init_gpu_memory - free_gpu_memory
+        assert peak_memory > 0, (
+            "Error in memory profiling. This happens when the GPU memory was "
+            "not properly cleaned up before initializing the vLLM instance.")

-        cache_block_size = CacheEngine.get_cache_block_size(
-            block_size, cache_dtype, self.model_config, self.parallel_config)
+        cache_block_size = self.get_cache_block_size_bytes(
+            block_size, cache_dtype)
        num_gpu_blocks = int(
            (total_gpu_memory * gpu_memory_utilization - peak_memory) //
            cache_block_size)
@@ -149,7 +159,6 @@ class Worker:
        self.cache_config = cache_config
        self.cache_engine = CacheEngine(self.cache_config, self.model_config,
                                        self.parallel_config)
-        self.cache_events = self.cache_engine.events
        self.gpu_cache = self.cache_engine.gpu_cache
        self.model_runner.set_block_size(self.cache_engine.block_size)

@@ -167,24 +176,13 @@ class Worker:
        blocks_to_copy: Dict[int, List[int]],
    ) -> None:
        # Issue cache operations.
-        issued_cache_op = False
+        # TODO(woosuk): Profile swapping overhead and optimize if needed.
        if blocks_to_swap_in:
            self.cache_engine.swap_in(blocks_to_swap_in)
-            issued_cache_op = True
        if blocks_to_swap_out:
            self.cache_engine.swap_out(blocks_to_swap_out)
-            issued_cache_op = True
        if blocks_to_copy:
            self.cache_engine.copy(blocks_to_copy)
-            issued_cache_op = True
-
-        cache_events = self.cache_events if issued_cache_op else None
-
-        # Wait for cache operations to finish.
-        # TODO(woosuk): Profile swapping overhead and optimize if needed.
-        if cache_events is not None:
-            for event in cache_events:
-                event.wait()

    @torch.inference_mode()
    def execute_model(
@@ -233,12 +231,28 @@ class Worker:
    def list_loras(self) -> Set[int]:
        return self.model_runner.list_loras()

+    @property
+    def max_model_len(self) -> int:
+        return self.model_config.max_model_len
+
+    @property
+    def vocab_size(self) -> int:
+        return self.model_runner.vocab_size
+
+    def get_cache_block_size_bytes(self, block_size: int,
+                                   cache_dtype: str) -> int:
+        """Get the size of the KV cache block size in bytes.
+        """
+        return CacheEngine.get_cache_block_size(block_size, cache_dtype,
+                                                self.model_config,
+                                                self.parallel_config)
+

 def init_distributed_environment(
    parallel_config: ParallelConfig,
    rank: int,
-    cupy_port: Optional[int],
    distributed_init_method: Optional[str] = None,
+    local_rank: int = -1,
 ) -> None:
    """Initialize the distributed environment."""
    if torch.distributed.is_initialized():
@@ -260,29 +274,27 @@ def init_distributed_environment(
            init_method=distributed_init_method,
        )

-    if cupy_utils.is_initialized():
-        cupy_world_size = cupy_utils.get_world_size()
-        if cupy_world_size != parallel_config.world_size:
+    if pynccl_utils.is_initialized():
+        pynccl_world_size = pynccl_utils.get_world_size()
+        if pynccl_world_size != parallel_config.world_size:
            raise RuntimeError(
-                "cupy.distributed is already initialized but the cupy world "
+                "pynccl is already initialized but the pynccl world "
                "size does not match parallel_config.world_size "
-                f"({cupy_world_size} vs. {parallel_config.world_size}).")
-    elif (parallel_config.world_size > 1 and cupy_port is not None
-          and not is_hip()):
-        # NOTE(woosuk): We don't initialize CuPy process group when world size
+                f"({pynccl_world_size} vs. {parallel_config.world_size}).")
+    elif parallel_config.world_size > 1:
+        # NOTE(woosuk): We don't initialize pynccl process group when world size
        # is 1.
-        # TODO(woosuk): Support multi-node connection.
-        cupy_utils.init_process_group(
+        pynccl_utils.init_process_group(
            world_size=parallel_config.world_size,
+            local_rank=local_rank,
            rank=rank,
-            host="localhost",
-            port=cupy_port,
+            init_method=distributed_init_method,
        )

    # A small all_reduce for warmup.
    torch.distributed.all_reduce(torch.zeros(1).cuda())
-    if cupy_utils.is_initialized():
-        cupy_utils.all_reduce(torch.zeros(1).cuda())
+    if pynccl_utils.is_initialized():
+        pynccl_utils.all_reduce(torch.zeros(1).cuda())
    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
                                      parallel_config.pipeline_parallel_size)