Commit 1b14cd54 authored by zhuwenwen

merge main

parents 726ed56c 1db83e31
from typing import List
from vllm.sequence import SequenceGroup
class Policy:
def get_priority(
self,
now: float,
seq_group: SequenceGroup,
) -> float:
raise NotImplementedError
def sort_by_priority(
self,
now: float,
seq_groups: List[SequenceGroup],
) -> List[SequenceGroup]:
return sorted(
seq_groups,
key=lambda seq_group: self.get_priority(now, seq_group),
reverse=True,
)
class FCFS(Policy):
def get_priority(
self,
now: float,
seq_group: SequenceGroup,
) -> float:
return now - seq_group.arrival_time
class PolicyFactory:
_POLICY_REGISTRY = {
'fcfs': FCFS,
}
@classmethod
def get_policy(cls, policy_name: str, **kwargs) -> Policy:
return cls._POLICY_REGISTRY[policy_name](**kwargs)
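# --- Illustrative sketch (not part of the library) ---------------------------
# A minimal example of how the FCFS policy above orders sequence groups: the
# group that has been waiting the longest gets the highest priority. The
# SimpleNamespace stand-ins are an assumption made purely for illustration;
# real callers pass vllm.sequence.SequenceGroup objects.
def _example_fcfs_ordering():
    import time
    from types import SimpleNamespace

    policy = PolicyFactory.get_policy(policy_name="fcfs")
    now = time.monotonic()
    # Fake "sequence groups" that only carry the arrival_time attribute
    # FCFS.get_priority actually reads.
    older = SimpleNamespace(arrival_time=now - 10.0)
    newer = SimpleNamespace(arrival_time=now - 1.0)
    ordered = policy.sort_by_priority(now, [newer, older])
    assert ordered[0] is older  # first come, first served
    return ordered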
import enum
import time
from typing import Dict, Iterable, List, Optional, Tuple, Union
from vllm.config import CacheConfig, SchedulerConfig
from vllm.core.block_manager import AllocStatus, BlockSpaceManager
from vllm.core.policy import PolicyFactory
from vllm.logger import init_logger
from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
SequenceGroupMetadata, SequenceStatus)
logger = init_logger(__name__)
class PreemptionMode(enum.Enum):
"""Preemption modes.
1. Swapping: Swap out the blocks of the preempted sequences to CPU memory
and swap them back in when the sequences are resumed.
2. Recomputation: Discard the blocks of the preempted sequences and
recompute them when the sequences are resumed, treating the sequences as
new prompts.
"""
SWAP = enum.auto()
RECOMPUTE = enum.auto()
class SchedulerOutputs:
def __init__(
self,
scheduled_seq_groups: List[SequenceGroup],
prompt_run: bool,
num_batched_tokens: int,
blocks_to_swap_in: Dict[int, int],
blocks_to_swap_out: Dict[int, int],
blocks_to_copy: Dict[int, List[int]],
ignored_seq_groups: List[SequenceGroup],
) -> None:
self.scheduled_seq_groups = scheduled_seq_groups
self.prompt_run = prompt_run
self.num_batched_tokens = num_batched_tokens
self.blocks_to_swap_in = blocks_to_swap_in
self.blocks_to_swap_out = blocks_to_swap_out
self.blocks_to_copy = blocks_to_copy
# Swap in and swap out should never happen at the same time.
assert not (blocks_to_swap_in and blocks_to_swap_out)
self.ignored_seq_groups = ignored_seq_groups
def is_empty(self) -> bool:
# NOTE: We do not consider the ignored sequence groups.
return (not self.scheduled_seq_groups and not self.blocks_to_swap_in
and not self.blocks_to_swap_out and not self.blocks_to_copy)
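# --- Illustrative sketch (not part of the library) ---------------------------
# A quick check of the SchedulerOutputs contract above: an output with no
# scheduled groups and no block operations reports itself as empty. Note that
# the constructor also asserts that swap-in and swap-out never happen in the
# same step.
def _example_empty_scheduler_outputs():
    outputs = SchedulerOutputs(
        scheduled_seq_groups=[],
        prompt_run=False,
        num_batched_tokens=0,
        blocks_to_swap_in={},
        blocks_to_swap_out={},
        blocks_to_copy={},
        ignored_seq_groups=[],
    )
    assert outputs.is_empty()
    return outputs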
class Scheduler:
def __init__(
self,
scheduler_config: SchedulerConfig,
cache_config: CacheConfig,
) -> None:
self.scheduler_config = scheduler_config
self.cache_config = cache_config
self.prompt_limit = min(self.scheduler_config.max_model_len,
self.scheduler_config.max_num_batched_tokens)
# Instantiate the scheduling policy.
self.policy = PolicyFactory.get_policy(policy_name="fcfs")
# Create the block space manager.
self.block_manager = BlockSpaceManager(
block_size=self.cache_config.block_size,
num_gpu_blocks=self.cache_config.num_gpu_blocks,
num_cpu_blocks=self.cache_config.num_cpu_blocks,
sliding_window=self.cache_config.sliding_window)
# TODO(zhuohan): Use deque instead of list for better performance.
# Sequence groups in the WAITING state.
self.waiting: List[SequenceGroup] = []
# Sequence groups in the RUNNING state.
self.running: List[SequenceGroup] = []
# Sequence groups in the SWAPPED state.
self.swapped: List[SequenceGroup] = []
def add_seq_group(self, seq_group: SequenceGroup) -> None:
# Add sequence groups to the waiting queue.
self.waiting.append(seq_group)
def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
if isinstance(request_id, str):
request_id = (request_id, )
request_ids = set(request_id)
for state_queue in [self.waiting, self.running, self.swapped]:
# We need to reverse the list as we are removing elements
# from it as we iterate over it. If we don't do it,
# indices will get messed up and we will skip over elements.
for seq_group in reversed(state_queue):
if seq_group.request_id in request_ids:
# Remove the sequence group from the state queue.
state_queue.remove(seq_group)
for seq in seq_group.get_seqs():
if seq.is_finished():
continue
seq.status = SequenceStatus.FINISHED_ABORTED
self.free_seq(seq)
request_ids.remove(seq_group.request_id)
if not request_ids:
return
def has_unfinished_seqs(self) -> bool:
return self.waiting or self.running or self.swapped
def get_num_unfinished_seq_groups(self) -> int:
return len(self.waiting) + len(self.running) + len(self.swapped)
def _schedule(self) -> SchedulerOutputs:
# Blocks that need to be swapped or copied before model execution.
blocks_to_swap_in: Dict[int, int] = {}
blocks_to_swap_out: Dict[int, int] = {}
blocks_to_copy: Dict[int, List[int]] = {}
# Fix the current time.
now = time.monotonic()
# Join waiting sequences if possible.
if not self.swapped:
ignored_seq_groups: List[SequenceGroup] = []
scheduled: List[SequenceGroup] = []
# The total number of sequences on the fly, including the
# requests in the generation phase.
num_curr_seqs = sum(seq_group.get_max_num_running_seqs()
for seq_group in self.running)
seq_lens: List[int] = []
# Optimization: We do not sort the waiting queue since the preempted
# sequence groups are added to the front and the new sequence groups
# are added to the back.
while self.waiting:
seq_group = self.waiting[0]
waiting_seqs = seq_group.get_seqs(
status=SequenceStatus.WAITING)
assert len(waiting_seqs) == 1, (
"Waiting sequence group should have only one prompt "
"sequence.")
num_prompt_tokens = waiting_seqs[0].get_len()
if num_prompt_tokens > self.prompt_limit:
logger.warning(
f"Input prompt ({num_prompt_tokens} tokens) is too long"
f" and exceeds limit of {self.prompt_limit}")
for seq in waiting_seqs:
seq.status = SequenceStatus.FINISHED_IGNORED
ignored_seq_groups.append(seq_group)
self.waiting.pop(0)
continue
# If the sequence group cannot be allocated, stop.
can_allocate = self.block_manager.can_allocate(seq_group)
if can_allocate == AllocStatus.LATER:
break
elif can_allocate == AllocStatus.NEVER:
logger.warning(
f"Input prompt ({num_prompt_tokens} tokens) is too long"
f" and exceeds the capacity of block_manager")
for seq in waiting_seqs:
seq.status = SequenceStatus.FINISHED_IGNORED
ignored_seq_groups.append(seq_group)
self.waiting.pop(0)
continue
# If the number of batched tokens exceeds the limit, stop.
new_seq_lens = seq_lens + [num_prompt_tokens]
num_batched_tokens = len(new_seq_lens) * max(new_seq_lens)
if (num_batched_tokens >
self.scheduler_config.max_num_batched_tokens):
break
# The total number of sequences in the RUNNING state should not
# exceed the maximum number of sequences.
num_new_seqs = seq_group.get_max_num_running_seqs()
if (num_curr_seqs + num_new_seqs >
self.scheduler_config.max_num_seqs):
break
num_paddings = num_batched_tokens - sum(new_seq_lens)
if num_paddings > self.scheduler_config.max_paddings:
break
seq_lens = new_seq_lens
seq_group = self.waiting.pop(0)
self._allocate(seq_group)
self.running.append(seq_group)
num_curr_seqs += num_new_seqs
scheduled.append(seq_group)
if scheduled or ignored_seq_groups:
scheduler_outputs = SchedulerOutputs(
scheduled_seq_groups=scheduled,
prompt_run=True,
num_batched_tokens=len(seq_lens) *
max(seq_lens) if seq_lens else 0,
blocks_to_swap_in=blocks_to_swap_in,
blocks_to_swap_out=blocks_to_swap_out,
blocks_to_copy=blocks_to_copy,
ignored_seq_groups=ignored_seq_groups,
)
return scheduler_outputs
# NOTE(woosuk): Preemption happens only when there is no available slot
# to keep all the sequence groups in the RUNNING state.
# In this case, the policy is responsible for deciding which sequence
# groups to preempt.
self.running = self.policy.sort_by_priority(now, self.running)
# Reserve new token slots for the running sequence groups.
running: List[SequenceGroup] = []
preempted: List[SequenceGroup] = []
while self.running:
seq_group = self.running.pop(0)
while not self.block_manager.can_append_slot(seq_group):
if self.running:
# Preempt the lowest-priority sequence groups.
victim_seq_group = self.running.pop(-1)
self._preempt(victim_seq_group, blocks_to_swap_out)
preempted.append(victim_seq_group)
else:
# No other sequence groups can be preempted.
# Preempt the current sequence group.
self._preempt(seq_group, blocks_to_swap_out)
preempted.append(seq_group)
break
else:
# Append new slots to the sequence group.
self._append_slot(seq_group, blocks_to_copy)
running.append(seq_group)
self.running = running
# Swap in the sequence groups in the SWAPPED state if possible.
self.swapped = self.policy.sort_by_priority(now, self.swapped)
if not preempted:
num_curr_seqs = sum(seq_group.get_max_num_running_seqs()
for seq_group in self.running)
while self.swapped:
seq_group = self.swapped[0]
# If the sequence group cannot be swapped in, stop.
if not self.block_manager.can_swap_in(seq_group):
break
# The total number of sequences in the RUNNING state should not
# exceed the maximum number of sequences.
num_new_seqs = seq_group.get_max_num_running_seqs()
if (num_curr_seqs + num_new_seqs >
self.scheduler_config.max_num_seqs):
break
seq_group = self.swapped.pop(0)
self._swap_in(seq_group, blocks_to_swap_in)
self._append_slot(seq_group, blocks_to_copy)
num_curr_seqs += num_new_seqs
self.running.append(seq_group)
# Each sequence in the generation phase only takes one token slot.
# Therefore, the number of batched tokens is equal to the number of
# sequences in the RUNNING state.
num_batched_tokens = sum(
seq_group.num_seqs(status=SequenceStatus.RUNNING)
for seq_group in self.running)
scheduler_outputs = SchedulerOutputs(
scheduled_seq_groups=self.running,
prompt_run=False,
num_batched_tokens=num_batched_tokens,
blocks_to_swap_in=blocks_to_swap_in,
blocks_to_swap_out=blocks_to_swap_out,
blocks_to_copy=blocks_to_copy,
ignored_seq_groups=[],
)
return scheduler_outputs
def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:
# Schedule sequence groups.
# This function call changes the internal states of the scheduler
# such as self.running, self.swapped, and self.waiting.
scheduler_outputs = self._schedule()
# Create input data structures.
seq_group_metadata_list: List[SequenceGroupMetadata] = []
for seq_group in scheduler_outputs.scheduled_seq_groups:
seq_data: Dict[int, SequenceData] = {}
block_tables: Dict[int, List[int]] = {}
for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
seq_id = seq.seq_id
seq_data[seq_id] = seq.data
block_tables[seq_id] = self.block_manager.get_block_table(seq)
seq_group_metadata = SequenceGroupMetadata(
request_id=seq_group.request_id,
is_prompt=scheduler_outputs.prompt_run,
seq_data=seq_data,
sampling_params=seq_group.sampling_params,
block_tables=block_tables,
)
seq_group_metadata_list.append(seq_group_metadata)
return seq_group_metadata_list, scheduler_outputs
def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None:
self.block_manager.fork(parent_seq, child_seq)
def free_seq(self, seq: Sequence) -> None:
self.block_manager.free(seq)
def free_finished_seq_groups(self) -> None:
self.running = [
seq_group for seq_group in self.running
if not seq_group.is_finished()
]
def _allocate(self, seq_group: SequenceGroup) -> None:
self.block_manager.allocate(seq_group)
for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
seq.status = SequenceStatus.RUNNING
def _append_slot(
self,
seq_group: SequenceGroup,
blocks_to_copy: Dict[int, List[int]],
) -> None:
for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
ret = self.block_manager.append_slot(seq)
if ret is not None:
src_block, dst_block = ret
if src_block in blocks_to_copy:
blocks_to_copy[src_block].append(dst_block)
else:
blocks_to_copy[src_block] = [dst_block]
def _preempt(
self,
seq_group: SequenceGroup,
blocks_to_swap_out: Dict[int, int],
preemption_mode: Optional[PreemptionMode] = None,
) -> None:
# If preemption mode is not specified, we determine the mode as follows:
# We use recomputation by default since it incurs lower overhead than
# swapping. However, when the sequence group has multiple sequences
# (e.g., beam search), recomputation is not currently supported. In
# such a case, we use swapping instead.
# FIXME(woosuk): This makes our scheduling policy a bit bizarre.
# As swapped sequences are prioritized over waiting sequences,
# sequence groups with multiple sequences are implicitly prioritized
# over sequence groups with a single sequence.
# TODO(woosuk): Support recomputation for sequence groups with multiple
# sequences. This may require a more sophisticated CUDA kernel.
if preemption_mode is None:
if seq_group.get_max_num_running_seqs() == 1:
preemption_mode = PreemptionMode.RECOMPUTE
else:
preemption_mode = PreemptionMode.SWAP
if preemption_mode == PreemptionMode.RECOMPUTE:
self._preempt_by_recompute(seq_group)
elif preemption_mode == PreemptionMode.SWAP:
self._preempt_by_swap(seq_group, blocks_to_swap_out)
else:
raise AssertionError("Invalid preemption mode.")
def _preempt_by_recompute(
self,
seq_group: SequenceGroup,
) -> None:
seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
assert len(seqs) == 1
for seq in seqs:
seq.status = SequenceStatus.WAITING
self.block_manager.free(seq)
# NOTE: For FCFS, we insert the preempted sequence group to the front
# of the waiting queue.
self.waiting.insert(0, seq_group)
def _preempt_by_swap(
self,
seq_group: SequenceGroup,
blocks_to_swap_out: Dict[int, int],
) -> None:
self._swap_out(seq_group, blocks_to_swap_out)
self.swapped.append(seq_group)
def _swap_in(
self,
seq_group: SequenceGroup,
blocks_to_swap_in: Dict[int, int],
) -> None:
mapping = self.block_manager.swap_in(seq_group)
blocks_to_swap_in.update(mapping)
for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
seq.status = SequenceStatus.RUNNING
def _swap_out(
self,
seq_group: SequenceGroup,
blocks_to_swap_out: Dict[int, int],
) -> None:
if not self.block_manager.can_swap_out(seq_group):
# FIXME(woosuk): Abort the sequence group instead of aborting the
# entire engine.
raise RuntimeError(
"Aborted due to the lack of CPU swap space. Please increase "
"the swap space to avoid this error.")
mapping = self.block_manager.swap_out(seq_group)
blocks_to_swap_out.update(mapping)
for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
seq.status = SequenceStatus.SWAPPED
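# --- Illustrative sketch (not part of the library) ---------------------------
# A minimal end-to-end walk through the Scheduler above: build the two configs
# it needs, enqueue a single-prompt sequence group, and run one scheduling
# pass. The config values and the hard-coded block counts are arbitrary
# assumptions; in the engine they come from EngineArgs and from memory
# profiling (see LLMEngine._init_cache below).
def _example_schedule_one_prompt():
    import time

    from vllm.config import CacheConfig, SchedulerConfig
    from vllm.sampling_params import SamplingParams
    from vllm.sequence import Sequence, SequenceGroup

    # CacheConfig(block_size, gpu_memory_utilization, swap_space, sliding_window)
    cache_config = CacheConfig(16, 0.9, 1, None)
    # Normally filled in by memory profiling; hard-coded for the sketch.
    cache_config.num_gpu_blocks = 1024
    cache_config.num_cpu_blocks = 256
    # SchedulerConfig(max_num_batched_tokens, max_num_seqs, max_model_len,
    #                 max_paddings)
    scheduler_config = SchedulerConfig(2048, 256, 2048, 256)
    scheduler = Scheduler(scheduler_config, cache_config)

    # One waiting sequence group with a 4-token prompt.
    seq = Sequence(0, "hello", [1, 2, 3, 4], cache_config.block_size)
    seq_group = SequenceGroup("request-0", [seq], SamplingParams(),
                              time.monotonic())
    scheduler.add_seq_group(seq_group)

    # One scheduling pass: the waiting group is admitted as a prompt run.
    seq_group_metadata_list, scheduler_outputs = scheduler.schedule()
    assert scheduler_outputs.prompt_run
    return seq_group_metadata_list, scheduler_outputs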
import argparse
import dataclasses
from dataclasses import dataclass
from typing import Optional, Tuple
from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
SchedulerConfig)
@dataclass
class EngineArgs:
"""Arguments for vLLM engine."""
model: str
tokenizer: Optional[str] = None
tokenizer_mode: str = 'auto'
trust_remote_code: bool = False
download_dir: Optional[str] = None
load_format: str = 'auto'
dtype: str = 'auto'
seed: int = 0
max_model_len: Optional[int] = None
worker_use_ray: bool = False
pipeline_parallel_size: int = 1
tensor_parallel_size: int = 1
max_parallel_loading_workers: Optional[int] = None
block_size: int = 16
swap_space: int = 4 # GiB
gpu_memory_utilization: float = 0.90
max_num_batched_tokens: Optional[int] = None
max_num_seqs: int = 256
max_paddings: int = 256
disable_log_stats: bool = False
revision: Optional[str] = None
tokenizer_revision: Optional[str] = None
quantization: Optional[str] = None
enforce_eager: bool = False
max_context_len_to_capture: int = 8192
def __post_init__(self):
if self.tokenizer is None:
self.tokenizer = self.model
@staticmethod
def add_cli_args(
parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
"""Shared CLI arguments for vLLM engine."""
# NOTE: If you update any of the arguments below, please also
# make sure to update docs/source/models/engine_args.rst
# Model arguments
parser.add_argument(
'--model',
type=str,
default='facebook/opt-125m',
help='name or path of the huggingface model to use')
parser.add_argument(
'--tokenizer',
type=str,
default=EngineArgs.tokenizer,
help='name or path of the huggingface tokenizer to use')
parser.add_argument(
'--revision',
type=str,
default=None,
help='the specific model version to use. It can be a branch '
'name, a tag name, or a commit id. If unspecified, will use '
'the default version.')
parser.add_argument(
'--tokenizer-revision',
type=str,
default=None,
help='the specific tokenizer version to use. It can be a branch '
'name, a tag name, or a commit id. If unspecified, will use '
'the default version.')
parser.add_argument('--tokenizer-mode',
type=str,
default=EngineArgs.tokenizer_mode,
choices=['auto', 'slow'],
help='tokenizer mode. "auto" will use the fast '
'tokenizer if available, and "slow" will '
'always use the slow tokenizer.')
parser.add_argument('--trust-remote-code',
action='store_true',
help='trust remote code from huggingface')
parser.add_argument('--download-dir',
type=str,
default=EngineArgs.download_dir,
help='directory to download and load the weights, '
'defaults to the default cache dir of '
'huggingface')
parser.add_argument(
'--load-format',
type=str,
default=EngineArgs.load_format,
choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
help='The format of the model weights to load. '
'"auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
'is not available. '
'"pt" will load the weights in the pytorch bin format. '
'"safetensors" will load the weights in the safetensors format. '
'"npcache" will load the weights in pytorch format and store '
'a numpy cache to speed up the loading. '
'"dummy" will initialize the weights with random values, '
'which is mainly for profiling.')
parser.add_argument(
'--dtype',
type=str,
default=EngineArgs.dtype,
choices=[
'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
],
help='data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models.')
parser.add_argument('--max-model-len',
type=int,
default=None,
help='model context length. If unspecified, '
'will be automatically derived from the model.')
# Parallel arguments
parser.add_argument('--worker-use-ray',
action='store_true',
help='use Ray for distributed serving, will be '
'automatically set when using more than 1 GPU')
parser.add_argument('--pipeline-parallel-size',
'-pp',
type=int,
default=EngineArgs.pipeline_parallel_size,
help='number of pipeline stages')
parser.add_argument('--tensor-parallel-size',
'-tp',
type=int,
default=EngineArgs.tensor_parallel_size,
help='number of tensor parallel replicas')
parser.add_argument(
'--max-parallel-loading-workers',
type=int,
help='load model sequentially in multiple batches, '
'to avoid RAM OOM when using tensor '
'parallelism with large models')
# KV cache arguments
parser.add_argument('--block-size',
type=int,
default=EngineArgs.block_size,
choices=[8, 16, 32],
help='token block size')
# TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
parser.add_argument('--seed',
type=int,
default=EngineArgs.seed,
help='random seed')
parser.add_argument('--swap-space',
type=int,
default=EngineArgs.swap_space,
help='CPU swap space size (GiB) per GPU')
parser.add_argument(
'--gpu-memory-utilization',
type=float,
default=EngineArgs.gpu_memory_utilization,
help='the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1. '
'If unspecified, will use the default value of 0.9.')
parser.add_argument('--max-num-batched-tokens',
type=int,
default=EngineArgs.max_num_batched_tokens,
help='maximum number of batched tokens per '
'iteration')
parser.add_argument('--max-num-seqs',
type=int,
default=EngineArgs.max_num_seqs,
help='maximum number of sequences per iteration')
parser.add_argument('--max-paddings',
type=int,
default=EngineArgs.max_paddings,
help='maximum number of paddings in a batch')
parser.add_argument('--disable-log-stats',
action='store_true',
help='disable logging statistics')
# Quantization settings.
parser.add_argument('--quantization',
'-q',
type=str,
choices=['awq', 'gptq', 'squeezellm', None],
default=None,
help='Method used to quantize the weights. If '
'None, we first check the `quantization_config` '
'attribute in the model config file. If that is '
'None, we assume the model weights are not '
'quantized and use `dtype` to determine the data '
'type of the weights.')
parser.add_argument('--enforce-eager',
action='store_true',
help='Always use eager-mode PyTorch. If False, '
'will use eager mode and CUDA graph in hybrid '
'for maximal performance and flexibility.')
parser.add_argument('--max-context-len-to-capture',
type=int,
default=EngineArgs.max_context_len_to_capture,
help='maximum context length covered by CUDA '
'graphs. When a sequence has context length '
'larger than this, we fall back to eager mode.')
return parser
@classmethod
def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
# Get the list of attributes of this dataclass.
attrs = [attr.name for attr in dataclasses.fields(cls)]
# Set the attributes from the parsed arguments.
engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
return engine_args
def create_engine_configs(
self,
) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig]:
model_config = ModelConfig(self.model, self.tokenizer,
self.tokenizer_mode, self.trust_remote_code,
self.download_dir, self.load_format,
self.dtype, self.seed, self.revision,
self.tokenizer_revision, self.max_model_len,
self.quantization, self.enforce_eager,
self.max_context_len_to_capture)
cache_config = CacheConfig(self.block_size,
self.gpu_memory_utilization,
self.swap_space,
model_config.get_sliding_window())
parallel_config = ParallelConfig(self.pipeline_parallel_size,
self.tensor_parallel_size,
self.worker_use_ray,
self.max_parallel_loading_workers)
scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
self.max_num_seqs,
model_config.max_model_len,
self.max_paddings)
return model_config, cache_config, parallel_config, scheduler_config
@dataclass
class AsyncEngineArgs(EngineArgs):
"""Arguments for asynchronous vLLM engine."""
engine_use_ray: bool = False
disable_log_requests: bool = False
max_log_len: Optional[int] = None
@staticmethod
def add_cli_args(
parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
parser = EngineArgs.add_cli_args(parser)
parser.add_argument('--engine-use-ray',
action='store_true',
help='use Ray to start the LLM engine in a '
'separate process from the server process.')
parser.add_argument('--disable-log-requests',
action='store_true',
help='disable logging requests')
parser.add_argument('--max-log-len',
type=int,
default=None,
help='max number of prompt characters or prompt '
'ID numbers being printed in log. '
'Default: unlimited.')
return parser
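# --- Illustrative sketch (not part of the library) ---------------------------
# How the argument plumbing above is typically used: register the shared CLI
# flags on a parser, parse a command line, rebuild the dataclass from the
# namespace, and expand it into the engine configs. The flag values are
# arbitrary examples; create_engine_configs() instantiates ModelConfig, so it
# needs the Hugging Face config for the chosen model to be reachable.
def _example_parse_engine_args():
    parser = argparse.ArgumentParser(description="vLLM engine demo")
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args([
        "--model", "facebook/opt-125m",
        "--max-num-seqs", "128",
        "--disable-log-requests",
    ])
    engine_args = AsyncEngineArgs.from_cli_args(args)
    # Expands into (ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig).
    engine_configs = engine_args.create_engine_configs()
    return engine_args, engine_configs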
import asyncio
import time
from functools import partial
from typing import (Any, Dict, Iterable, List, Optional, Set, Tuple, Type,
Union, AsyncIterator)
from vllm.config import ModelConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.engine.ray_utils import initialize_cluster, ray
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
logger = init_logger(__name__)
class AsyncEngineDeadError(RuntimeError):
pass
def _raise_exception_on_finish(task: asyncio.Task,
request_tracker: "RequestTracker") -> None:
msg = ("Task finished unexpectedly. This should never happen! "
"Please open an issue on Github.")
try:
try:
task.result()
except asyncio.CancelledError:
return
except Exception as exc:
raise AsyncEngineDeadError(
msg + " See stack trace above for the actual cause.") from exc
raise AsyncEngineDeadError(msg)
except Exception as exc:
request_tracker.propagate_exception(exc)
raise exc
class AsyncStream:
"""A stream of RequestOutputs for a request that can be
iterated over asynchronously."""
def __init__(self, request_id: str) -> None:
self.request_id = request_id
self._queue = asyncio.Queue()
self._finished = False
def put(self, item: RequestOutput) -> None:
if self._finished:
return
self._queue.put_nowait(item)
def finish(self) -> None:
self._queue.put_nowait(StopIteration)
self._finished = True
@property
def finished(self) -> bool:
return self._finished
def __aiter__(self):
return self
async def __anext__(self) -> RequestOutput:
result = await self._queue.get()
if result is StopIteration:
raise StopAsyncIteration
elif isinstance(result, Exception):
raise result
return result
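# --- Illustrative sketch (not part of the library) ---------------------------
# How an AsyncStream is consumed: the producer put()s items and finally calls
# finish(); the consumer simply iterates with `async for`. Real producers put
# RequestOutput objects; the plain strings below are stand-ins used only to
# keep the sketch self-contained. Run it with
# asyncio.run(_example_consume_stream()).
async def _example_consume_stream():
    stream = AsyncStream("request-0")
    stream.put("partial output")  # stand-in for a RequestOutput
    stream.put("final output")    # stand-in for a RequestOutput
    stream.finish()               # enqueues the sentinel that ends iteration
    collected = []
    async for item in stream:
        collected.append(item)
    return collected  # ["partial output", "final output"]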
class RequestTracker:
"""Synchronous abstraction for tracking requests."""
def __init__(self) -> None:
self._request_streams: Dict[str, AsyncStream] = {}
self._finished_requests: asyncio.Queue[str] = asyncio.Queue()
self._new_requests: asyncio.Queue[Tuple[AsyncStream,
dict]] = asyncio.Queue()
self.new_requests_event = None
def __contains__(self, item):
return item in self._request_streams
def init_event(self):
self.new_requests_event = asyncio.Event()
def propagate_exception(self,
exc: Exception,
request_id: Optional[str] = None) -> None:
"""Propagate an exception to request streams
(all if request_id is None)."""
if request_id is not None:
self._request_streams[request_id].put(exc)
else:
for stream in self._request_streams.values():
stream.put(exc)
def process_request_output(self,
request_output: RequestOutput,
*,
verbose: bool = False) -> None:
"""Process a request output from the engine."""
request_id = request_output.request_id
self._request_streams[request_id].put(request_output)
if request_output.finished:
if verbose:
logger.info(f"Finished request {request_id}.")
self.abort_request(request_id)
def add_request(self, request_id: str,
**engine_add_request_kwargs) -> AsyncStream:
"""Add a request to be sent to the engine on the next background
loop iteration."""
if request_id in self._request_streams:
raise KeyError(f"Request {request_id} already exists.")
stream = AsyncStream(request_id)
self._new_requests.put_nowait((stream, {
"request_id": request_id,
**engine_add_request_kwargs
}))
self.new_requests_event.set()
return stream
def abort_request(self, request_id: str, *, verbose: bool = False) -> None:
"""Abort a request during next background loop iteration."""
if verbose:
logger.info(f"Aborted request {request_id}.")
self._finished_requests.put_nowait(request_id)
if request_id not in self._request_streams or self._request_streams[
request_id].finished:
# The request has already finished or been aborted.
return
self._request_streams[request_id].finish()
def get_new_and_finished_requests(self) -> Tuple[List[Dict], Set[str]]:
"""Get the new requests and finished requests to be
sent to the engine."""
new_requests: List[Dict] = []
finished_requests: Set[str] = set()
while not self._finished_requests.empty():
request_id = self._finished_requests.get_nowait()
finished_requests.add(request_id)
self._request_streams.pop(request_id, None)
while not self._new_requests.empty():
stream, new_request = self._new_requests.get_nowait()
if stream.request_id in finished_requests:
# The request has already been aborted.
stream.finish()
continue
self._request_streams[stream.request_id] = stream
new_requests.append(new_request)
self.new_requests_event.clear()
return new_requests, finished_requests
async def wait_for_new_requests(self):
await self.new_requests_event.wait()
class _AsyncLLMEngine(LLMEngine):
"""Extension of LLMEngine to add async methods."""
async def step_async(self) -> List[RequestOutput]:
"""Performs one decoding iteration and returns newly generated results.
The workers are run asynchronously if possible.
This function performs one decoding iteration of the engine. It first
schedules the sequences to be executed in the next iteration and the
token blocks to be swapped in/out/copy. Then, it executes the model
and updates the scheduler with the model outputs. Finally, it decodes
the sequences and returns the newly generated results.
"""
seq_group_metadata_list, scheduler_outputs, ignored = self._schedule()
if scheduler_outputs.is_empty():
return ignored
# Execute the model.
output = await self._run_workers_async(
"execute_model",
seq_group_metadata_list=seq_group_metadata_list,
blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
blocks_to_copy=scheduler_outputs.blocks_to_copy,
)
return self._process_model_outputs(output, scheduler_outputs) + ignored
async def _run_workers_async(
self,
method: str,
*args,
get_all_outputs: bool = False,
**kwargs,
) -> Any:
"""Runs the given method on all workers."""
coros = []
for worker in self.workers:
if self.parallel_config.worker_use_ray:
coros.append(
worker.execute_method.remote(method, *args, **kwargs))
else:
executor = getattr(worker, method)
coros.append(asyncio.get_event_loop().run_in_executor(
None, partial(executor, *args, **kwargs)))
all_outputs = await asyncio.gather(*coros)
if get_all_outputs:
return all_outputs
# Make sure all workers have the same results.
output = all_outputs[0]
for other_output in all_outputs[1:]:
assert output == other_output
return output
class AsyncLLMEngine:
"""An asynchronous wrapper for LLMEngine.
This class is used to wrap the LLMEngine class to make it asynchronous. It
uses asyncio to create a background loop that keeps processing incoming
requests. The LLMEngine is kicked by the generate method when there
are requests in the waiting queue. The generate method yields the outputs
from the LLMEngine to the caller.
NOTE: For the comprehensive list of arguments, see `LLMEngine`.
Args:
worker_use_ray: Whether to use Ray for model workers. Required for
distributed execution. Should be the same as
`parallel_config.worker_use_ray`.
engine_use_ray: Whether to make LLMEngine a Ray actor. If so, the
async frontend will be executed in a separate process from the
model workers.
log_requests: Whether to log the requests.
start_engine_loop: If True, the background task to run the engine
will be automatically started in the generate call.
*args, **kwargs: Arguments for LLMEngine.
"""
_engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
def __init__(self,
worker_use_ray: bool,
engine_use_ray: bool,
*args,
log_requests: bool = True,
max_log_len: Optional[int] = None,
start_engine_loop: bool = True,
**kwargs) -> None:
self.worker_use_ray = worker_use_ray
self.engine_use_ray = engine_use_ray
self.log_requests = log_requests
self.max_log_len = max_log_len
self.engine = self._init_engine(*args, **kwargs)
self.background_loop = None
# We need to keep a reference to the unshielded
# task as well to prevent it from being garbage
# collected
self._background_loop_unshielded = None
self.start_engine_loop = start_engine_loop
self._request_tracker = RequestTracker()
@property
def is_running(self) -> bool:
return (self.background_loop is not None
and not self.background_loop.done())
def start_background_loop(self) -> None:
"""Start the background loop."""
if self.is_running:
raise RuntimeError("Background loop is already running.")
self._request_tracker.init_event()
self._background_loop_unshielded = asyncio.get_event_loop(
).create_task(self.run_engine_loop())
self._background_loop_unshielded.add_done_callback(
partial(_raise_exception_on_finish,
request_tracker=self._request_tracker))
self.background_loop = asyncio.shield(self._background_loop_unshielded)
def _init_engine(self, *args,
**kwargs) -> Union[_AsyncLLMEngine, "ray.ObjectRef"]:
if not self.engine_use_ray:
engine_class = self._engine_class
elif self.worker_use_ray:
engine_class = ray.remote(num_cpus=0)(self._engine_class).remote
else:
# FIXME(woosuk): This is a bit hacky. Be careful when changing the
# order of the arguments.
cache_config = args[1]
parallel_config = args[2]
if parallel_config.tensor_parallel_size == 1:
num_gpus = cache_config.gpu_memory_utilization
else:
num_gpus = 1
engine_class = ray.remote(num_gpus=num_gpus)(
self._engine_class).remote
return engine_class(*args, **kwargs)
async def engine_step(self) -> bool:
"""Kick the engine to process the waiting requests.
Returns True if there are in-progress requests."""
new_requests, finished_requests = (
self._request_tracker.get_new_and_finished_requests())
for new_request in new_requests:
# Add the request into the vLLM engine's waiting queue.
# TODO: Maybe add add_request_batch to reduce Ray overhead
if self.engine_use_ray:
await self.engine.add_request.remote(**new_request)
else:
self.engine.add_request(**new_request)
if finished_requests:
await self._engine_abort(finished_requests)
if self.engine_use_ray:
request_outputs = await self.engine.step.remote()
else:
request_outputs = await self.engine.step_async()
# Put the outputs into the corresponding streams.
for request_output in request_outputs:
self._request_tracker.process_request_output(
request_output, verbose=self.log_requests)
return len(request_outputs) > 0
async def _engine_abort(self, request_ids: Iterable[str]):
if self.engine_use_ray:
await self.engine.abort_request.remote(request_ids)
else:
self.engine.abort_request(request_ids)
async def run_engine_loop(self):
# Initialize the RequestTracker here so it uses the right event loop.
has_requests_in_progress = False
while True:
if not has_requests_in_progress:
await self._request_tracker.wait_for_new_requests()
has_requests_in_progress = await self.engine_step()
await asyncio.sleep(0)
async def add_request(
self,
request_id: str,
prompt: Optional[str],
sampling_params: SamplingParams,
prompt_token_ids: Optional[List[int]] = None,
arrival_time: Optional[float] = None,
) -> AsyncStream:
if self.log_requests:
shortened_prompt = prompt
shortened_token_ids = prompt_token_ids
if self.max_log_len is not None:
if shortened_prompt is not None:
shortened_prompt = shortened_prompt[:self.max_log_len]
if shortened_token_ids is not None:
shortened_token_ids = shortened_token_ids[:self.
max_log_len]
logger.info(f"Received request {request_id}: "
f"prompt: {shortened_prompt!r}, "
f"sampling params: {sampling_params}, "
f"prompt token ids: {shortened_token_ids}.")
if not self.is_running:
if self.start_engine_loop:
self.start_background_loop()
else:
raise AsyncEngineDeadError(
"Background loop is not running. If it was running, "
"inspect the output to find the stacktrace of the "
"error that caused the background loop to stop "
"(AsyncEngineDeadError).")
stream = self._request_tracker.add_request(
request_id,
prompt=prompt,
sampling_params=sampling_params,
prompt_token_ids=prompt_token_ids,
arrival_time=arrival_time)
return stream
async def generate(
self,
prompt: Optional[str],
sampling_params: SamplingParams,
request_id: str,
prompt_token_ids: Optional[List[int]] = None
) -> AsyncIterator[RequestOutput]:
"""Generate outputs for a request.
Generate outputs for a request. This method is a coroutine. It adds the
request into the waiting queue of the LLMEngine and streams the outputs
from the LLMEngine to the caller.
Args:
prompt: The prompt string. Can be None if prompt_token_ids is
provided.
sampling_params: The sampling parameters of the request.
request_id: The unique id of the request.
prompt_token_ids: The token IDs of the prompt. If None, we
use the tokenizer to convert the prompts to token IDs.
Yields:
The output `RequestOutput` objects from the LLMEngine for the
request.
"""
# Preprocess the request.
# This should not be used for logging, as it is monotonic time.
arrival_time = time.monotonic()
try:
stream = await self.add_request(request_id,
prompt,
sampling_params,
prompt_token_ids=prompt_token_ids,
arrival_time=arrival_time)
async for request_output in stream:
yield request_output
except (Exception, asyncio.CancelledError) as e:
# If there is an exception or coroutine is cancelled, abort the
# request.
self._abort(request_id)
raise e
async def abort(self, request_id: str) -> None:
"""Abort a request.
Abort a submitted request. If the request is finished or not found,
this method will be a no-op.
Args:
request_id: The unique id of the request.
"""
if not self.is_running:
raise AsyncEngineDeadError(
"Background loop is not running. If it was running, "
"inspect the output to find the stacktrace of the "
"error that caused the background loop to stop "
"(AsyncEngineDeadError).")
return self._abort(request_id)
def _abort(self, request_id: str) -> None:
"""Abort a request.
Abort a submitted request. If the request is finished or not found,
this method will be a no-op.
Args:
request_id: The unique id of the request.
"""
self._request_tracker.abort_request(request_id,
verbose=self.log_requests)
async def get_model_config(self) -> ModelConfig:
"""Get the model configuration of the vLLM engine."""
if self.engine_use_ray:
return await self.engine.get_model_config.remote()
else:
return self.engine.get_model_config()
@classmethod
def from_engine_args(cls,
engine_args: AsyncEngineArgs,
start_engine_loop: bool = True) -> "AsyncLLMEngine":
"""Creates an async LLM engine from the engine arguments."""
# Create the engine configs.
engine_configs = engine_args.create_engine_configs()
parallel_config = engine_configs[2]
# Initialize the cluster.
distributed_init_method, placement_group = initialize_cluster(
parallel_config, engine_args.engine_use_ray)
# Create the async LLM engine.
engine = cls(parallel_config.worker_use_ray,
engine_args.engine_use_ray,
*engine_configs,
distributed_init_method,
placement_group,
log_requests=not engine_args.disable_log_requests,
log_stats=not engine_args.disable_log_stats,
max_log_len=engine_args.max_log_len,
start_engine_loop=start_engine_loop)
return engine
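# --- Illustrative sketch (not part of the library) ---------------------------
# Typical online-serving usage of AsyncLLMEngine: build it from AsyncEngineArgs
# and stream RequestOutputs for a single request. Running this for real needs a
# GPU plus the model weights; the model name, prompt, and sampling values are
# arbitrary examples. Run it with asyncio.run(_example_generate_one_request()).
async def _example_generate_one_request():
    engine_args = AsyncEngineArgs(model="facebook/opt-125m")
    engine = AsyncLLMEngine.from_engine_args(engine_args)
    sampling_params = SamplingParams(temperature=0.8, max_tokens=16)
    final_output = None
    async for request_output in engine.generate(
            prompt="Hello, my name is",
            sampling_params=sampling_params,
            request_id="request-0"):
        # Each yielded RequestOutput carries the text generated so far.
        final_output = request_output
    return final_output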
import copy
import os
import time
from functools import partial
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union
from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
SchedulerConfig)
from vllm.core.scheduler import Scheduler, SchedulerOutputs
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.metrics import record_metrics
from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup,
SequenceGroupMetadata, SequenceGroupOutput,
SequenceOutput, SequenceStatus)
from vllm.transformers_utils.tokenizer import (detokenize_incrementally,
get_tokenizer)
from vllm.utils import Counter
if ray:
from ray.air.util.torch_dist import init_torch_dist_process_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
if TYPE_CHECKING:
from ray.util.placement_group import PlacementGroup
logger = init_logger(__name__)
_LOGGING_INTERVAL_SEC = 5
class LLMEngine:
"""An LLM engine that receives requests and generates texts.
This is the main class for the vLLM engine. It receives requests
from clients and generates texts from the LLM. It includes a tokenizer, a
language model (possibly distributed across multiple GPUs), and GPU memory
space allocated for intermediate states (aka KV cache). This class utilizes
iteration-level scheduling and efficient memory management to maximize the
serving throughput.
The `LLM` class wraps this class for offline batched inference and the
`AsyncLLMEngine` class wraps this class for online serving.
NOTE: The config arguments are derived from the `EngineArgs` class. For the
comprehensive list of arguments, see `EngineArgs`.
Args:
model_config: The configuration related to the LLM model.
cache_config: The configuration related to the KV cache memory
management.
parallel_config: The configuration related to distributed execution.
scheduler_config: The configuration related to the request scheduler.
distributed_init_method: The initialization method for distributed
execution. See `torch.distributed.init_process_group` for details.
placement_group: Ray placement group for distributed execution.
Required for distributed execution.
log_stats: Whether to log statistics.
"""
def __init__(
self,
model_config: ModelConfig,
cache_config: CacheConfig,
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
distributed_init_method: str,
placement_group: Optional["PlacementGroup"],
log_stats: bool,
) -> None:
logger.info(
"Initializing an LLM engine with config: "
f"model={model_config.model!r}, "
f"tokenizer={model_config.tokenizer!r}, "
f"tokenizer_mode={model_config.tokenizer_mode}, "
f"revision={model_config.revision}, "
f"tokenizer_revision={model_config.tokenizer_revision}, "
f"trust_remote_code={model_config.trust_remote_code}, "
f"dtype={model_config.dtype}, "
f"max_seq_len={model_config.max_model_len}, "
f"download_dir={model_config.download_dir!r}, "
f"load_format={model_config.load_format}, "
f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
f"quantization={model_config.quantization}, "
f"enforce_eager={model_config.enforce_eager}, "
f"seed={model_config.seed})")
# TODO(woosuk): Print more configs in debug mode.
self.model_config = model_config
self.cache_config = cache_config
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
self.log_stats = log_stats
self._verify_args()
self.tokenizer = get_tokenizer(
model_config.tokenizer,
tokenizer_mode=model_config.tokenizer_mode,
trust_remote_code=model_config.trust_remote_code,
tokenizer_revision=model_config.tokenizer_revision,
revision=model_config.revision)
self.seq_counter = Counter()
# Create the parallel GPU workers.
if self.parallel_config.worker_use_ray:
# Disable Ray usage stats collection.
ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0")
if ray_usage != "1":
os.environ["RAY_USAGE_STATS_ENABLED"] = "0"
self._init_workers_ray(placement_group)
else:
self._init_workers(distributed_init_method)
# Profile the memory usage and initialize the cache.
self._init_cache()
# Create the scheduler.
self.scheduler = Scheduler(scheduler_config, cache_config)
# Logging.
self.last_logging_time = 0.0
# List of (timestamp, num_tokens)
self.num_prompt_tokens: List[Tuple[float, int]] = []
# List of (timestamp, num_tokens)
self.num_generation_tokens: List[Tuple[float, int]] = []
def _init_workers(self, distributed_init_method: str):
# Lazy import the Worker to avoid importing torch.cuda/xformers
# before CUDA_VISIBLE_DEVICES is set in the Worker
from vllm.worker.worker import Worker
assert self.parallel_config.world_size == 1, (
"Ray is required if parallel_config.world_size > 1.")
self.workers: List[Worker] = []
worker = Worker(
self.model_config,
self.parallel_config,
self.scheduler_config,
0,
distributed_init_method,
)
self.workers.append(worker)
self._run_workers(
"init_model",
get_all_outputs=True,
)
self._run_workers(
"load_model",
get_all_outputs=True,
max_concurrent_workers=self.parallel_config.
max_parallel_loading_workers,
)
def _init_workers_ray(self, placement_group: "PlacementGroup",
**ray_remote_kwargs):
# Lazy import the Worker to avoid importing torch.cuda/xformers
# before CUDA_VISIBLE_DEVICES is set in the Worker
from vllm.worker.worker import Worker
self.workers: List[Worker] = []
for bundle in placement_group.bundle_specs:
if not bundle.get("GPU", 0):
continue
if self.parallel_config.tensor_parallel_size == 1:
num_gpus = self.cache_config.gpu_memory_utilization
else:
num_gpus = 1
worker = ray.remote(
num_cpus=0,
num_gpus=num_gpus,
scheduling_strategy=PlacementGroupSchedulingStrategy(
placement_group=placement_group,
placement_group_capture_child_tasks=True),
**ray_remote_kwargs,
)(RayWorkerVllm).remote(self.model_config.trust_remote_code)
self.workers.append(worker)
# Initialize torch distributed process group for the workers.
init_torch_dist_process_group(self.workers, backend="nccl")
model_config = copy.deepcopy(self.model_config)
parallel_config = copy.deepcopy(self.parallel_config)
scheduler_config = copy.deepcopy(self.scheduler_config)
self._run_workers("init_worker",
get_all_outputs=True,
worker_init_fn=lambda: Worker(
model_config,
parallel_config,
scheduler_config,
None,
None,
))
self._run_workers(
"init_model",
get_all_outputs=True,
)
self._run_workers(
"load_model",
get_all_outputs=True,
max_concurrent_workers=self.parallel_config.
max_parallel_loading_workers,
)
def _verify_args(self) -> None:
self.model_config.verify_with_parallel_config(self.parallel_config)
self.cache_config.verify_with_parallel_config(self.parallel_config)
def _init_cache(self) -> None:
"""Profiles the memory usage and initializes the KV cache."""
# Get the maximum number of blocks that can be allocated on GPU and CPU.
num_blocks = self._run_workers(
"profile_num_available_blocks",
get_all_outputs=True,
block_size=self.cache_config.block_size,
gpu_memory_utilization=self.cache_config.gpu_memory_utilization,
cpu_swap_space=self.cache_config.swap_space_bytes,
)
# Since we use a shared centralized controller, we take the minimum
# number of blocks across all workers to make sure all the memory
# operations can be applied to all workers.
num_gpu_blocks = min(b[0] for b in num_blocks)
num_cpu_blocks = min(b[1] for b in num_blocks)
# FIXME(woosuk): Change to debug log.
logger.info(f"# GPU blocks: {num_gpu_blocks}, "
f"# CPU blocks: {num_cpu_blocks}")
if num_gpu_blocks <= 0:
raise ValueError("No available memory for the cache blocks. "
"Try increasing `gpu_memory_utilization` when "
"initializing the engine.")
max_seq_len = self.cache_config.block_size * num_gpu_blocks
if self.model_config.max_model_len > max_seq_len:
raise ValueError(
f"The model's max seq len ({self.model_config.max_model_len}) "
"is larger than the maximum number of tokens that can be "
f"stored in KV cache ({max_seq_len}). Try increasing "
"`gpu_memory_utilization` or decreasing `max_model_len` when "
"initializing the engine.")
self.cache_config.num_gpu_blocks = num_gpu_blocks
self.cache_config.num_cpu_blocks = num_cpu_blocks
# Initialize the cache.
self._run_workers("init_cache_engine", cache_config=self.cache_config)
# Warm up the model. This includes capturing the model into CUDA graph
# if enforce_eager is False.
self._run_workers("warm_up_model")
@classmethod
def from_engine_args(cls, engine_args: EngineArgs) -> "LLMEngine":
"""Creates an LLM engine from the engine arguments."""
# Create the engine configs.
engine_configs = engine_args.create_engine_configs()
parallel_config = engine_configs[2]
# Initialize the cluster.
distributed_init_method, placement_group = initialize_cluster(
parallel_config)
# Create the LLM engine.
engine = cls(*engine_configs,
distributed_init_method,
placement_group,
log_stats=not engine_args.disable_log_stats)
return engine
def add_request(
self,
request_id: str,
prompt: Optional[str],
sampling_params: SamplingParams,
prompt_token_ids: Optional[List[int]] = None,
arrival_time: Optional[float] = None,
) -> None:
"""Add a request to the engine's request pool.
The request is added to the request pool and will be processed by the
scheduler as `engine.step()` is called. The exact scheduling policy is
determined by the scheduler.
Args:
request_id: The unique ID of the request.
prompt: The prompt string. Can be None if prompt_token_ids is
provided.
sampling_params: The sampling parameters for text generation.
prompt_token_ids: The token IDs of the prompt. If None, we
use the tokenizer to convert the prompts to token IDs.
arrival_time: The arrival time of the request. If None, we use
the current monotonic time.
"""
if arrival_time is None:
arrival_time = time.monotonic()
if prompt_token_ids is None:
assert prompt is not None
prompt_token_ids = self.tokenizer.encode(prompt)
# Create the sequences.
block_size = self.cache_config.block_size
seq_id = next(self.seq_counter)
seq = Sequence(seq_id, prompt, prompt_token_ids, block_size)
# Create the sequence group.
seq_group = SequenceGroup(request_id, [seq], sampling_params,
arrival_time)
# Add the sequence group to the scheduler.
self.scheduler.add_seq_group(seq_group)
def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
"""Aborts a request(s) with the given ID.
Args:
request_id: The ID(s) of the request to abort.
"""
self.scheduler.abort_seq_group(request_id)
def get_model_config(self) -> ModelConfig:
"""Gets the model configuration."""
return self.model_config
def get_num_unfinished_requests(self) -> int:
"""Gets the number of unfinished requests."""
return self.scheduler.get_num_unfinished_seq_groups()
def has_unfinished_requests(self) -> bool:
"""Returns True if there are unfinished requests."""
return self.scheduler.has_unfinished_seqs()
def _schedule(
self
) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs,
List[RequestOutput]]:
seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
return seq_group_metadata_list, scheduler_outputs, [
RequestOutput.from_seq_group(seq_group)
for seq_group in scheduler_outputs.ignored_seq_groups
]
def _check_beam_search_early_stopping(
self,
early_stopping: Union[bool, str],
sampling_params: SamplingParams,
best_running_seq: Sequence,
current_worst_seq: Sequence,
) -> bool:
assert sampling_params.use_beam_search
length_penalty = sampling_params.length_penalty
if early_stopping is True:
return True
current_worst_score = (current_worst_seq.get_beam_search_score(
length_penalty=length_penalty,
eos_token_id=self.tokenizer.eos_token_id))
if early_stopping is False:
highest_attainable_score = (best_running_seq.get_beam_search_score(
length_penalty=length_penalty,
eos_token_id=self.tokenizer.eos_token_id))
else:
assert early_stopping == "never"
if length_penalty > 0.0:
# If length_penalty > 0.0, beam search will prefer longer
# sequences. The highest attainable score calculation is
# based on the longest possible sequence length in this case.
max_possible_length = max(
best_running_seq.get_prompt_len() +
sampling_params.max_tokens,
self.scheduler_config.max_model_len)
highest_attainable_score = (
best_running_seq.get_beam_search_score(
length_penalty=length_penalty,
eos_token_id=self.tokenizer.eos_token_id,
seq_len=max_possible_length))
else:
# Otherwise, beam search will prefer shorter sequences. The
# highest attainable score calculation is based on the current
# sequence length.
highest_attainable_score = (
best_running_seq.get_beam_search_score(
length_penalty=length_penalty,
eos_token_id=self.tokenizer.eos_token_id))
return current_worst_score >= highest_attainable_score
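    # Worked illustration of the length-penalty reasoning above. Assuming a
    # Hugging Face-style normalization of
    # cumulative_logprob / seq_len**length_penalty (an assumption about
    # get_beam_search_score, not code from this file): with
    # cumulative_logprob = -10.0 and length_penalty = 1.0, a 10-token sequence
    # scores -10.0 / 10 = -1.0 while a 20-token sequence scores
    # -10.0 / 20 = -0.5. Longer sequences therefore score higher when
    # length_penalty > 0, which is why the "never" early-stopping branch bounds
    # the highest attainable score using the longest possible sequence length.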
def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
outputs: SequenceGroupOutput) -> None:
# Process prompt logprobs
prompt_logprobs = outputs.prompt_logprobs
if prompt_logprobs is not None:
seq_group.prompt_logprobs = prompt_logprobs
# Process samples
samples = outputs.samples
parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
existing_finished_seqs = seq_group.get_finished_seqs()
parent_child_dict = {
parent_seq.seq_id: []
for parent_seq in parent_seqs
}
for sample in samples:
parent_child_dict[sample.parent_seq_id].append(sample)
# List of (child, parent)
child_seqs: List[Tuple[Sequence, Sequence]] = []
# Process the child samples for each parent sequence
for parent in parent_seqs:
child_samples: List[SequenceOutput] = parent_child_dict[
parent.seq_id]
if len(child_samples) == 0:
# This parent sequence has no child samples. Remove
# the parent sequence from the sequence group since it will
# not be used in the future iterations.
parent.status = SequenceStatus.FINISHED_ABORTED
seq_group.remove(parent.seq_id)
self.scheduler.free_seq(parent)
continue
# Fork the parent sequence if there are multiple child samples.
for child_sample in child_samples[:-1]:
new_child_seq_id = next(self.seq_counter)
child = parent.fork(new_child_seq_id)
child.append_token_id(child_sample.output_token,
child_sample.logprobs)
child_seqs.append((child, parent))
# Continue the parent sequence for the last child sample.
# We reuse the parent sequence here to reduce redundant memory
# copies, especially when using non-beam search sampling methods.
last_child_sample = child_samples[-1]
parent.append_token_id(last_child_sample.output_token,
last_child_sample.logprobs)
child_seqs.append((parent, parent))
for seq, _ in child_seqs:
self._decode_sequence(seq, seq_group.sampling_params)
self._check_stop(seq, seq_group.sampling_params)
# Non-beam search case
if not seq_group.sampling_params.use_beam_search:
# For newly created child sequences, add them to the sequence group
# and fork them in block manager if they are not finished.
for seq, parent in child_seqs:
if seq is not parent:
seq_group.add(seq)
if not seq.is_finished():
self.scheduler.fork_seq(parent, seq)
# Free the finished and selected parent sequences' memory in block
# manager. Keep them in the sequence group as candidate output.
# NOTE: we need to fork the new sequences before freeing the
# old sequences.
for seq, parent in child_seqs:
if seq is parent and seq.is_finished():
self.scheduler.free_seq(seq)
return
# Beam search case
# Select the child sequences to keep in the sequence group.
selected_child_seqs = []
unselected_child_seqs = []
beam_width = seq_group.sampling_params.best_of
length_penalty = seq_group.sampling_params.length_penalty
# Select the newly finished sequences with the highest scores
# to replace existing finished sequences.
# Tuple of (seq, parent, is_new)
existing_finished_seqs = [(seq, None, False)
for seq in existing_finished_seqs]
new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs
if seq.is_finished()]
all_finished_seqs = existing_finished_seqs + new_finished_seqs
# Sort the finished sequences by their scores.
all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score(
length_penalty=length_penalty,
eos_token_id=self.tokenizer.eos_token_id),
reverse=True)
for seq, parent, is_new in all_finished_seqs[:beam_width]:
if is_new:
# A newly generated child sequence finishes and has a high
# score, so we will add it into the sequence group.
selected_child_seqs.append((seq, parent))
for seq, parent, is_new in all_finished_seqs[beam_width:]:
if is_new:
# A newly generated child sequence finishes but has a low
# score, so we will not add it into the sequence group.
# Additionally, if this sequence is a continuation of a
# parent sequence, we will need to remove the parent sequence
# from the sequence group.
unselected_child_seqs.append((seq, parent))
else:
# An existing finished sequence has a low score, so we will
# remove it from the sequence group.
seq_group.remove(seq.seq_id)
# select the top beam_width sequences from the running
# sequences for the next iteration to continue the beam
# search.
running_child_seqs = [(seq, parent) for seq, parent in child_seqs
if not seq.is_finished()]
# Sort the running sequences by their scores.
running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score(
length_penalty=length_penalty,
eos_token_id=self.tokenizer.eos_token_id),
reverse=True)
# Check if we can stop the beam search.
if len(running_child_seqs) == 0:
# No running sequences, stop the beam search.
stop_beam_search = True
elif len(all_finished_seqs) < beam_width:
# Not enough finished sequences, continue the beam search.
stop_beam_search = False
else:
# Check the early stopping criteria
best_running_seq = running_child_seqs[0][0]
current_worst_seq = all_finished_seqs[beam_width - 1][0]
stop_beam_search = self._check_beam_search_early_stopping(
seq_group.sampling_params.early_stopping,
seq_group.sampling_params, best_running_seq, current_worst_seq)
if stop_beam_search:
# Stop the beam search and remove all the running sequences from
# the sequence group.
unselected_child_seqs.extend(running_child_seqs)
else:
# Continue the beam search and select the top beam_width sequences
# to continue the beam search.
selected_child_seqs.extend(running_child_seqs[:beam_width])
# The remaining running sequences will not be used in the next
# iteration. Again, if these sequences are continuations of
# parent sequences, we will need to remove the parent sequences
# from the sequence group.
unselected_child_seqs.extend(running_child_seqs[beam_width:])
# For newly created child sequences, add them to the sequence group
# and fork them in block manager if they are not finished.
for seq, parent in selected_child_seqs:
if seq is not parent:
seq_group.add(seq)
if not seq.is_finished():
self.scheduler.fork_seq(parent, seq)
# Free the finished and selected parent sequences' memory in block
# manager. Keep them in the sequence group as candidate output.
for seq, parent in selected_child_seqs:
if seq is parent and seq.is_finished():
self.scheduler.free_seq(seq)
# Remove the unselected parent sequences from the sequence group and
# free their memory in block manager.
for seq, parent in unselected_child_seqs:
if seq is parent:
# Remove the parent sequence if it is not selected for next
# iteration
seq_group.remove(seq.seq_id)
self.scheduler.free_seq(seq)
def _process_model_outputs(
self, output: SamplerOutput,
scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]:
# Update the scheduled sequence groups with the model outputs.
scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
for seq_group, outputs in zip(scheduled_seq_groups, output):
self._process_sequence_group_outputs(seq_group, outputs)
# Free the finished sequence groups.
self.scheduler.free_finished_seq_groups()
# Create the outputs.
request_outputs: List[RequestOutput] = []
for seq_group in (scheduled_seq_groups +
scheduler_outputs.ignored_seq_groups):
request_output = RequestOutput.from_seq_group(seq_group)
request_outputs.append(request_output)
if self.log_stats:
# Log the system stats.
self._log_system_stats(scheduler_outputs.prompt_run,
scheduler_outputs.num_batched_tokens)
return request_outputs
def step(self) -> List[RequestOutput]:
"""Performs one decoding iteration and returns newly generated results.
This function performs one decoding iteration of the engine. It first
schedules the sequences to be executed in the next iteration and the
token blocks to be swapped in/out/copied. Then, it executes the model
and updates the scheduler with the model outputs. Finally, it decodes
the sequences and returns the newly generated results.
"""
seq_group_metadata_list, scheduler_outputs, ignored = self._schedule()
if scheduler_outputs.is_empty():
return ignored
# Execute the model.
output = self._run_workers(
"execute_model",
seq_group_metadata_list=seq_group_metadata_list,
blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
blocks_to_copy=scheduler_outputs.blocks_to_copy,
)
return self._process_model_outputs(output, scheduler_outputs)
def _log_system_stats(
self,
prompt_run: bool,
num_batched_tokens: int,
) -> None:
now = time.monotonic()
# Log the number of batched input tokens.
if prompt_run:
self.num_prompt_tokens.append((now, num_batched_tokens))
else:
self.num_generation_tokens.append((now, num_batched_tokens))
should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC
if not should_log:
return
# Discard the old stats.
self.num_prompt_tokens = [(t, n) for t, n in self.num_prompt_tokens
if now - t < _LOGGING_INTERVAL_SEC]
self.num_generation_tokens = [(t, n)
for t, n in self.num_generation_tokens
if now - t < _LOGGING_INTERVAL_SEC]
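# Average throughput over the retained window: sum the token counts of all
# but the most recent entry and divide by the time elapsed since the oldest
# retained entry.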
if len(self.num_prompt_tokens) > 1:
total_num_tokens = sum(n for _, n in self.num_prompt_tokens[:-1])
window = now - self.num_prompt_tokens[0][0]
avg_prompt_throughput = total_num_tokens / window
else:
avg_prompt_throughput = 0.0
if len(self.num_generation_tokens) > 1:
total_num_tokens = sum(n
for _, n in self.num_generation_tokens[:-1])
window = now - self.num_generation_tokens[0][0]
avg_generation_throughput = total_num_tokens / window
else:
avg_generation_throughput = 0.0
total_num_gpu_blocks = self.cache_config.num_gpu_blocks
num_free_gpu_blocks = (
self.scheduler.block_manager.get_num_free_gpu_blocks())
num_used_gpu_blocks = total_num_gpu_blocks - num_free_gpu_blocks
gpu_cache_usage = num_used_gpu_blocks / total_num_gpu_blocks
total_num_cpu_blocks = self.cache_config.num_cpu_blocks
if total_num_cpu_blocks > 0:
num_free_cpu_blocks = (
self.scheduler.block_manager.get_num_free_cpu_blocks())
num_used_cpu_blocks = total_num_cpu_blocks - num_free_cpu_blocks
cpu_cache_usage = num_used_cpu_blocks / total_num_cpu_blocks
else:
cpu_cache_usage = 0.0
record_metrics(
avg_prompt_throughput=avg_prompt_throughput,
avg_generation_throughput=avg_generation_throughput,
scheduler_running=len(self.scheduler.running),
scheduler_swapped=len(self.scheduler.swapped),
scheduler_waiting=len(self.scheduler.waiting),
gpu_cache_usage=gpu_cache_usage,
cpu_cache_usage=cpu_cache_usage,
)
logger.info("Avg prompt throughput: "
f"{avg_prompt_throughput:.1f} tokens/s, "
"Avg generation throughput: "
f"{avg_generation_throughput:.1f} tokens/s, "
f"Running: {len(self.scheduler.running)} reqs, "
f"Swapped: {len(self.scheduler.swapped)} reqs, "
f"Pending: {len(self.scheduler.waiting)} reqs, "
f"GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, "
f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%")
self.last_logging_time = now
def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None:
"""Decodes the new token for a sequence."""
(new_tokens, new_output_text, prefix_offset,
read_offset) = detokenize_incrementally(
self.tokenizer,
all_input_ids=seq.get_token_ids(),
prev_tokens=seq.tokens,
prefix_offset=seq.prefix_offset,
read_offset=seq.read_offset,
skip_special_tokens=prms.skip_special_tokens,
spaces_between_special_tokens=prms.spaces_between_special_tokens,
)
if seq.tokens is None:
seq.tokens = new_tokens
else:
seq.tokens.extend(new_tokens)
seq.prefix_offset = prefix_offset
seq.read_offset = read_offset
seq.output_text += new_output_text
def _check_stop(self, seq: Sequence,
sampling_params: SamplingParams) -> None:
"""Stop the finished sequences."""
for stop_str in sampling_params.stop:
if seq.output_text.endswith(stop_str):
if not sampling_params.include_stop_str_in_output:
# Truncate the output text so that the stop string is
# not included in the output.
seq.output_text = seq.output_text[:-len(stop_str)]
seq.status = SequenceStatus.FINISHED_STOPPED
return
if seq.get_last_token_id() in sampling_params.stop_token_ids:
seq.status = SequenceStatus.FINISHED_STOPPED
return
# Check if the sequence has reached max_model_len.
if seq.get_len() > self.scheduler_config.max_model_len:
seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
return
# Check if the sequence has reached max_tokens.
if seq.get_output_len() == sampling_params.max_tokens:
seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
return
# Check if the sequence has generated the EOS token.
if ((not sampling_params.ignore_eos)
and seq.get_last_token_id() == self.tokenizer.eos_token_id):
seq.status = SequenceStatus.FINISHED_STOPPED
return
def _run_workers_in_batch(
self,
workers,
method: str,
*args,
**kwargs,
):
all_outputs = []
for worker in workers:
if self.parallel_config.worker_use_ray:
executor = partial(worker.execute_method.remote, method)
else:
executor = getattr(worker, method)
output = executor(*args, **kwargs)
all_outputs.append(output)
if self.parallel_config.worker_use_ray:
all_outputs = ray.get(all_outputs)
return all_outputs
def _run_workers(
self,
method: str,
*args,
get_all_outputs: bool = False,
max_concurrent_workers: Optional[int] = None,
**kwargs,
) -> Any:
"""Runs the given method on all workers."""
all_outputs = []
if max_concurrent_workers:
work_groups = [
self.workers[i:i + max_concurrent_workers]
for i in range(0, len(self.workers), max_concurrent_workers)
]
else:
work_groups = [self.workers]
for workers in work_groups:
all_outputs.extend(
self._run_workers_in_batch(workers, method, *args, **kwargs))
if get_all_outputs:
return all_outputs
# Make sure all workers have the same results.
output = all_outputs[0]
for other_output in all_outputs[1:]:
assert output == other_output
return output
from aioprometheus import Gauge
# The begin-* and end-* markers here are used by the documentation generator
# to extract the metrics definitions.
# begin-metrics-definitions
gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput_toks_per_s",
"Average prefill throughput in tokens/s.")
gauge_avg_generation_throughput = Gauge(
"vllm:avg_generation_throughput_toks_per_s",
"Average generation throughput in tokens/s.")
gauge_scheduler_running = Gauge(
"vllm:num_requests_running",
"Number of requests that is currently running for inference.")
gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped",
"Number requests swapped to CPU.")
gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting",
"Number of requests waiting to be processed.")
gauge_gpu_cache_usage = Gauge(
"vllm:gpu_cache_usage_perc",
"GPU KV-cache usage. 1 means 100 percent usage.")
gauge_cpu_cache_usage = Gauge(
"vllm:cpu_cache_usage_perc",
"CPU KV-cache usage. 1 means 100 percent usage.")
# end-metrics-definitions
labels = {}
def add_global_metrics_labels(**kwargs):
labels.update(kwargs)
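# Illustrative usage: the OpenAI API server calls
# `add_global_metrics_labels(model_name=engine_args.model)` once at startup
# so every gauge below is recorded with that label set.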
def record_metrics(
avg_prompt_throughput: float,
avg_generation_throughput: float,
scheduler_running: int,
scheduler_swapped: int,
scheduler_waiting: int,
gpu_cache_usage: float,
cpu_cache_usage: float,
):
gauge_avg_prompt_throughput.set(labels, avg_prompt_throughput)
gauge_avg_generation_throughput.set(labels, avg_generation_throughput)
gauge_scheduler_running.set(labels, scheduler_running)
gauge_scheduler_swapped.set(labels, scheduler_swapped)
gauge_scheduler_waiting.set(labels, scheduler_waiting)
gauge_gpu_cache_usage.set(labels, gpu_cache_usage)
gauge_cpu_cache_usage.set(labels, cpu_cache_usage)
from typing import Optional, Tuple, TYPE_CHECKING
from vllm.config import ParallelConfig
from vllm.logger import init_logger
from vllm.utils import get_open_port, is_hip
logger = init_logger(__name__)
try:
import ray
from ray.air.util.torch_dist import TorchDistributedWorker
class RayWorkerVllm(TorchDistributedWorker):
"""Ray wrapper for vllm.worker.Worker, allowing Worker to be
lazily initialized after Ray sets CUDA_VISIBLE_DEVICES."""
def __init__(self, init_cached_hf_modules=False) -> None:
if init_cached_hf_modules:
from transformers.dynamic_module_utils import init_hf_modules
init_hf_modules()
self.worker = None
def init_worker(self, worker_init_fn):
self.worker = worker_init_fn()
def __getattr__(self, name):
return getattr(self.worker, name)
def execute_method(self, method, *args, **kwargs):
executor = getattr(self, method)
return executor(*args, **kwargs)
except ImportError as e:
logger.warning(f"Failed to import Ray with {e!r}. "
"For distributed inference, please install Ray with "
"`pip install ray pandas pyarrow`.")
ray = None
TorchDistributedWorker = None
RayWorkerVllm = None
if TYPE_CHECKING:
from ray.util.placement_group import PlacementGroup
def initialize_cluster(
parallel_config: ParallelConfig,
engine_use_ray: bool = False,
ray_address: Optional[str] = None,
) -> Tuple[str, Optional["PlacementGroup"]]:
"""Initialize the distributed cluster probably with Ray.
Args:
parallel_config: The configurations for parallel execution.
engine_use_ray: Whether to use Ray for async engine.
ray_address: The address of the Ray cluster. If None, uses
the default Ray cluster address.
Returns:
A tuple of (`distributed_init_method`, `placement_group`). The
`distributed_init_method` is the address for initializing the
distributed backend. `placement_group` includes the specification
of the resources for each distributed worker.
"""
if parallel_config.worker_use_ray or engine_use_ray:
if ray is None:
raise ImportError(
"Ray is not installed. Please install Ray to use distributed "
"serving.")
# Connect to a ray cluster.
if is_hip():
ray.init(address=ray_address,
ignore_reinit_error=True,
num_gpus=parallel_config.world_size)
else:
ray.init(address=ray_address, ignore_reinit_error=True)
if not parallel_config.worker_use_ray:
# Initialize cluster locally.
port = get_open_port()
# We need to set up the distributed init method to make sure
# the distributed megatron code (e.g., get world size) works correctly.
distributed_init_method = f"tcp://localhost:{port}"
return distributed_init_method, None
current_placement_group = ray.util.get_current_placement_group()
if current_placement_group:
# We are in a placement group
bundles = current_placement_group.bundle_specs
# Verify that we can use the placement group.
gpu_bundles = 0
for bundle in bundles:
bundle_gpus = bundle.get("GPU", 0)
if bundle_gpus > 1:
raise ValueError(
"Placement group bundle cannot have more than 1 GPU.")
if bundle_gpus:
gpu_bundles += 1
if parallel_config.world_size > gpu_bundles:
raise ValueError(
"The number of required GPUs exceeds the total number of "
"available GPUs in the placement group.")
else:
num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0)
if parallel_config.world_size > num_gpus_in_cluster:
raise ValueError(
"The number of required GPUs exceeds the total number of "
"available GPUs in the cluster.")
# Create a new placement group
current_placement_group = ray.util.placement_group([{
"GPU": 1
}] * parallel_config.world_size)
# Wait until PG is ready - this will block until all
# requested resources are available, and will timeout
# if they cannot be provisioned.
ray.get(current_placement_group.ready(), timeout=1800)
return None, current_placement_group
import argparse
import json
from typing import AsyncGenerator
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse
import uvicorn
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid
TIMEOUT_KEEP_ALIVE = 5 # seconds.
TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds.
app = FastAPI()
engine = None
@app.get("/health")
async def health() -> Response:
"""Health check."""
return Response(status_code=200)
@app.post("/generate")
async def generate(request: Request) -> Response:
"""Generate completion for the request.
The request should be a JSON object with the following fields:
- prompt: the prompt to use for the generation.
- stream: whether to stream the results or not.
- other fields: the sampling parameters (See `SamplingParams` for details).
"""
request_dict = await request.json()
prompt = request_dict.pop("prompt")
stream = request_dict.pop("stream", False)
sampling_params = SamplingParams(**request_dict)
request_id = random_uuid()
results_generator = engine.generate(prompt, sampling_params, request_id)
# Streaming case
async def stream_results() -> AsyncGenerator[bytes, None]:
async for request_output in results_generator:
prompt = request_output.prompt
text_outputs = [
prompt + output.text for output in request_output.outputs
]
ret = {"text": text_outputs}
yield (json.dumps(ret) + "\0").encode("utf-8")
if stream:
return StreamingResponse(stream_results())
# Non-streaming case
final_output = None
async for request_output in results_generator:
if await request.is_disconnected():
# Abort the request if the client disconnects.
await engine.abort(request_id)
return Response(status_code=499)
final_output = request_output
assert final_output is not None
prompt = final_output.prompt
text_outputs = [prompt + output.text for output in final_output.outputs]
ret = {"text": text_outputs}
return JSONResponse(ret)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default=None)
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--ssl-keyfile", type=str, default=None)
parser.add_argument("--ssl-certfile", type=str, default=None)
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngine.from_engine_args(engine_args)
uvicorn.run(app,
host=args.host,
port=args.port,
log_level="debug",
timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
ssl_keyfile=args.ssl_keyfile,
ssl_certfile=args.ssl_certfile)
from typing import List, Optional, Union
from tqdm import tqdm
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.utils import Counter
class LLM:
"""An LLM for generating texts from given prompts and sampling parameters.
This class includes a tokenizer, a language model (possibly distributed
across multiple GPUs), and GPU memory space allocated for intermediate
states (aka KV cache). Given a batch of prompts and sampling parameters,
this class generates texts from the model, using an intelligent batching
mechanism and efficient memory management.
NOTE: This class is intended to be used for offline inference. For online
serving, use the `AsyncLLMEngine` class instead.
NOTE: For the comprehensive list of arguments, see `EngineArgs`.
Args:
model: The name or path of a HuggingFace Transformers model.
tokenizer: The name or path of a HuggingFace Transformers tokenizer.
tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
if available, and "slow" will always use the slow tokenizer.
trust_remote_code: Trust remote code (e.g., from HuggingFace) when
downloading the model and tokenizer.
tensor_parallel_size: The number of GPUs to use for distributed
execution with tensor parallelism.
dtype: The data type for the model weights and activations. Currently,
we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
the `torch_dtype` attribute specified in the model config file.
However, if the `torch_dtype` in the config is `float32`, we will
use `float16` instead.
quantization: The method used to quantize the model weights. Currently,
we support "awq", "gptq" and "squeezellm". If None, we first check
the `quantization_config` attribute in the model config file. If
that is None, we assume the model weights are not quantized and use
`dtype` to determine the data type of the weights.
revision: The specific model version to use. It can be a branch name,
a tag name, or a commit id.
tokenizer_revision: The specific tokenizer version to use. It can be a
branch name, a tag name, or a commit id.
seed: The seed to initialize the random number generator for sampling.
gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
reserve for the model weights, activations, and KV cache. Higher
values will increase the KV cache size and thus improve the model's
throughput. However, if the value is too high, it may cause out-of-
memory (OOM) errors.
swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
This can be used for temporarily storing the states of the requests
when their `best_of` sampling parameters are larger than 1. If all
requests will have `best_of=1`, you can safely set this to 0.
Otherwise, too small values may cause out-of-memory (OOM) errors.
enforce_eager: Whether to enforce eager execution. If True, we will
disable CUDA graph and always execute the model in eager mode.
If False, we will use CUDA graph and eager execution in hybrid.
max_context_len_to_capture: Maximum context len covered by CUDA graphs.
When a sequence has context length larger than this, we fall back
to eager mode.
"""
def __init__(
self,
model: str,
tokenizer: Optional[str] = None,
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
tensor_parallel_size: int = 1,
dtype: str = "auto",
quantization: Optional[str] = None,
revision: Optional[str] = None,
tokenizer_revision: Optional[str] = None,
seed: int = 0,
gpu_memory_utilization: float = 0.9,
swap_space: int = 4,
enforce_eager: bool = False,
max_context_len_to_capture: int = 8192,
**kwargs,
) -> None:
if "disable_log_stats" not in kwargs:
kwargs["disable_log_stats"] = True
engine_args = EngineArgs(
model=model,
tokenizer=tokenizer,
tokenizer_mode=tokenizer_mode,
trust_remote_code=trust_remote_code,
tensor_parallel_size=tensor_parallel_size,
dtype=dtype,
quantization=quantization,
revision=revision,
tokenizer_revision=tokenizer_revision,
seed=seed,
gpu_memory_utilization=gpu_memory_utilization,
swap_space=swap_space,
enforce_eager=enforce_eager,
max_context_len_to_capture=max_context_len_to_capture,
**kwargs,
)
self.llm_engine = LLMEngine.from_engine_args(engine_args)
self.request_counter = Counter()
def get_tokenizer(
self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
return self.llm_engine.tokenizer
def set_tokenizer(
self,
tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
) -> None:
self.llm_engine.tokenizer = tokenizer
def generate(
self,
prompts: Optional[Union[str, List[str]]] = None,
sampling_params: Optional[SamplingParams] = None,
prompt_token_ids: Optional[List[List[int]]] = None,
use_tqdm: bool = True,
) -> List[RequestOutput]:
"""Generates the completions for the input prompts.
NOTE: This class automatically batches the given prompts, considering
the memory constraint. For the best performance, put all of your prompts
into a single list and pass it to this method.
Args:
prompts: A list of prompts to generate completions for.
sampling_params: The sampling parameters for text generation. If
None, we use the default sampling parameters.
prompt_token_ids: A list of token IDs for the prompts. If None, we
use the tokenizer to convert the prompts to token IDs.
use_tqdm: Whether to use tqdm to display the progress bar.
Returns:
A list of `RequestOutput` objects containing the generated
completions in the same order as the input prompts.
"""
if prompts is None and prompt_token_ids is None:
raise ValueError("Either prompts or prompt_token_ids must be "
"provided.")
if isinstance(prompts, str):
# Convert a single prompt to a list.
prompts = [prompts]
if (prompts is not None and prompt_token_ids is not None
and len(prompts) != len(prompt_token_ids)):
raise ValueError("The lengths of prompts and prompt_token_ids "
"must be the same.")
if sampling_params is None:
# Use default sampling params.
sampling_params = SamplingParams()
# Add requests to the engine.
num_requests = len(prompts) if prompts is not None else len(
prompt_token_ids)
for i in range(num_requests):
prompt = prompts[i] if prompts is not None else None
token_ids = None if prompt_token_ids is None else prompt_token_ids[
i]
self._add_request(prompt, sampling_params, token_ids)
return self._run_engine(use_tqdm)
def _add_request(
self,
prompt: Optional[str],
sampling_params: SamplingParams,
prompt_token_ids: Optional[List[int]],
) -> None:
request_id = str(next(self.request_counter))
self.llm_engine.add_request(request_id, prompt, sampling_params,
prompt_token_ids)
def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]:
# Initialize tqdm.
if use_tqdm:
num_requests = self.llm_engine.get_num_unfinished_requests()
pbar = tqdm(total=num_requests, desc="Processed prompts")
# Run the engine.
outputs: List[RequestOutput] = []
while self.llm_engine.has_unfinished_requests():
step_outputs = self.llm_engine.step()
for output in step_outputs:
if output.finished:
outputs.append(output)
if use_tqdm:
pbar.update(1)
if use_tqdm:
pbar.close()
# Sort the outputs by request ID.
# This is necessary because some requests may finish earlier than
# requests that were submitted before them.
outputs = sorted(outputs, key=lambda x: int(x.request_id))
return outputs
# Adapted from
# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/serve/openai_api_server.py
import argparse
import asyncio
import codecs
import json
import time
from http import HTTPStatus
from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union
from aioprometheus import MetricsMiddleware
from aioprometheus.asgi.starlette import metrics
import fastapi
import uvicorn
from fastapi import Request
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse, Response
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.metrics import add_global_metrics_labels
from vllm.entrypoints.openai.protocol import (
CompletionRequest, CompletionResponse, CompletionResponseChoice,
CompletionResponseStreamChoice, CompletionStreamResponse,
ChatCompletionRequest, ChatCompletionResponse,
ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse,
LogProbs, ModelCard, ModelList, ModelPermission, UsageInfo)
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.utils import random_uuid
TIMEOUT_KEEP_ALIVE = 5 # seconds
logger = init_logger(__name__)
served_model = None
app = fastapi.FastAPI()
engine = None
response_role = None
def parse_args():
parser = argparse.ArgumentParser(
description="vLLM OpenAI-Compatible RESTful API server.")
parser.add_argument("--host", type=str, default=None, help="host name")
parser.add_argument("--port", type=int, default=8000, help="port number")
parser.add_argument("--allow-credentials",
action="store_true",
help="allow credentials")
parser.add_argument("--allowed-origins",
type=json.loads,
default=["*"],
help="allowed origins")
parser.add_argument("--allowed-methods",
type=json.loads,
default=["*"],
help="allowed methods")
parser.add_argument("--allowed-headers",
type=json.loads,
default=["*"],
help="allowed headers")
parser.add_argument("--served-model-name",
type=str,
default=None,
help="The model name used in the API. If not "
"specified, the model name will be the same as "
"the huggingface name.")
parser.add_argument("--chat-template",
type=str,
default=None,
help="The file path to the chat template, "
"or the template in single-line form "
"for the specified model")
parser.add_argument("--response-role",
type=str,
default="assistant",
help="The role name to return if "
"`request.add_generation_prompt=true`.")
parser.add_argument("--ssl-keyfile",
type=str,
default=None,
help="The file path to the SSL key file")
parser.add_argument("--ssl-certfile",
type=str,
default=None,
help="The file path to the SSL cert file")
parser = AsyncEngineArgs.add_cli_args(parser)
return parser.parse_args()
app.add_middleware(MetricsMiddleware) # Trace HTTP server metrics
app.add_route("/metrics", metrics) # Exposes HTTP metrics
def create_error_response(status_code: HTTPStatus,
message: str) -> JSONResponse:
return JSONResponse(ErrorResponse(message=message,
type="invalid_request_error").dict(),
status_code=status_code.value)
def load_chat_template(args, tokenizer):
if args.chat_template is not None:
try:
with open(args.chat_template, "r") as f:
chat_template = f.read()
except OSError:
# If opening the file fails, treat args.chat_template as the template
# string itself and decode it so that escape sequences are interpreted
# correctly.
chat_template = codecs.decode(args.chat_template, "unicode_escape")
tokenizer.chat_template = chat_template
logger.info(
f"Using supplied chat template:\n{tokenizer.chat_template}")
elif tokenizer.chat_template is not None:
logger.info(f"Using default chat template:\n{tokenizer.chat_template}")
else:
logger.warning("No chat template provided. Chat API will not work.")
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(_, exc):
return create_error_response(HTTPStatus.BAD_REQUEST, str(exc))
async def check_model(request) -> Optional[JSONResponse]:
if request.model == served_model:
return
ret = create_error_response(
HTTPStatus.NOT_FOUND,
f"The model `{request.model}` does not exist.",
)
return ret
async def check_length(
request: Union[ChatCompletionRequest, CompletionRequest],
prompt: Optional[str] = None,
prompt_ids: Optional[List[int]] = None
) -> Tuple[List[int], Optional[JSONResponse]]:
assert (not (prompt is None and prompt_ids is None)
and not (prompt is not None and prompt_ids is not None)
), "Either prompt or prompt_ids should be provided."
input_ids = prompt_ids if prompt_ids is not None else tokenizer(
prompt).input_ids
token_num = len(input_ids)
if request.max_tokens is None:
request.max_tokens = max_model_len - token_num
if token_num + request.max_tokens > max_model_len:
return input_ids, create_error_response(
HTTPStatus.BAD_REQUEST,
f"This model's maximum context length is {max_model_len} tokens. "
f"However, you requested {request.max_tokens + token_num} tokens "
f"({token_num} in the messages, "
f"{request.max_tokens} in the completion). "
f"Please reduce the length of the messages or completion.",
)
else:
return input_ids, None
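# Worked example: with max_model_len = 2048 and a 2000-token prompt, the
# request may ask for at most 48 completion tokens; anything larger returns
# the error above.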
@app.get("/health")
async def health() -> Response:
"""Health check."""
return Response(status_code=200)
@app.get("/v1/models")
async def show_available_models():
"""Show available models. Right now we only have one model."""
model_cards = [
ModelCard(id=served_model,
root=served_model,
permission=[ModelPermission()])
]
return ModelList(data=model_cards)
def create_logprobs(
token_ids: List[int],
top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None,
num_output_top_logprobs: Optional[int] = None,
initial_text_offset: int = 0,
) -> LogProbs:
"""Create OpenAI-style logprobs."""
logprobs = LogProbs()
last_token_len = 0
if num_output_top_logprobs:
logprobs.top_logprobs = []
for i, token_id in enumerate(token_ids):
step_top_logprobs = top_logprobs[i]
if step_top_logprobs is not None:
token_logprob = step_top_logprobs[token_id]
else:
token_logprob = None
token = tokenizer.convert_ids_to_tokens(token_id)
logprobs.tokens.append(token)
logprobs.token_logprobs.append(token_logprob)
if len(logprobs.text_offset) == 0:
logprobs.text_offset.append(initial_text_offset)
else:
logprobs.text_offset.append(logprobs.text_offset[-1] +
last_token_len)
last_token_len = len(token)
if num_output_top_logprobs:
logprobs.top_logprobs.append({
tokenizer.convert_ids_to_tokens(i): p
for i, p in step_top_logprobs.items()
} if step_top_logprobs else None)
return logprobs
@app.post("/v1/chat/completions")
async def create_chat_completion(request: ChatCompletionRequest,
raw_request: Request):
"""Completion API similar to OpenAI's API.
See https://platform.openai.com/docs/api-reference/chat/create
for the API specification. This API mimics the OpenAI ChatCompletion API.
NOTE: Currently we do not support the following features:
- function_call (Users should implement this by themselves)
- logit_bias (to be supported by vLLM engine)
"""
error_check_ret = await check_model(request)
if error_check_ret is not None:
return error_check_ret
if request.logit_bias is not None and len(request.logit_bias) > 0:
# TODO: support logit_bias in vLLM engine.
return create_error_response(HTTPStatus.BAD_REQUEST,
"logit_bias is not currently supported")
try:
prompt = tokenizer.apply_chat_template(
conversation=request.messages,
tokenize=False,
add_generation_prompt=request.add_generation_prompt)
except Exception as e:
logger.error(f"Error in applying chat template from request: {str(e)}")
return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
token_ids, error_check_ret = await check_length(request, prompt=prompt)
if error_check_ret is not None:
return error_check_ret
model_name = request.model
request_id = f"cmpl-{random_uuid()}"
created_time = int(time.time())  # Unix timestamp expected by the OpenAI API
chunk_object_type = "chat.completion.chunk"
try:
spaces_between_special_tokens = request.spaces_between_special_tokens
sampling_params = SamplingParams(
n=request.n,
presence_penalty=request.presence_penalty,
frequency_penalty=request.frequency_penalty,
repetition_penalty=request.repetition_penalty,
temperature=request.temperature,
top_p=request.top_p,
min_p=request.min_p,
stop=request.stop,
stop_token_ids=request.stop_token_ids,
max_tokens=request.max_tokens,
best_of=request.best_of,
top_k=request.top_k,
ignore_eos=request.ignore_eos,
use_beam_search=request.use_beam_search,
skip_special_tokens=request.skip_special_tokens,
spaces_between_special_tokens=spaces_between_special_tokens,
)
except ValueError as e:
return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
result_generator = engine.generate(prompt, sampling_params, request_id,
token_ids)
def get_role() -> str:
if request.add_generation_prompt:
return response_role
else:
return request.messages[-1]["role"]
async def completion_stream_generator() -> AsyncGenerator[str, None]:
# Send first response for each request.n (index) with the role
role = get_role()
for i in range(request.n):
choice_data = ChatCompletionResponseStreamChoice(
index=i, delta=DeltaMessage(role=role), finish_reason=None)
chunk = ChatCompletionStreamResponse(id=request_id,
object=chunk_object_type,
created=created_time,
choices=[choice_data],
model=model_name)
data = chunk.json(exclude_unset=True, ensure_ascii=False)
yield f"data: {data}\n\n"
# Send response to echo the input portion of the last message
if request.echo:
last_msg_content = ""
if request.messages and isinstance(
request.messages, list) and request.messages[-1].get(
"content") and request.messages[-1].get(
"role") == role:
last_msg_content = request.messages[-1]["content"]
if last_msg_content:
for i in range(request.n):
choice_data = ChatCompletionResponseStreamChoice(
index=i,
delta=DeltaMessage(content=last_msg_content),
finish_reason=None)
chunk = ChatCompletionStreamResponse(
id=request_id,
object=chunk_object_type,
created=created_time,
choices=[choice_data],
model=model_name)
data = chunk.json(exclude_unset=True, ensure_ascii=False)
yield f"data: {data}\n\n"
# Send response for each token for each request.n (index)
previous_texts = [""] * request.n
previous_num_tokens = [0] * request.n
finish_reason_sent = [False] * request.n
async for res in result_generator:
res: RequestOutput
for output in res.outputs:
i = output.index
if finish_reason_sent[i]:
continue
if output.finish_reason is None:
# Send token-by-token response for each request.n
delta_text = output.text[len(previous_texts[i]):]
previous_texts[i] = output.text
previous_num_tokens[i] = len(output.token_ids)
choice_data = ChatCompletionResponseStreamChoice(
index=i,
delta=DeltaMessage(content=delta_text),
finish_reason=None)
chunk = ChatCompletionStreamResponse(
id=request_id,
object=chunk_object_type,
created=created_time,
choices=[choice_data],
model=model_name)
data = chunk.json(exclude_unset=True, ensure_ascii=False)
yield f"data: {data}\n\n"
else:
# Send the finish response for each request.n only once
prompt_tokens = len(res.prompt_token_ids)
final_usage = UsageInfo(
prompt_tokens=prompt_tokens,
completion_tokens=previous_num_tokens[i],
total_tokens=prompt_tokens + previous_num_tokens[i],
)
choice_data = ChatCompletionResponseStreamChoice(
index=i, delta=[], finish_reason=output.finish_reason)
chunk = ChatCompletionStreamResponse(
id=request_id,
object=chunk_object_type,
created=created_time,
choices=[choice_data],
model=model_name)
if final_usage is not None:
chunk.usage = final_usage
data = chunk.json(exclude_unset=True,
exclude_none=True,
ensure_ascii=False)
yield f"data: {data}\n\n"
finish_reason_sent[i] = True
# Send the final done message after all request.n choices are finished
yield "data: [DONE]\n\n"
async def completion_full_generator():
final_res: RequestOutput = None
async for res in result_generator:
if await raw_request.is_disconnected():
# Abort the request if the client disconnects.
await engine.abort(request_id)
return create_error_response(HTTPStatus.BAD_REQUEST,
"Client disconnected")
final_res = res
assert final_res is not None
choices = []
role = get_role()
for output in final_res.outputs:
choice_data = ChatCompletionResponseChoice(
index=output.index,
message=ChatMessage(role=role, content=output.text),
finish_reason=output.finish_reason,
)
choices.append(choice_data)
if request.echo:
last_msg_content = ""
if request.messages and isinstance(
request.messages, list) and request.messages[-1].get(
"content") and request.messages[-1].get(
"role") == role:
last_msg_content = request.messages[-1]["content"]
for choice in choices:
full_message = last_msg_content + choice.message.content
choice.message.content = full_message
num_prompt_tokens = len(final_res.prompt_token_ids)
num_generated_tokens = sum(
len(output.token_ids) for output in final_res.outputs)
usage = UsageInfo(
prompt_tokens=num_prompt_tokens,
completion_tokens=num_generated_tokens,
total_tokens=num_prompt_tokens + num_generated_tokens,
)
response = ChatCompletionResponse(
id=request_id,
created=created_time,
model=model_name,
choices=choices,
usage=usage,
)
return response
# Streaming response
if request.stream:
return StreamingResponse(completion_stream_generator(),
media_type="text/event-stream")
else:
return await completion_full_generator()
@app.post("/v1/completions")
async def create_completion(request: CompletionRequest, raw_request: Request):
"""Completion API similar to OpenAI's API.
See https://platform.openai.com/docs/api-reference/completions/create
for the API specification. This API mimics the OpenAI Completion API.
NOTE: Currently we do not support the following features:
- suffix (the language models we currently support do not support
suffix)
- logit_bias (to be supported by vLLM engine)
"""
error_check_ret = await check_model(request)
if error_check_ret is not None:
return error_check_ret
# OpenAI API supports echoing the prompt when max_tokens is 0.
echo_without_generation = request.echo and request.max_tokens == 0
if request.suffix is not None:
# The language models we currently support do not support suffix.
return create_error_response(HTTPStatus.BAD_REQUEST,
"suffix is not currently supported")
if request.logit_bias is not None and len(request.logit_bias) > 0:
# TODO: support logit_bias in vLLM engine.
return create_error_response(HTTPStatus.BAD_REQUEST,
"logit_bias is not currently supported")
model_name = request.model
request_id = f"cmpl-{random_uuid()}"
use_token_ids = False
if isinstance(request.prompt, list):
if len(request.prompt) == 0:
return create_error_response(HTTPStatus.BAD_REQUEST,
"please provide at least one prompt")
first_element = request.prompt[0]
if isinstance(first_element, int):
use_token_ids = True
prompt = request.prompt
elif isinstance(first_element, (str, list)):
# TODO: handle the multiple-prompt case for list[list[int]]
if len(request.prompt) > 1:
return create_error_response(
HTTPStatus.BAD_REQUEST,
"multiple prompts in a batch is not currently supported")
use_token_ids = not isinstance(first_element, str)
prompt = request.prompt[0]
else:
prompt = request.prompt
if use_token_ids:
_, error_check_ret = await check_length(request, prompt_ids=prompt)
else:
token_ids, error_check_ret = await check_length(request, prompt=prompt)
if error_check_ret is not None:
return error_check_ret
created_time = int(time.time())  # Unix timestamp expected by the OpenAI API
try:
spaces_between_special_tokens = request.spaces_between_special_tokens
sampling_params = SamplingParams(
n=request.n,
best_of=request.best_of,
presence_penalty=request.presence_penalty,
frequency_penalty=request.frequency_penalty,
repetition_penalty=request.repetition_penalty,
temperature=request.temperature,
top_p=request.top_p,
top_k=request.top_k,
min_p=request.min_p,
stop=request.stop,
stop_token_ids=request.stop_token_ids,
ignore_eos=request.ignore_eos,
max_tokens=request.max_tokens
if not echo_without_generation else 1,
logprobs=request.logprobs,
use_beam_search=request.use_beam_search,
prompt_logprobs=request.logprobs if request.echo else None,
skip_special_tokens=request.skip_special_tokens,
spaces_between_special_tokens=spaces_between_special_tokens,
)
except ValueError as e:
return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
if use_token_ids:
result_generator = engine.generate(None,
sampling_params,
request_id,
prompt_token_ids=prompt)
else:
result_generator = engine.generate(prompt, sampling_params, request_id,
token_ids)
# Similar to the OpenAI API, when n != best_of, we do not stream the
# results. In addition, we do not stream the results when beam search is used.
stream = (request.stream
and (request.best_of is None or request.n == request.best_of)
and not request.use_beam_search)
def create_stream_response_json(
index: int,
text: str,
logprobs: Optional[LogProbs] = None,
finish_reason: Optional[str] = None,
usage: Optional[UsageInfo] = None,
) -> str:
choice_data = CompletionResponseStreamChoice(
index=index,
text=text,
logprobs=logprobs,
finish_reason=finish_reason,
)
response = CompletionStreamResponse(
id=request_id,
created=created_time,
model=model_name,
choices=[choice_data],
)
if usage is not None:
response.usage = usage
response_json = response.json(exclude_unset=True, ensure_ascii=False)
return response_json
async def completion_stream_generator() -> AsyncGenerator[str, None]:
previous_texts = [""] * request.n
previous_num_tokens = [0] * request.n
has_echoed = [False] * request.n
async for res in result_generator:
res: RequestOutput
for output in res.outputs:
i = output.index
delta_text = output.text[len(previous_texts[i]):]
token_ids = output.token_ids[previous_num_tokens[i]:]
if request.logprobs is not None:
top_logprobs = output.logprobs[previous_num_tokens[i]:]
else:
top_logprobs = None
offsets = len(previous_texts[i])
if request.echo and not has_echoed[i]:
if not echo_without_generation:
delta_text = res.prompt + delta_text
token_ids = res.prompt_token_ids + token_ids
if top_logprobs:
top_logprobs = res.prompt_logprobs + top_logprobs
else:  # only return the prompt
delta_text = res.prompt
token_ids = res.prompt_token_ids
if top_logprobs:
top_logprobs = res.prompt_logprobs
has_echoed[i] = True
if request.logprobs is not None:
logprobs = create_logprobs(
token_ids=token_ids,
top_logprobs=top_logprobs,
num_output_top_logprobs=request.logprobs,
initial_text_offset=offsets,
)
else:
logprobs = None
previous_texts[i] = output.text
previous_num_tokens[i] = len(output.token_ids)
finish_reason = output.finish_reason
response_json = create_stream_response_json(
index=i,
text=delta_text,
logprobs=logprobs,
finish_reason=finish_reason,
)
yield f"data: {response_json}\n\n"
if output.finish_reason is not None:
logprobs = (LogProbs()
if request.logprobs is not None else None)
prompt_tokens = len(res.prompt_token_ids)
completion_tokens = len(output.token_ids)
final_usage = UsageInfo(
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=prompt_tokens + completion_tokens,
)
response_json = create_stream_response_json(
index=i,
text="",
logprobs=logprobs,
finish_reason=output.finish_reason,
usage=final_usage,
)
yield f"data: {response_json}\n\n"
yield "data: [DONE]\n\n"
# Streaming response
if stream:
return StreamingResponse(completion_stream_generator(),
media_type="text/event-stream")
# Non-streaming response
final_res: RequestOutput = None
async for res in result_generator:
if await raw_request.is_disconnected():
# Abort the request if the client disconnects.
await engine.abort(request_id)
return create_error_response(HTTPStatus.BAD_REQUEST,
"Client disconnected")
final_res = res
assert final_res is not None
choices = []
prompt_token_ids = final_res.prompt_token_ids
prompt_logprobs = final_res.prompt_logprobs
prompt_text = final_res.prompt
for output in final_res.outputs:
if request.logprobs is not None:
if not echo_without_generation:
token_ids = output.token_ids
top_logprobs = output.logprobs
if request.echo:
token_ids = prompt_token_ids + token_ids
top_logprobs = prompt_logprobs + top_logprobs
else:
token_ids = prompt_token_ids
top_logprobs = prompt_logprobs
logprobs = create_logprobs(
token_ids=token_ids,
top_logprobs=top_logprobs,
num_output_top_logprobs=request.logprobs,
)
else:
logprobs = None
if not echo_without_generation:
output_text = output.text
if request.echo:
output_text = prompt_text + output_text
else:
output_text = prompt_text
choice_data = CompletionResponseChoice(
index=output.index,
text=output_text,
logprobs=logprobs,
finish_reason=output.finish_reason,
)
choices.append(choice_data)
num_prompt_tokens = len(final_res.prompt_token_ids)
num_generated_tokens = sum(
len(output.token_ids) for output in final_res.outputs)
usage = UsageInfo(
prompt_tokens=num_prompt_tokens,
completion_tokens=num_generated_tokens,
total_tokens=num_prompt_tokens + num_generated_tokens,
)
response = CompletionResponse(
id=request_id,
created=created_time,
model=model_name,
choices=choices,
usage=usage,
)
if request.stream:
# When the user requests streaming but we don't stream, we still need to
# return a streaming response with a single event.
response_json = response.json(ensure_ascii=False)
async def fake_stream_generator() -> AsyncGenerator[str, None]:
yield f"data: {response_json}\n\n"
yield "data: [DONE]\n\n"
return StreamingResponse(fake_stream_generator(),
media_type="text/event-stream")
return response
if __name__ == "__main__":
args = parse_args()
app.add_middleware(
CORSMiddleware,
allow_origins=args.allowed_origins,
allow_credentials=args.allow_credentials,
allow_methods=args.allowed_methods,
allow_headers=args.allowed_headers,
)
logger.info(f"args: {args}")
if args.served_model_name is not None:
served_model = args.served_model_name
else:
served_model = args.model
response_role = args.response_role
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngine.from_engine_args(engine_args)
engine_model_config = asyncio.run(engine.get_model_config())
max_model_len = engine_model_config.max_model_len
# A separate tokenizer to map token IDs to strings.
tokenizer = get_tokenizer(
engine_model_config.tokenizer,
tokenizer_mode=engine_model_config.tokenizer_mode,
trust_remote_code=engine_model_config.trust_remote_code)
load_chat_template(args, tokenizer)
# Register labels for metrics
add_global_metrics_labels(model_name=engine_args.model)
uvicorn.run(app,
host=args.host,
port=args.port,
log_level="info",
timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
ssl_keyfile=args.ssl_keyfile,
ssl_certfile=args.ssl_certfile)
# Adapted from
# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
import time
from typing import Dict, List, Literal, Optional, Union
from pydantic import BaseModel, Field
from vllm.utils import random_uuid
class ErrorResponse(BaseModel):
object: str = "error"
message: str
type: str
param: Optional[str] = None
code: Optional[str] = None
class ModelPermission(BaseModel):
id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
object: str = "model_permission"
created: int = Field(default_factory=lambda: int(time.time()))
allow_create_engine: bool = False
allow_sampling: bool = True
allow_logprobs: bool = True
allow_search_indices: bool = False
allow_view: bool = True
allow_fine_tuning: bool = False
organization: str = "*"
group: Optional[str] = None
is_blocking: bool = False
class ModelCard(BaseModel):
id: str
object: str = "model"
created: int = Field(default_factory=lambda: int(time.time()))
owned_by: str = "vllm"
root: Optional[str] = None
parent: Optional[str] = None
permission: List[ModelPermission] = Field(default_factory=list)
class ModelList(BaseModel):
object: str = "list"
data: List[ModelCard] = Field(default_factory=list)
class UsageInfo(BaseModel):
prompt_tokens: int = 0
total_tokens: int = 0
completion_tokens: Optional[int] = 0
class ChatCompletionRequest(BaseModel):
model: str
messages: Union[str, List[Dict[str, str]]]
temperature: Optional[float] = 0.7
top_p: Optional[float] = 1.0
n: Optional[int] = 1
max_tokens: Optional[int] = None
stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
stream: Optional[bool] = False
presence_penalty: Optional[float] = 0.0
frequency_penalty: Optional[float] = 0.0
logit_bias: Optional[Dict[str, float]] = None
user: Optional[str] = None
# Additional parameters supported by vLLM
best_of: Optional[int] = None
top_k: Optional[int] = -1
ignore_eos: Optional[bool] = False
use_beam_search: Optional[bool] = False
stop_token_ids: Optional[List[int]] = Field(default_factory=list)
skip_special_tokens: Optional[bool] = True
spaces_between_special_tokens: Optional[bool] = True
add_generation_prompt: Optional[bool] = True
echo: Optional[bool] = False
repetition_penalty: Optional[float] = 1.0
min_p: Optional[float] = 0.0
class CompletionRequest(BaseModel):
model: str
# a string, array of strings, array of tokens, or array of token arrays
prompt: Union[List[int], List[List[int]], str, List[str]]
suffix: Optional[str] = None
max_tokens: Optional[int] = 16
temperature: Optional[float] = 1.0
top_p: Optional[float] = 1.0
n: Optional[int] = 1
stream: Optional[bool] = False
logprobs: Optional[int] = None
echo: Optional[bool] = False
stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
presence_penalty: Optional[float] = 0.0
frequency_penalty: Optional[float] = 0.0
best_of: Optional[int] = None
logit_bias: Optional[Dict[str, float]] = None
user: Optional[str] = None
# Additional parameters supported by vLLM
top_k: Optional[int] = -1
ignore_eos: Optional[bool] = False
use_beam_search: Optional[bool] = False
stop_token_ids: Optional[List[int]] = Field(default_factory=list)
skip_special_tokens: Optional[bool] = True
spaces_between_special_tokens: Optional[bool] = True
repetition_penalty: Optional[float] = 1.0
min_p: Optional[float] = 0.0
class LogProbs(BaseModel):
text_offset: List[int] = Field(default_factory=list)
token_logprobs: List[Optional[float]] = Field(default_factory=list)
tokens: List[str] = Field(default_factory=list)
top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None
class CompletionResponseChoice(BaseModel):
index: int
text: str
logprobs: Optional[LogProbs] = None
finish_reason: Optional[Literal["stop", "length"]] = None
class CompletionResponse(BaseModel):
id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
object: str = "text_completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[CompletionResponseChoice]
usage: UsageInfo
class CompletionResponseStreamChoice(BaseModel):
index: int
text: str
logprobs: Optional[LogProbs] = None
finish_reason: Optional[Literal["stop", "length"]] = None
class CompletionStreamResponse(BaseModel):
id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
object: str = "text_completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[CompletionResponseStreamChoice]
usage: Optional[UsageInfo]
class ChatMessage(BaseModel):
role: str
content: str
class ChatCompletionResponseChoice(BaseModel):
index: int
message: ChatMessage
finish_reason: Optional[Literal["stop", "length"]] = None
class ChatCompletionResponse(BaseModel):
id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
object: str = "chat.completion"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[ChatCompletionResponseChoice]
usage: UsageInfo
class DeltaMessage(BaseModel):
role: Optional[str] = None
content: Optional[str] = None
class ChatCompletionResponseStreamChoice(BaseModel):
index: int
delta: DeltaMessage
finish_reason: Optional[Literal["stop", "length"]] = None
class ChatCompletionStreamResponse(BaseModel):
id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
object: str = "chat.completion.chunk"
created: int = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[ChatCompletionResponseStreamChoice]
usage: Optional[UsageInfo] = Field(
default=None, description="data about request and response")
# Adapted from
# https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py
"""Logging configuration for vLLM."""
import logging
import sys
_FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
_DATE_FORMAT = "%m-%d %H:%M:%S"
class NewLineFormatter(logging.Formatter):
"""Adds logging prefix to newlines to align multi-line messages."""
def __init__(self, fmt, datefmt=None):
logging.Formatter.__init__(self, fmt, datefmt)
def format(self, record):
msg = logging.Formatter.format(self, record)
if record.message != "":
parts = msg.split(record.message)
msg = msg.replace("\n", "\r\n" + parts[0])
return msg
_root_logger = logging.getLogger("vllm")
_default_handler = None
def _setup_logger():
_root_logger.setLevel(logging.DEBUG)
global _default_handler
if _default_handler is None:
_default_handler = logging.StreamHandler(sys.stdout)
_default_handler.flush = sys.stdout.flush # type: ignore
_default_handler.setLevel(logging.INFO)
_root_logger.addHandler(_default_handler)
fmt = NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT)
_default_handler.setFormatter(fmt)
# Setting this will avoid the message
# being propagated to the parent logger.
_root_logger.propagate = False
# The logger is initialized when the module is imported.
# This is thread-safe because Python executes a module's top-level code
# only once per process, guarded by the import lock.
_setup_logger()
def init_logger(name: str):
# Use the same settings as above for root logger
logger = logging.getLogger(name)
logger.setLevel(logging.DEBUG)
logger.addHandler(_default_handler)
logger.propagate = False
return logger
from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_random_seed
__all__ = [
"InputMetadata",
"get_model",
"SamplingMetadata",
"set_random_seed",
]
from typing import List, Optional
import torch
class InputMetadata:
"""Metadata for input sequences. Used in PagedAttention.
Args:
prompt_lens: Lengths of prompts.
slot_mapping: The KV cache slot to write each token's new KV to.
max_context_len: The maximum context length.
context_lens: The length of the attention context for each sequence.
block_tables: The block tables. (Seq id -> list of physical block)
"""
def __init__(
self,
prompt_lens: List[int],
slot_mapping: torch.Tensor,
max_context_len: Optional[int],
context_lens: Optional[torch.Tensor],
block_tables: Optional[torch.Tensor],
use_cuda_graph: bool,
) -> None:
self.prompt_lens = prompt_lens
self.max_context_len = max_context_len
self.slot_mapping = slot_mapping
self.context_lens = context_lens
self.block_tables = block_tables
self.use_cuda_graph = use_cuda_graph
self.is_prompt = len(prompt_lens) > 0
# Set during the execution of the first attention op.
# FIXME(woosuk): This is a hack.
self.attn_bias = None
def __repr__(self) -> str:
return ("InputMetadata("
f"prompt_lens={self.prompt_lens}, "
f"max_context_len={self.max_context_len}, "
f"slot_mapping={self.slot_mapping}, "
f"context_lens={self.context_lens}, "
f"block_tables={self.block_tables}, "
f"use_cuda_graph={self.use_cuda_graph})")
"""Custom activation functions."""
import math
from typing import Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
from vllm._C import ops
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
from vllm.model_executor.parallel_utils.utils import divide
from vllm.model_executor.utils import set_weight_attrs
class SiluAndMul(nn.Module):
"""An activation function for SwiGLU.
The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.
Shapes:
x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d)
return: (batch_size, seq_len, d) or (num_tokens, d)
"""
def _forward(self, x: torch.Tensor) -> torch.Tensor:
"""PyTorch-native implementation equivalent to forward()."""
d = x.shape[-1] // 2
return F.silu(x[..., :d]) * x[..., d:]
def forward(self, x: torch.Tensor) -> torch.Tensor:
d = x.shape[-1] // 2
output_shape = (x.shape[:-1] + (d, ))
out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
ops.silu_and_mul(out, x)
return out
class NewGELU(nn.Module):
def _forward(self, x: torch.Tensor) -> torch.Tensor:
"""PyTorch-native implementation equivalent to forward()."""
c = math.sqrt(2.0 / math.pi)
return 0.5 * x * (1.0 + torch.tanh(c *
(x + 0.044715 * torch.pow(x, 3.0))))
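# The expression above is the tanh approximation of GELU:
#   GELU(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))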
def forward(self, x: torch.Tensor) -> torch.Tensor:
out = torch.empty_like(x)
ops.gelu_new(out, x)
return out
class FastGELU(nn.Module):
def _forward(self, x: torch.Tensor) -> torch.Tensor:
"""PyTorch-native implementation equivalent to forward()."""
return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 *
(1.0 + 0.044715 * x * x)))
def forward(self, x: torch.Tensor) -> torch.Tensor:
out = torch.empty_like(x)
ops.gelu_fast(out, x)
return out
class ScaledActivation(nn.Module):
"""An activation function with post-scale parameters.
This is used for some quantization methods like AWQ.
"""
def __init__(
self,
act_module: nn.Module,
intermediate_size: int,
input_is_parallel: bool = True,
params_dtype: Optional[torch.dtype] = None,
):
super().__init__()
self.act = act_module
self.input_is_parallel = input_is_parallel
if input_is_parallel:
tp_size = get_tensor_model_parallel_world_size()
intermediate_size_per_partition = divide(intermediate_size,
tp_size)
else:
intermediate_size_per_partition = intermediate_size
if params_dtype is None:
params_dtype = torch.get_default_dtype()
self.scales = nn.Parameter(
torch.empty(intermediate_size_per_partition,
dtype=params_dtype,
device="cuda"))
set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.act(x) / self.scales
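# weight_loader copies only this tensor-parallel rank's shard of the scale
# vector when the activation input is parallelized across ranks.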
def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
param_data = param.data
if self.input_is_parallel:
tp_rank = get_tensor_model_parallel_rank()
shard_size = param_data.shape[0]
start_idx = tp_rank * shard_size
loaded_weight = loaded_weight.narrow(0, start_idx, shard_size)
assert param_data.shape == loaded_weight.shape
param_data.copy_(loaded_weight)
_ACTIVATION_REGISTRY = {
"gelu": nn.GELU(),
"gelu_fast": FastGELU(),
"gelu_new": NewGELU(),
"gelu_pytorch_tanh": nn.GELU(approximate="tanh"),
"relu": nn.ReLU(),
}
def get_act_fn(
act_fn_name: str,
quant_config: Optional[QuantizationConfig] = None,
intermediate_size: Optional[int] = None,
input_is_parallel: bool = True,
params_dtype: Optional[torch.dtype] = None,
) -> nn.Module:
"""Get an activation function by name."""
act_fn_name = act_fn_name.lower()
if act_fn_name not in _ACTIVATION_REGISTRY:
raise ValueError(
f"Activation function {act_fn_name!r} is not supported.")
act_fn = _ACTIVATION_REGISTRY[act_fn_name]
if (quant_config is not None
and act_fn_name in quant_config.get_scaled_act_names()):
if intermediate_size is None:
raise ValueError("intermediate_size must be specified for scaled "
"activation functions.")
return ScaledActivation(act_fn, intermediate_size, input_is_parallel,
params_dtype)
return act_fn
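# Usage sketch (hypothetical helper, not part of the original file): resolve
# an activation by name. With no quant_config given, the registry entry is
# returned directly and no ScaledActivation wrapping takes place.
def _example_resolve_act() -> nn.Module:
    """Return the registry entry for "gelu_new", i.e. the NewGELU module."""
    return get_act_fn("gelu_new")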
"""Multi-head attention."""
from typing import List, Optional
import torch
import torch.nn as nn
from xformers import ops as xops
from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask,
LowerTriangularMaskWithTensorBias)
from vllm._C import ops
from vllm._C import cache_ops
from vllm.model_executor.input_metadata import InputMetadata
from vllm.utils import is_hip
_SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256]
# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
_PARTITION_SIZE = 512
class PagedAttention(nn.Module):
"""MHA/MQA/GQA layer with PagedAttention.
This class takes query, key, and value tensors as input. The input tensors
can either contain prompt tokens or generation tokens.
The class does the following:
1. Reshape and store the input key and value tensors in the KV cache.
2. Perform (multi-head/multi-query/grouped-query) attention using either
xformers or the PagedAttention custom op.
3. Return the output tensor.
"""
def __init__(
self,
num_heads: int,
head_size: int,
scale: float,
num_kv_heads: Optional[int] = None,
alibi_slopes: Optional[List[float]] = None,
sliding_window: Optional[int] = None,
) -> None:
super().__init__()
self.num_heads = num_heads
self.head_size = head_size
self.scale = float(scale)
self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
self.sliding_window = sliding_window
if alibi_slopes is not None:
alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
self.register_buffer("alibi_slopes", alibi_slopes, persistent=False)
assert self.num_heads % self.num_kv_heads == 0
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
if self.head_size not in _SUPPORTED_HEAD_SIZES:
raise ValueError(f"head_size ({self.head_size}) is not supported. "
f"Supported head sizes: {_SUPPORTED_HEAD_SIZES}.")
def forward(
self,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
key_cache: Optional[torch.Tensor],
value_cache: Optional[torch.Tensor],
input_metadata: InputMetadata,
) -> torch.Tensor:
"""PagedAttention forward pass.
Args:
query: shape = [batch_size, seq_len, num_heads * head_size]
key: shape = [batch_size, seq_len, num_kv_heads * head_size]
value: shape = [batch_size, seq_len, num_kv_heads * head_size]
key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
block_size, x]
value_cache: shape = [num_blocks, num_kv_heads, head_size,
block_size]
input_metadata: metadata for the inputs.
Returns:
shape = [batch_size, seq_len, num_heads * head_size]
"""
batch_size, seq_len, hidden_size = query.shape
# Reshape the query, key, and value tensors.
query = query.view(-1, self.num_heads, self.head_size)
key = key.view(-1, self.num_kv_heads, self.head_size)
value = value.view(-1, self.num_kv_heads, self.head_size)
# Reshape the keys and values and store them in the cache.
# If key_cache and value_cache are not provided, the new key and value
# vectors will not be cached. This happens during the initial memory
# profiling run.
if key_cache is not None and value_cache is not None:
cache_ops.reshape_and_cache(
key,
value,
key_cache,
value_cache,
input_metadata.slot_mapping.flatten(),
)
if input_metadata.is_prompt:
# Prompt run.
if self.num_kv_heads != self.num_heads:
# As of Nov 2023, xformers only supports MHA. For MQA/GQA,
# project the key and value tensors to the desired number of
# heads.
# TODO(woosuk): Use MQA/GQA kernels for higher performance.
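                # Illustrative shapes (hypothetical sizes): with num_heads=32
                # and num_kv_heads=8, query [T, 32, D] is viewed as
                # [T, 8, 4, D] while key/value [T, 8, D] are expand()ed
                # (no copy) to [T, 8, 4, D], so xformers sees matching head
                # counts.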
query = query.view(query.shape[0], self.num_kv_heads,
self.num_queries_per_kv, query.shape[-1])
key = key[:, :,
None, :].expand(key.shape[0], self.num_kv_heads,
self.num_queries_per_kv,
key.shape[-1])
value = value[:, :, None, :].expand(value.shape[0],
self.num_kv_heads,
self.num_queries_per_kv,
value.shape[-1])
            # Set attention bias if not provided. This typically happens at
            # the very first attention layer of every iteration.
# FIXME(woosuk): This is a hack.
if input_metadata.attn_bias is None:
if self.alibi_slopes is None:
attn_bias = BlockDiagonalCausalMask.from_seqlens(
[seq_len] * batch_size)
if self.sliding_window is not None:
attn_bias = attn_bias.make_local_attention(
self.sliding_window)
input_metadata.attn_bias = attn_bias
else:
input_metadata.attn_bias = _make_alibi_bias(
self.alibi_slopes, self.num_kv_heads, batch_size,
seq_len, query.dtype)
# TODO(woosuk): Too many view operations. Let's try to reduce them
# in the future for code readability.
if self.alibi_slopes is None:
query = query.unsqueeze(0)
key = key.unsqueeze(0)
value = value.unsqueeze(0)
else:
query = query.unflatten(0, (batch_size, seq_len))
key = key.unflatten(0, (batch_size, seq_len))
value = value.unflatten(0, (batch_size, seq_len))
out = xops.memory_efficient_attention_forward(
query,
key,
value,
attn_bias=input_metadata.attn_bias,
p=0.0,
scale=self.scale,
op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if
(is_hip()) else None,
)
output = out.view_as(query)
else:
# Decoding run.
if key_cache is not None and value_cache is not None:
output = _paged_attention(
query,
key_cache,
value_cache,
input_metadata,
self.num_kv_heads,
self.scale,
self.alibi_slopes,
)
else:
# This happens during the initial memory profiling run for
# CUDA graphs.
output = torch.zeros_like(query)
# Reshape the output tensor.
return output.view(batch_size, seq_len, hidden_size)
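# Construction sketch (hypothetical sizes, not part of the original file): a
# GQA layer with 32 query heads sharing 8 KV heads.
#
#     attn = PagedAttention(num_heads=32, head_size=128,
#                           scale=128 ** -0.5, num_kv_heads=8)
#
# Prompt runs take the xformers path above; decode runs call the paged
# attention kernels below, reading K/V from the block-based cache.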
def _make_alibi_bias(
alibi_slopes: torch.Tensor,
num_kv_heads: int,
batch_size: int,
seq_len: int,
dtype: torch.dtype,
) -> LowerTriangularMaskWithTensorBias:
bias = torch.arange(seq_len, dtype=dtype, device="cuda")
# NOTE(zhuohan): HF uses
# `bias = bias[None, :].repeat(prompt_len, 1)`
# here. We find that both biases give the same results, but
# the bias below more accurately follows the original ALiBi
# paper.
bias = bias[None, :] - bias[:, None]
# When using custom attention bias, xformers requires the bias to
# be sliced from a tensor whose length is a multiple of 8.
padded_len = (seq_len + 7) // 8 * 8
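    # e.g. seq_len=13 -> padded_len=16; the padded buffer below is sliced
    # back to seq_len, so only its stride reflects the alignment requirement.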
num_heads = alibi_slopes.shape[0]
bias = torch.empty(
batch_size,
num_heads,
seq_len,
padded_len,
device=alibi_slopes.device,
dtype=dtype,
)[:, :, :, :seq_len].copy_(bias)
bias.mul_(alibi_slopes[:, None, None])
if num_heads != num_kv_heads:
bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads))
attn_bias = LowerTriangularMaskWithTensorBias(bias)
return attn_bias
def _paged_attention(
query: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
input_metadata: InputMetadata,
num_kv_heads: int,
scale: float,
alibi_slopes: Optional[torch.Tensor],
) -> torch.Tensor:
output = torch.empty_like(query)
block_size = value_cache.shape[3]
num_seqs, num_heads, head_size = query.shape
max_num_partitions = (
(input_metadata.max_context_len + _PARTITION_SIZE - 1) //
_PARTITION_SIZE)
# NOTE(woosuk): We use a simple heuristic to decide whether to use
# PagedAttention V1 or V2. If the number of partitions is 1, we use
# V1 to avoid the overhead of reduction. Also, if the number of
# sequences or heads is large, we use V1 since there is enough work
# to parallelize.
# TODO(woosuk): Tune this heuristic.
# For context len > 8192, use V2 kernel to avoid shared memory shortage.
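    # Worked example (illustrative): max_context_len=4096 gives
    # max_num_partitions = (4096 + 511) // 512 = 8, so V1 is chosen only when
    # num_seqs * num_heads > 512; a 4K context with a small batch therefore
    # falls through to V2.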
use_v1 = input_metadata.max_context_len <= 8192 and (
max_num_partitions == 1 or num_seqs * num_heads > 512)
if use_v1:
# Run PagedAttention V1.
ops.paged_attention_v1(
output,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
input_metadata.block_tables,
input_metadata.context_lens,
block_size,
input_metadata.max_context_len,
alibi_slopes,
)
else:
# Run PagedAttention V2.
assert _PARTITION_SIZE % block_size == 0
tmp_output = torch.empty(
size=(num_seqs, num_heads, max_num_partitions, head_size),
dtype=output.dtype,
device=output.device,
)
exp_sums = torch.empty(
size=(num_seqs, num_heads, max_num_partitions),
dtype=torch.float32,
device=output.device,
)
max_logits = torch.empty_like(exp_sums)
ops.paged_attention_v2(
output,
exp_sums,
max_logits,
tmp_output,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
input_metadata.block_tables,
input_metadata.context_lens,
block_size,
input_metadata.max_context_len,
alibi_slopes,
)
return output