norm / vllm · Commits

Commit 057daef7 (unverified)
Authored May 23, 2023 by Zhuohan Li; committed by GitHub on May 23, 2023.
Parent: e8671783

OpenAI Compatible Frontend (#116)

Showing 20 changed files with 645 additions and 170 deletions (+645, -170).
cacheflow/core/block_manager.py                   +3    -3
cacheflow/core/scheduler.py                       +4    -2
cacheflow/entrypoints/openai/openai_frontend.py   +300  -0
cacheflow/entrypoints/openai/protocol.py          +126  -0
cacheflow/entrypoints/simple_fastapi_frontend.py  +51   -0
cacheflow/model_executor/layers/sampler.py        +3    -3
cacheflow/outputs.py                              +18   -11
cacheflow/sampling_params.py                      +2    -2
cacheflow/sequence.py                             +21   -5
cacheflow/server/async_llm_server.py              +21   -46
cacheflow/server/llm_server.py                    +8    -5
cacheflow/utils.py                                +5    -0
examples/gradio_webserver.py                      +7    -6
examples/openai_client.py                         +22   -0
examples/simple_fastapi_client.py                 +48   -0
examples/simple_server.py                         +5    -4
playground/http_client.py                         +0    -20
playground/streaming_fastapi_worker.py            +0    -40
requirements.txt                                  +1    -0
test_cli_client.py                                +0    -23
cacheflow/core/block_manager.py

@@ -148,7 +148,7 @@ class BlockSpaceManager:
         # the sequences in the same group.
         blocks: Set[PhysicalTokenBlock] = set()
         for seq in seq_group.get_seqs():
-            if seq.status == SequenceStatus.FINISHED:
+            if SequenceStatus.is_finished(seq.status):
                 continue
             block_table = self.block_tables[seq.seq_id]
             for block in block_table:
@@ -169,7 +169,7 @@ class BlockSpaceManager:
         # CPU block -> GPU block.
         mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
         for seq in seq_group.get_seqs():
-            if seq.status == SequenceStatus.FINISHED:
+            if SequenceStatus.is_finished(seq.status):
                 continue
             new_block_table: BlockTable = []
             block_table = self.block_tables[seq.seq_id]
@@ -200,7 +200,7 @@ class BlockSpaceManager:
         # GPU block -> CPU block.
         mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
         for seq in seq_group.get_seqs():
-            if seq.status == SequenceStatus.FINISHED:
+            if SequenceStatus.is_finished(seq.status):
                 continue
             new_block_table: BlockTable = []
             block_table = self.block_tables[seq.seq_id]
cacheflow/core/scheduler.py

@@ -292,10 +292,12 @@ class Scheduler:
             # Append a new token to the sequence.
             output = seq_outputs[seq.seq_id]
             seq.append_token_id(output.output_token, output.logprobs)
+        # Return a shallow copy of the running queue to prevent the queue
+        # from being modified by the caller.
         return self.running.copy()

-    def free_seq(self, seq: Sequence) -> None:
-        seq.status = SequenceStatus.FINISHED
+    def free_seq(self, seq: Sequence, finish_status: SequenceStatus) -> None:
+        seq.status = finish_status
         self.block_manager.free(seq)

     def free_finished_seq_groups(self) -> None:
cacheflow/entrypoints/openai/openai_frontend.py  (new file, mode 100644)

# Adapted from https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/serve/openai_api_server.py

import argparse
from http import HTTPStatus
import json
import time
from typing import AsyncGenerator, Dict, List, Optional

import fastapi
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse, JSONResponse
import uvicorn

from cacheflow.outputs import RequestOutput
from cacheflow.server.arg_utils import ServerArgs
from cacheflow.server.async_llm_server import AsyncLLMServer
from cacheflow.server.tokenizer_utils import get_tokenizer
from cacheflow.logger import init_logger
from cacheflow.sampling_params import SamplingParams
from cacheflow.utils import random_uuid
from cacheflow.entrypoints.openai.protocol import (
    CompletionRequest, CompletionResponse, CompletionResponseChoice,
    CompletionResponseStreamChoice, CompletionStreamResponse, ErrorResponse,
    LogProbs, ModelCard, ModelList, ModelPermission, UsageInfo)

logger = init_logger(__name__)
served_model = None
app = fastapi.FastAPI()


def create_error_response(status_code: HTTPStatus,
                          message: str) -> JSONResponse:
    return JSONResponse(ErrorResponse(message=message,
                                      type="invalid_request_error").dict(),
                        status_code=status_code.value)


@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request, exc):
    return create_error_response(HTTPStatus.BAD_REQUEST, str(exc))


async def check_model(request) -> Optional[JSONResponse]:
    if request.model == served_model:
        return
    ret = create_error_response(
        HTTPStatus.NOT_FOUND,
        f"The model `{request.model}` does not exist.",
    )
    return ret


@app.get("/v1/models")
async def show_available_models():
    """Show available models. Right now we only have one model."""
    model_cards = [ModelCard(id=served_model, root=served_model,
                             permission=[ModelPermission()])]
    return ModelList(data=model_cards)


def create_logprobs(token_ids: List[int],
                    id_logprobs: List[Dict[int, float]],
                    initial_text_offset: int = 0) -> LogProbs:
    """Create OpenAI-style logprobs."""
    logprobs = LogProbs()
    last_token_len = 0
    for token_id, id_logprob in zip(token_ids, id_logprobs):
        token = tokenizer.convert_ids_to_tokens(token_id)
        logprobs.tokens.append(token)
        logprobs.token_logprobs.append(id_logprob[token_id])
        if len(logprobs.text_offset) == 0:
            logprobs.text_offset.append(initial_text_offset)
        else:
            logprobs.text_offset.append(logprobs.text_offset[-1] + last_token_len)
        last_token_len = len(token)

        logprobs.top_logprobs.append(
            {tokenizer.convert_ids_to_tokens(i): p
             for i, p in id_logprob.items()})
    return logprobs


@app.post("/v1/completions")
async def create_completion(request: CompletionRequest):
    logger.info(f"Received completion request: {request}")

    error_check_ret = await check_model(request)
    if error_check_ret is not None:
        return error_check_ret

    if request.echo:
        # We do not support echo since the cacheflow server does not
        # currently support getting the logprobs of prompt tokens.
        return create_error_response(HTTPStatus.BAD_REQUEST,
                                     "echo is not currently supported")

    if request.suffix is not None:
        # The language models we currently support do not support suffix.
        return create_error_response(HTTPStatus.BAD_REQUEST,
                                     "suffix is not currently supported")

    if request.logit_bias is not None:
        # TODO: support logit_bias in cacheflow server.
        return create_error_response(HTTPStatus.BAD_REQUEST,
                                     "logit_bias is not currently supported")

    model_name = request.model
    request_id = f"cmpl-{random_uuid()}"
    prompt = request.prompt
    created_time = int(time.time())
    try:
        sampling_params = SamplingParams(
            n=request.n,
            best_of=request.best_of,
            presence_penalty=request.presence_penalty,
            frequency_penalty=request.frequency_penalty,
            temperature=request.temperature,
            top_p=request.top_p,
            top_k=request.top_k,
            stop=request.stop,
            ignore_eos=request.ignore_eos,
            max_tokens=request.max_tokens,
            logprobs=request.logprobs,
            use_beam_search=request.use_beam_search,
        )
    except ValueError as e:
        return create_error_response(HTTPStatus.BAD_REQUEST, str(e))

    result_generator = server.generate(prompt, sampling_params,
                                       request_id=request_id)

    # Similar to the OpenAI API, when n != best_of, we do not stream the
    # results. In addition, we do not stream the results when use beam search.
    stream = (request.stream and
              (request.best_of is None or request.n == request.best_of) and
              not request.use_beam_search)

    def create_stream_response_json(index: int,
                                    text: str,
                                    logprobs: Optional[LogProbs] = None,
                                    finish_reason: Optional[str] = None) -> str:
        choice_data = CompletionResponseStreamChoice(
            index=index,
            text=text,
            logprobs=logprobs,
            finish_reason=finish_reason,
        )
        response = CompletionStreamResponse(
            id=request_id,
            created=created_time,
            model=model_name,
            choices=[choice_data],
        )
        response_json = response.json(ensure_ascii=False)
        return response_json

    async def completion_stream_generator() -> AsyncGenerator[str, None]:
        previous_texts = [""] * request.n
        previous_num_tokens = [0] * request.n
        async for res in result_generator:
            res: RequestOutput
            for output in res.outputs:
                i = output.index
                delta_text = output.text[len(previous_texts[i]):]
                if request.logprobs is not None:
                    logprobs = create_logprobs(
                        output.token_ids[previous_num_tokens[i]:],
                        output.logprobs[previous_num_tokens[i]:],
                        len(previous_texts[i]))
                else:
                    logprobs = None
                previous_texts[i] = output.text
                previous_num_tokens[i] = len(output.token_ids)
                response_json = create_stream_response_json(
                    index=i,
                    text=delta_text,
                    logprobs=logprobs,
                )
                yield f"data: {response_json}\n\n"
                if output.finish_reason is not None:
                    logprobs = LogProbs() if request.logprobs is not None else None
                    response_json = create_stream_response_json(
                        index=i,
                        text="",
                        logprobs=logprobs,
                        finish_reason=output.finish_reason,
                    )
                    yield f"data: {response_json}\n\n"
        yield "data: [DONE]\n\n"

    # Streaming response
    if stream:
        return StreamingResponse(completion_stream_generator(),
                                 media_type="text/event-stream")

    # Non-streaming response
    final_res: RequestOutput = None
    async for res in result_generator:
        final_res = res
    assert final_res is not None
    choices = []
    for output in final_res.outputs:
        if request.logprobs is not None:
            logprobs = create_logprobs(output.token_ids, output.logprobs)
        else:
            logprobs = None
        choice_data = CompletionResponseChoice(
            index=output.index,
            text=output.text,
            logprobs=logprobs,
            finish_reason=output.finish_reason,
        )
        choices.append(choice_data)

    num_prompt_tokens = len(final_res.prompt_token_ids)
    num_generated_tokens = sum(len(output.token_ids)
                               for output in final_res.outputs)
    usage = UsageInfo(
        prompt_tokens=num_prompt_tokens,
        completion_tokens=num_generated_tokens,
        total_tokens=num_prompt_tokens + num_generated_tokens,
    )
    response = CompletionResponse(
        id=request_id,
        created=created_time,
        model=model_name,
        choices=choices,
        usage=usage,
    )

    if request.stream:
        # When user requests streaming but we don't stream, we still need to
        # return a streaming response with a single event.
        response_json = response.json(ensure_ascii=False)

        async def fake_stream_generator() -> AsyncGenerator[str, None]:
            yield f"data: {response_json}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(fake_stream_generator(),
                                 media_type="text/event-stream")

    return response


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="CacheFlow OpenAI-Compatible RESTful API server.")
    parser.add_argument("--host", type=str, default="localhost", help="host name")
    parser.add_argument("--port", type=int, default=8000, help="port number")
    parser.add_argument("--allow-credentials", action="store_true",
                        help="allow credentials")
    parser.add_argument("--allowed-origins", type=json.loads, default=["*"],
                        help="allowed origins")
    parser.add_argument("--allowed-methods", type=json.loads, default=["*"],
                        help="allowed methods")
    parser.add_argument("--allowed-headers", type=json.loads, default=["*"],
                        help="allowed headers")
    parser.add_argument("--served-model-name", type=str, default=None,
                        help="The model name used in the API. If not specified, "
                             "the model name will be the same as the "
                             "huggingface name.")
    parser = ServerArgs.add_cli_args(parser)
    args = parser.parse_args()

    app.add_middleware(
        CORSMiddleware,
        allow_origins=args.allowed_origins,
        allow_credentials=args.allow_credentials,
        allow_methods=args.allowed_methods,
        allow_headers=args.allowed_headers,
    )

    logger.info(f"args: {args}")

    served_model = args.served_model_name or args.model

    server_args = ServerArgs.from_cli_args(args)
    server = AsyncLLMServer.from_server_args(server_args)

    # A separate tokenizer to map token IDs to strings.
    tokenizer = get_tokenizer(args.model)

    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
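A minimal sketch (not part of this commit) of exercising the new endpoints once the frontend is running, e.g. launched as python -m cacheflow.entrypoints.openai.openai_frontend --model facebook/opt-125m (the launch command and model name are assumptions for illustration):

# Query the OpenAI-compatible completions endpoint with plain HTTP.
import json
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "facebook/opt-125m",    # must match the served model name
        "prompt": "San Francisco is a",
        "max_tokens": 16,
        "temperature": 0.0,
    },
)
print(json.dumps(resp.json(), indent=2))

# The model-listing route added above can be checked the same way.
print(requests.get("http://localhost:8000/v1/models").json())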
cacheflow/entrypoints/openai/protocol.py  (new file, mode 100644)

# Adapted from https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py

import time
from typing import Dict, List, Literal, Optional, Union

from pydantic import BaseModel, Field

from cacheflow.utils import random_uuid


class ErrorResponse(BaseModel):
    object: str = "error"
    message: str
    type: str
    param: Optional[str] = None
    code: Optional[str] = None


class ModelPermission(BaseModel):
    id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
    object: str = "model_permission"
    created: int = Field(default_factory=lambda: int(time.time()))
    allow_create_engine: bool = False
    allow_sampling: bool = True
    allow_logprobs: bool = True
    allow_search_indices: bool = False
    allow_view: bool = True
    allow_fine_tuning: bool = False
    organization: str = "*"
    group: Optional[str] = None
    is_blocking: str = False


class ModelCard(BaseModel):
    id: str
    object: str = "model"
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "cacheflow"
    root: Optional[str] = None
    parent: Optional[str] = None
    permission: List[ModelPermission] = Field(default_factory=list)


class ModelList(BaseModel):
    object: str = "list"
    data: List[ModelCard] = Field(default_factory=list)


class UsageInfo(BaseModel):
    prompt_tokens: int = 0
    total_tokens: int = 0
    completion_tokens: Optional[int] = 0


class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[Dict[str, str]]
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 1.0
    n: Optional[int] = 1
    max_tokens: Optional[int] = None
    stop: Optional[Union[str, List[str]]] = None
    stream: Optional[bool] = False
    presence_penalty: Optional[float] = 0.0
    frequency_penalty: Optional[float] = 0.0
    user: Optional[str] = None


class CompletionRequest(BaseModel):
    model: str
    prompt: str
    suffix: Optional[str] = None
    max_tokens: Optional[int] = 16
    temperature: Optional[float] = 1.0
    top_p: Optional[float] = 1.0
    n: Optional[int] = 1
    stream: Optional[bool] = False
    logprobs: Optional[int] = None
    echo: Optional[bool] = False
    stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
    presence_penalty: Optional[float] = 0.0
    frequency_penalty: Optional[float] = 0.0
    best_of: Optional[int] = None
    logit_bias: Optional[Dict[str, float]] = None
    user: Optional[str] = None
    # Additional parameters supported by cacheflow
    top_k: Optional[int] = -1
    ignore_eos: Optional[bool] = False
    use_beam_search: Optional[bool] = False


class LogProbs(BaseModel):
    text_offset: List[int] = Field(default_factory=list)
    token_logprobs: List[Optional[float]] = Field(default_factory=list)
    tokens: List[str] = Field(default_factory=list)
    top_logprobs: List[Optional[Dict[str, float]]] = Field(default_factory=list)


class CompletionResponseChoice(BaseModel):
    index: int
    text: str
    logprobs: Optional[LogProbs] = None
    finish_reason: Optional[Literal["stop", "length"]] = None


class CompletionResponse(BaseModel):
    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
    object: str = "text_completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[CompletionResponseChoice]
    usage: UsageInfo


class CompletionResponseStreamChoice(BaseModel):
    index: int
    text: str
    logprobs: Optional[LogProbs] = None
    finish_reason: Optional[Literal["stop", "length"]] = None


class CompletionStreamResponse(BaseModel):
    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
    object: str = "text_completion"
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[CompletionResponseStreamChoice]
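A small usage sketch (not in the commit) of these request/response models; it assumes pydantic v1, whose .parse_raw()/.json() methods the frontend above relies on:

from cacheflow.entrypoints.openai.protocol import CompletionRequest

# Unset fields fall back to the defaults declared above, including the
# cacheflow-specific extras (top_k, ignore_eos, use_beam_search).
req = CompletionRequest.parse_raw(
    '{"model": "facebook/opt-125m", "prompt": "Hello", "max_tokens": 8}')
assert req.n == 1 and req.top_k == -1 and req.use_beam_search is False
print(req.json(exclude_unset=True))   # serialize only what the client sent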
cacheflow/entrypoints/simple_fastapi_frontend.py  (new file, mode 100644)

import argparse
import json
from typing import AsyncGenerator

from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
import uvicorn

from cacheflow.sampling_params import SamplingParams
from cacheflow.server.arg_utils import ServerArgs
from cacheflow.server.async_llm_server import AsyncLLMServer
from cacheflow.server.ray_utils import initialize_cluster

TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds
app = FastAPI()


@app.post("/generate")
async def generate_stream(request: Request) -> StreamingResponse:
    request_dict = await request.json()
    prompt = request_dict.pop("prompt")
    sampling_params = SamplingParams(**request_dict)
    results_generator = server.generate(prompt, sampling_params)

    async def stream_results() -> AsyncGenerator[bytes, None]:
        async for request_output in results_generator:
            prompt = request_output.prompt
            text_outputs = [prompt + output.text
                            for output in request_output.outputs]
            ret = {
                "text": text_outputs,
                "error": 0,
            }
            yield (json.dumps(ret) + "\0").encode("utf-8")

    return StreamingResponse(stream_results())


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8001)
    parser = ServerArgs.add_cli_args(parser)
    args = parser.parse_args()

    server_args = ServerArgs.from_cli_args(args)
    server = AsyncLLMServer.from_server_args(server_args)

    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
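For reference, a non-streaming sketch (not in the commit) of hitting the /generate route, assuming the default port 8001; the streaming variant is the new examples/simple_fastapi_client.py further down:

import json
import requests

resp = requests.post("http://localhost:8001/generate",
                     json={"prompt": "Hello, my name is", "n": 1, "max_tokens": 16})
# The route streams NUL-delimited JSON objects; when the body is read in one go,
# the last object carries the final text.
chunks = [c for c in resp.text.split("\0") if c]
print(json.loads(chunks[-1])["text"])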
cacheflow/model_executor/layers/sampler.py

 """A layer that samples the next tokens from the model's outputs."""
-from typing import Dict, List, Tuple
+from typing import Dict, List, Tuple, Optional

 import numpy as np
 import torch

@@ -258,9 +258,9 @@ def _apply_top_p_top_k(
 def _get_topk_logprobs(
     logprobs: torch.Tensor,
-    num_logprobs: int,
+    num_logprobs: Optional[int],
 ) -> Dict[int, float]:
-    if num_logprobs == 0:
+    if num_logprobs is None or num_logprobs == 0:
         return {}

     topk_logprobs, topk_ids = torch.topk(logprobs, num_logprobs)
cacheflow/outputs.py

-from typing import Dict, List
+from typing import Dict, List, Optional

-from cacheflow.sequence import SequenceGroup
+from cacheflow.sequence import SequenceGroup, SequenceStatus


 class CompletionOutput:

@@ -12,19 +12,25 @@ class CompletionOutput:
         token_ids: List[int],
         cumulative_logprob: float,
         logprobs: List[Dict[int, float]],
+        finish_reason: Optional[str] = None,
     ) -> None:
         self.index = index
         self.text = text
         self.token_ids = token_ids
         self.cumulative_logprob = cumulative_logprob
         self.logprobs = logprobs
+        self.finish_reason = finish_reason
+
+    def finished(self) -> bool:
+        return self.finish_reason is not None

     def __repr__(self) -> str:
         return (f"CompletionOutput(index={self.index}, "
                 f"text={self.text!r}, "
                 f"token_ids={self.token_ids}, "
                 f"cumulative_logprob={self.cumulative_logprob}, "
-                f"logprobs={self.logprobs})")
+                f"logprobs={self.logprobs},"
+                f"finish_reason={self.finish_reason})")


 class RequestOutput:

@@ -35,13 +41,11 @@ class RequestOutput:
         prompt: str,
         prompt_token_ids: List[int],
         outputs: List[CompletionOutput],
-        done: bool,
     ) -> None:
         self.request_id = request_id
         self.prompt = prompt
         self.prompt_token_ids = prompt_token_ids
         self.outputs = outputs
-        self.done = done

     @classmethod
     def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput":

@@ -57,25 +61,28 @@ class RequestOutput:
         outputs: List[CompletionOutput] = []
         for seq in top_n_seqs:
             logprobs = seq.output_logprobs
-            if seq_group.sampling_params.logprobs == 0:
+            if seq_group.sampling_params.logprobs is None:
                 # NOTE: We need to take care of this case because the sequence
                 # always has the logprobs of the sampled tokens even if the
                 # logprobs are not requested.
                 logprobs = {}
+            finshed_reason = SequenceStatus.get_finished_reason(seq.status)
             output = CompletionOutput(seqs.index(seq), seq.output_text,
                                       seq.get_output_token_ids(),
-                                      seq.get_cumulative_logprob(), logprobs)
+                                      seq.get_cumulative_logprob(), logprobs,
+                                      finshed_reason)
             outputs.append(output)

         # Every sequence in the sequence group should have the same prompt.
         prompt = top_n_seqs[0].prompt
         prompt_token_ids = top_n_seqs[0].data.prompt_token_ids
-        return cls(seq_group.request_id, prompt, prompt_token_ids, outputs,
-                   seq_group.is_finished())
+        return cls(seq_group.request_id, prompt, prompt_token_ids, outputs)

     def __repr__(self) -> str:
         return (f"RequestOutput(request_id={self.request_id}, "
                 f"prompt={self.prompt!r}, "
                 f"prompt_token_ids={self.prompt_token_ids}, "
-                f"outputs={self.outputs}, "
-                f"done={self.done})")
+                f"outputs={self.outputs})")
+
+    def finished(self) -> bool:
+        return all(output.finished() for output in self.outputs)
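A hand-rolled sketch (not in the commit) of the new finished()/finish_reason contract; real objects are produced by the server, and the token IDs and logprobs below are placeholders:

from cacheflow.outputs import CompletionOutput, RequestOutput

done = CompletionOutput(0, " Paris.", [2], -0.5, [], finish_reason="stop")
pending = CompletionOutput(1, " Lyon", [3], -0.9, [])          # still generating
req = RequestOutput("req-0", "The capital of France is", [1], [done, pending])

assert done.finished() and not pending.finished()
assert not req.finished()   # finished only once every candidate has a finish_reason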
cacheflow/sampling_params.py

@@ -53,7 +53,7 @@ class SamplingParams:
         stop: Union[str, List[str]] = [],
         ignore_eos: bool = False,
         max_tokens: int = 16,
-        logprobs: int = 0,
+        logprobs: Optional[int] = None,
     ) -> None:
         self.n = n
         self.best_of = best_of if best_of is not None else n

@@ -98,7 +98,7 @@ class SamplingParams:
         if self.max_tokens < 1:
             raise ValueError(
                 f"max_tokens must be at least 1, got {self.max_tokens}.")
-        if self.logprobs < 0:
+        if self.logprobs is not None and self.logprobs < 0:
             raise ValueError(
                 f"logprobs must be non-negative, got {self.logprobs}.")
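The practical effect of the changed default, as a short sketch (not in the commit): logprobs is now Optional, so "not requested" (None) is distinct from "request zero logprobs":

from cacheflow.sampling_params import SamplingParams

params = SamplingParams(max_tokens=16)                 # logprobs defaults to None
assert params.logprobs is None
params_lp = SamplingParams(max_tokens=16, logprobs=3)  # ask for top-3 logprobs per token
assert params_lp.logprobs == 3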
cacheflow/sequence.py

 import copy
 import enum
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union

 from cacheflow.block import LogicalTokenBlock
 from cacheflow.sampling_params import SamplingParams

@@ -10,8 +10,25 @@ class SequenceStatus(enum.Enum):
     WAITING = enum.auto()
     RUNNING = enum.auto()
     SWAPPED = enum.auto()
-    FINISHED = enum.auto()
+    FINISHED_STOPPED = enum.auto()
+    FINISHED_LENGTH_CAPPED = enum.auto()
+
+    @staticmethod
+    def is_finished(status: "SequenceStatus") -> bool:
+        return status in [
+            SequenceStatus.FINISHED_STOPPED,
+            SequenceStatus.FINISHED_LENGTH_CAPPED,
+        ]
+
+    @staticmethod
+    def get_finished_reason(status: "SequenceStatus") -> Union[str, None]:
+        if status == SequenceStatus.FINISHED_STOPPED:
+            finish_reason = "stop"
+        elif status == SequenceStatus.FINISHED_LENGTH_CAPPED:
+            finish_reason = "length"
+        else:
+            finish_reason = None
+        return finish_reason


 class SequenceData:

@@ -20,7 +37,6 @@ class SequenceData:
         prompt_token_ids: List[int],
     ) -> None:
         self.prompt_token_ids = prompt_token_ids
         self.output_token_ids: List[int] = []
         self.cumulative_logprob = 0.0

@@ -166,7 +182,7 @@ class SequenceGroup:
             raise ValueError(f'Sequence {seq_id} not found.')

     def is_finished(self) -> bool:
-        return all(seq.status == SequenceStatus.FINISHED for seq in self.seqs)
+        return all(SequenceStatus.is_finished(seq.status) for seq in self.seqs)

     def __repr__(self) -> str:
         return (f"SequenceGroup(request_id={self.request_id}, "
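The new helpers behave as follows (a direct illustration of the code above):

from cacheflow.sequence import SequenceStatus

assert SequenceStatus.is_finished(SequenceStatus.FINISHED_STOPPED)
assert not SequenceStatus.is_finished(SequenceStatus.RUNNING)
assert SequenceStatus.get_finished_reason(SequenceStatus.FINISHED_STOPPED) == "stop"
assert SequenceStatus.get_finished_reason(SequenceStatus.FINISHED_LENGTH_CAPPED) == "length"
assert SequenceStatus.get_finished_reason(SequenceStatus.SWAPPED) is None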
cacheflow/entrypoints/fastapi_server.py → cacheflow/server/async_llm_server.py

-import argparse
 import asyncio
-import json
 import time
-from typing import Any, Dict
+from typing import Dict, Optional
-import uuid

-from fastapi import FastAPI, Request
-from fastapi.responses import StreamingResponse
 import ray
-import uvicorn

 from cacheflow.outputs import RequestOutput
 from cacheflow.sampling_params import SamplingParams
 from cacheflow.server.arg_utils import ServerArgs
 from cacheflow.server.llm_server import LLMServer
 from cacheflow.server.ray_utils import initialize_cluster
+from cacheflow.utils import random_uuid

 TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds

-app = FastAPI()


-class FastAPIServer:
+class AsyncLLMServer:
     def __init__(self, server_use_ray: bool, *args, **kwargs) -> None:
         if server_use_ray:

@@ -45,15 +39,15 @@ class FastAPIServer:
         self.request_outputs[request_id] = request_output
         self.request_events[request_id].set()

-    async def generate(self, request_dict: Dict[str, Any]):
+    async def generate(self, prompt: str, sampling_params: SamplingParams,
+                       request_id: Optional[str] = None) -> RequestOutput:
         # Preprocess the request.
         arrival_time = time.time()
-        prompt = request_dict.pop("prompt")
-        sampling_params = SamplingParams(**request_dict)

         # Create an event to notify us that there is new output from the
         # cacheflow server.
-        request_id = str(uuid.uuid4().hex[:8])
+        if request_id is None:
+            request_id = random_uuid()
         request_event = asyncio.Event()
         self.request_events[request_id] = request_event

@@ -82,19 +76,10 @@ class FastAPIServer:
             # Decode and return new outputs.
             request_output = self.request_outputs[request_id]
-            prompt = request_output.prompt
-            text_outputs = [prompt + output.text
-                            for output in request_output.outputs]
-            ret = {
-                "text": text_outputs,
-                "error": 0,
-            }
-            yield (json.dumps(ret) + "\0").encode("utf-8")
+            yield request_output

             # Once finished, release the resources of the sequence group.
-            if request_output.done:
+            if request_output.finished():
                 del self.request_outputs[request_id]
                 del self.request_events[request_id]
                 # Kick the server if the server is not running. This is to

@@ -104,25 +89,15 @@ class FastAPIServer:
                 await self.server_step()
             break

-@app.post("/generate")
-async def generate_stream(request: Request):
-    request_dict = await request.json()
-    return StreamingResponse(server.generate(request_dict))
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--host", type=str, default="localhost")
-    parser.add_argument("--port", type=int, default=10002)
-    parser = ServerArgs.add_cli_args(parser)
-    args = parser.parse_args()
-
-    server_configs = ServerArgs.from_cli_args(args).create_server_configs()
-    parallel_config = server_configs[2]
-    distributed_init_method, stage_devices = initialize_cluster(parallel_config)
-    server = FastAPIServer(args.use_ray, *server_configs,
-                           distributed_init_method, stage_devices,
-                           log_stats=not args.disable_log_stats)
-
-    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
+    @classmethod
+    def from_server_args(cls, server_args: ServerArgs) -> "AsyncLLMServer":
+        # Create the server configs.
+        server_configs = server_args.create_server_configs()
+        parallel_config = server_configs[2]
+        # Initialize the cluster.
+        distributed_init_method, devices = initialize_cluster(parallel_config)
+        # Create the LLM server.
+        server = cls(server_args.use_ray, *server_configs,
+                     distributed_init_method, devices,
+                     log_stats=not server_args.disable_log_stats)
+        return server
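A sketch (not in the commit) of driving the renamed class programmatically; it builds ServerArgs through the same CLI helpers the entrypoints above use, and the --model flag and model name are taken from those entrypoints:

import argparse
import asyncio

from cacheflow.sampling_params import SamplingParams
from cacheflow.server.arg_utils import ServerArgs
from cacheflow.server.async_llm_server import AsyncLLMServer

parser = ServerArgs.add_cli_args(argparse.ArgumentParser())
args = parser.parse_args(["--model", "facebook/opt-125m"])
server = AsyncLLMServer.from_server_args(ServerArgs.from_cli_args(args))

async def main():
    params = SamplingParams(temperature=0.0, max_tokens=16)
    # generate() now yields RequestOutput objects; callers do their own formatting.
    async for request_output in server.generate("Hello, my name is", params):
        if request_output.finished():
            print(request_output.outputs[0].text)

asyncio.run(main())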
cacheflow/server/llm_server.py

@@ -210,7 +210,8 @@ class LLMServer:
                     # Truncate the output text so that the stop string is
                     # not included in the output.
                     seq.output_text = seq.output_text[:-len(stop_str)]
-                    self.scheduler.free_seq(seq)
+                    self.scheduler.free_seq(
+                        seq, SequenceStatus.FINISHED_STOPPED)
                     stopped = True
                     break
             if stopped:

@@ -218,12 +219,14 @@ class LLMServer:
             # Check if the sequence has reached max_tokens.
             if seq.get_output_len() == sampling_params.max_tokens:
-                self.scheduler.free_seq(seq)
+                self.scheduler.free_seq(
+                    seq, SequenceStatus.FINISHED_LENGTH_CAPPED)
                 continue

             # Check if the sequence has generated the EOS token.
             if not sampling_params.ignore_eos:
                 if seq.get_last_token_id() == self.tokenizer.eos_token_id:
-                    self.scheduler.free_seq(seq)
+                    self.scheduler.free_seq(
+                        seq, SequenceStatus.FINISHED_STOPPED)
                     continue

     def _run_workers(

@@ -238,10 +241,10 @@ class LLMServer:
             executor = getattr(worker, method)
             if self.parallel_config.use_ray:
                 executor = executor.remote

             output = executor(*args, **kwargs)
             all_outputs.append(output)

         if self.parallel_config.use_ray:
             all_outputs = ray.get(all_outputs)
cacheflow/utils.py

 import enum
+import uuid

 import psutil
 import torch

@@ -31,3 +32,7 @@ def get_gpu_memory(gpu: int = 0) -> int:
 def get_cpu_memory() -> int:
     """Returns the total CPU memory of the node in bytes."""
     return psutil.virtual_memory().total
+
+
+def random_uuid() -> str:
+    return str(uuid.uuid4().hex)
gradio_webserver.py → examples/gradio_webserver.py

 import argparse
 import json
-import time

 import gradio as gr
 import requests

@@ -24,9 +23,9 @@ def http_bot(prompt):
 def build_demo():
     with gr.Blocks() as demo:
         gr.Markdown(
-            "# Cacheflow demo\n"
+            "# Cacheflow text completion demo\n"
         )
-        inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER")  # .style(container=False)
+        inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER")
         outputbox = gr.Textbox(label="Output", placeholder="Generated result from the model")
         inputbox.submit(http_bot, [inputbox], [outputbox])
         return demo

@@ -35,9 +34,11 @@ def build_demo():
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--host", type=str, default="localhost")
-    parser.add_argument("--port", type=int, default=10003)
-    parser.add_argument("--model-url", type=str, default="http://localhost:10002/generate")
+    parser.add_argument("--port", type=int, default=8002)
+    parser.add_argument("--model-url", type=str, default="http://localhost:8001/generate")
     args = parser.parse_args()

     demo = build_demo()
-    demo.queue(concurrency_count=100).launch(server_name=args.host, server_port=args.port)
\ No newline at end of file
+    demo.queue(concurrency_count=100).launch(server_name=args.host,
+                                             server_port=args.port,
+                                             share=True)
examples/openai_client.py  (new file, mode 100644)

import openai

openai.api_key = "EMPTY"
openai.api_base = "http://localhost:8000/v1"
model = "facebook/opt-125m"

# list models
models = openai.Model.list()
print(models)

# create a completion
stream = True
completion = openai.Completion.create(
    model=model, prompt="A robot may not injure a human being", echo=False,
    n=2, best_of=3, stream=stream, logprobs=3)

# print the completion
if stream:
    for c in completion:
        print(c)
else:
    print("completion:", completion)
examples/simple_fastapi_client.py  (new file, mode 100644)

import argparse
import requests
import json


def clear_line(n=1):
    LINE_UP = '\033[1A'
    LINE_CLEAR = '\x1b[2K'
    for i in range(n):
        print(LINE_UP, end=LINE_CLEAR, flush=True)


def http_request(prompt: str, api_url: str, n: int = 1):
    headers = {"User-Agent": "Test Client"}
    pload = {
        "prompt": prompt,
        "n": n,
        "use_beam_search": True,
        "temperature": 0.0,
        "max_tokens": 16,
    }
    response = requests.post(api_url, headers=headers, json=pload, stream=True)

    for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False,
                                     delimiter=b"\0"):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            output = data["text"]
            yield output


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8001)
    parser.add_argument("--n", type=int, default=4)
    parser.add_argument("--prompt", type=str, default="San Francisco is a")
    args = parser.parse_args()
    prompt = args.prompt
    api_url = f"http://{args.host}:{args.port}/generate"
    n = args.n

    print(f"Prompt: {prompt}\n", flush=True)
    num_printed_lines = 0
    for h in http_request(prompt, api_url, n):
        clear_line(num_printed_lines)
        num_printed_lines = 0
        for i, line in enumerate(h):
            num_printed_lines += 1
            print(f"Beam candidate {i}: {line}", flush=True)
examples/simple_server.py

 import argparse
+import uuid

 from cacheflow import ServerArgs, LLMServer, SamplingParams

@@ -20,17 +19,19 @@ def main(args: argparse.Namespace):
                        SamplingParams(n=3, best_of=3, use_beam_search=True,
                                       temperature=0.0)),
     ]

-    request_id = 0
-
     # Run the server.
     while True:
         # To test iteration-level scheduling, we add one request at each step.
         if test_prompts:
             prompt, sampling_params = test_prompts.pop(0)
-            server.add_request(str(request_id), prompt, sampling_params)
-            request_id += 1
+            request_id = str(uuid.uuid4().hex[:8])
+            server.add_request(request_id, prompt, sampling_params)

         request_outputs = server.step()
         for request_output in request_outputs:
-            if request_output.done:
+            if request_output.finished():
                 print(request_output)

         if not (server.has_unfinished_requests() or test_prompts):
playground/http_client.py  (deleted, was mode 100644)

import requests
import json


def http_bot():
    prompt = "How are you? I'm fine."

    headers = {"User-Agent": "Test Client"}
    pload = {
        "prompt": prompt,
    }
    response = requests.post("http://localhost:10002", headers=headers,
                             json=pload, stream=True)

    for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False,
                                     delimiter=b"\0"):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            output = data["text"]
            yield output


for h in http_bot():
    print(h, end="", flush=True)
\ No newline at end of file
playground/streaming_fastapi_worker.py  (deleted, was mode 100644)

import argparse
import asyncio
import time
from typing import Union
import json

from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
import uvicorn

app = FastAPI()


async def text_streamer(args):
    context = args["prompt"]
    words = context.split(" ")
    for word in words:
        await asyncio.sleep(1)
        print("word:", word)
        ret = {
            "text": word + " ",
            "error": 0,
        }
        yield (json.dumps(ret) + "\0").encode("utf-8")


@app.post("/")
async def read_root(request: Request):
    args = await request.json()
    return StreamingResponse(text_streamer(args))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=10002)
    args = parser.parse_args()

    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
requirements.txt

@@ -8,3 +8,4 @@ transformers >= 4.28.0  # Required for LLaMA.
 xformers >= 0.0.19
 fastapi
 uvicorn
+pydantic  # Required for OpenAI server.
test_cli_client.py  (deleted, was mode 100644)

import requests
import json


def http_request():
    prompt = "Ion Stoica is a"

    headers = {"User-Agent": "Test Client"}
    pload = {
        "prompt": prompt,
        "n": 4,
        "use_beam_search": True,
        "temperature": 0.0,
    }
    response = requests.post("http://localhost:10002/generate", headers=headers,
                             json=pload, stream=True)

    for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False,
                                     delimiter=b"\0"):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            output = data["text"]
            yield output


for h in http_request():
    print(h, flush=True)