v1.0

24eacbc0 · chenzk · 24eacbc0 · 24eacbc0 · 24eacbc0 · 24eacbc0
Commit 24eacbc0 authored May 09, 2024 by chenzk
20 changed files
--- a/inference/vllm/vllm/entrypoints/openai/api_server.py
+++ b/inference/vllm/vllm/entrypoints/openai/api_server.py
+# Adapted from
+# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/serve/openai_api_server.py
+
+import argparse
+import asyncio
+import json
+import time
+from http import HTTPStatus
+from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union
+
+import fastapi
+import uvicorn
+from fastapi import Request
+from fastapi.exceptions import RequestValidationError
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse, StreamingResponse, Response
+from packaging import version
+
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.entrypoints.openai.protocol import (
+    CompletionRequest, CompletionResponse, CompletionResponseChoice,
+    CompletionResponseStreamChoice, CompletionStreamResponse,
+    ChatCompletionRequest, ChatCompletionResponse,
+    ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
+    ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse,
+    LogProbs, ModelCard, ModelList, ModelPermission, UsageInfo)
+from vllm.logger import init_logger
+from vllm.outputs import RequestOutput
+from vllm.sampling_params import SamplingParams
+from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.utils import random_uuid
+
+try:
+    # fastchat is an optional dependency; get_gen_prompt() checks the flag
+    # set below before using any of these names.
+    import fastchat
+    from fastchat.conversation import Conversation, SeparatorStyle
+    from fastchat.model.model_adapter import get_conversation_template
+    _fastchat_available = True
+except ImportError:
+    # Remember that the import failed so a clear error can be raised later.
+    _fastchat_available = False
+
+# Passed to uvicorn as `timeout_keep_alive` when the server is started.
+TIMEOUT_KEEP_ALIVE = 5  # seconds
+
+logger = init_logger(__name__)
+# `served_model` and `engine` are placeholders here; both are assigned in the
+# `__main__` block below before the server starts handling requests.
+served_model = None
+app = fastapi.FastAPI()
+engine = None
+
+
+def create_error_response(status_code: HTTPStatus,
+                          message: str) -> JSONResponse:
+    """Wrap `message` in an ErrorResponse and return it as JSON.
+
+    The error type is always "invalid_request_error"; the HTTP status of the
+    response is the numeric value of `status_code`.
+    """
+    error = ErrorResponse(message=message, type="invalid_request_error")
+    return JSONResponse(error.dict(), status_code=status_code.value)
+
+
+@app.exception_handler(RequestValidationError)
+async def validation_exception_handler(request, exc):  # pylint: disable=unused-argument
+    """Turn request validation failures into 400 error responses."""
+    detail = str(exc)
+    return create_error_response(HTTPStatus.BAD_REQUEST, detail)
+
+
+async def check_model(request) -> Optional[JSONResponse]:
+    """Check that the request targets the model served by this process.
+
+    Returns None when `request.model` equals `served_model`, otherwise a
+    404 error response naming the unknown model.
+    """
+    if request.model != served_model:
+        return create_error_response(
+            HTTPStatus.NOT_FOUND,
+            f"The model `{request.model}` does not exist.",
+        )
+    return None
+
+
+async def get_gen_prompt(request) -> str:
+    """Render the chat messages of `request` into one prompt string.
+
+    The conversation template is looked up from `request.model` through
+    fastchat. If `request.messages` is already a string it is returned
+    unchanged; otherwise each message is added to a copy of the template
+    and an empty assistant turn is appended before rendering.
+
+    Raises:
+        ModuleNotFoundError: fastchat is not installed.
+        ImportError: the installed fastchat is older than 0.2.23.
+        ValueError: a message has a role other than system, user or
+            assistant.
+    """
+    if not _fastchat_available:
+        raise ModuleNotFoundError(
+            "fastchat is not installed. Please install fastchat to use "
+            "the chat completion and conversation APIs: `$ pip install fschat`"
+        )
+    installed_version = version.parse(fastchat.__version__)
+    if installed_version < version.parse("0.2.23"):
+        raise ImportError(
+            f"fastchat version is low. Current version: {fastchat.__version__} "
+            "Please upgrade fastchat to use: `$ pip install -U fschat`")
+
+    template = get_conversation_template(request.model)
+    # Build a private copy so the template is never modified in place.
+    conv = Conversation(
+        name=template.name,
+        system_template=template.system_template,
+        system_message=template.system_message,
+        roles=template.roles,
+        messages=list(template.messages),
+        offset=template.offset,
+        sep_style=SeparatorStyle(template.sep_style),
+        sep=template.sep,
+        sep2=template.sep2,
+        stop_str=template.stop_str,
+        stop_token_ids=template.stop_token_ids,
+    )
+
+    messages = request.messages
+    if isinstance(messages, str):
+        return messages
+
+    # Position of each speaker inside `conv.roles`.
+    role_index = {"user": 0, "assistant": 1}
+    for message in messages:
+        role = message["role"]
+        if role == "system":
+            conv.system_message = message["content"]
+        elif role in role_index:
+            conv.append_message(conv.roles[role_index[role]],
+                                message["content"])
+        else:
+            raise ValueError(f"Unknown role: {role}")
+
+    # Add a blank message for the assistant.
+    conv.append_message(conv.roles[1], None)
+    return conv.get_prompt()
+
+
+async def check_length(
+    request: Union[ChatCompletionRequest, CompletionRequest],
+    prompt: Optional[str] = None,
+    prompt_ids: Optional[List[int]] = None
+) -> Tuple[List[int], Optional[JSONResponse]]:
+    """Tokenize the prompt and validate it against the context window.
+
+    Exactly one of `prompt` and `prompt_ids` must be given. When
+    `request.max_tokens` is None it is set, in place, to the number of
+    tokens left in the context window after the prompt.
+
+    Args:
+        request: The request whose `max_tokens` is checked (and possibly
+            filled in).
+        prompt: Prompt text, tokenized with the module-level tokenizer.
+        prompt_ids: Already tokenized prompt, used as is.
+
+    Returns:
+        A tuple of the prompt token IDs and either None (the request fits)
+        or a 400 error response describing the overflow.
+    """
+    assert (not (prompt is None and prompt_ids is None)
+            and not (prompt is not None and prompt_ids is not None)
+            ), "Either prompt or prompt_ids should be provided."
+    if prompt_ids is not None:
+        input_ids = prompt_ids
+    else:
+        input_ids = tokenizer(prompt).input_ids
+    token_num = len(input_ids)
+
+    if request.max_tokens is None:
+        # The prompt alone fills (or exceeds) the context window, so there is
+        # no room for a completion. Report that directly instead of deriving
+        # a non-positive max_tokens that would only fail later with an
+        # unrelated sampling-parameter error.
+        if token_num >= max_model_len:
+            return input_ids, create_error_response(
+                HTTPStatus.BAD_REQUEST,
+                f"This model's maximum context length is {max_model_len} "
+                f"tokens. However, you requested {token_num} tokens in the "
+                f"messages. Please reduce the length of the messages.",
+            )
+        request.max_tokens = max_model_len - token_num
+
+    if token_num + request.max_tokens > max_model_len:
+        return input_ids, create_error_response(
+            HTTPStatus.BAD_REQUEST,
+            f"This model's maximum context length is {max_model_len} tokens. "
+            f"However, you requested {request.max_tokens + token_num} tokens "
+            f"({token_num} in the messages, "
+            f"{request.max_tokens} in the completion). "
+            f"Please reduce the length of the messages or completion.",
+        )
+    return input_ids, None
+
+
+@app.get("/health")
+async def health() -> Response:
+    """Liveness probe: reply with an empty 200 response."""
+    return Response(status_code=HTTPStatus.OK.value)
+
+
+@app.get("/v1/models")
+async def show_available_models():
+    """List the served models; there is currently exactly one."""
+    card = ModelCard(id=served_model,
+                     root=served_model,
+                     permission=[ModelPermission()])
+    return ModelList(data=[card])
+
+
+def create_logprobs(token_ids: List[int],
+                    id_logprobs: List[Dict[int, float]],
+                    initial_text_offset: int = 0) -> LogProbs:
+    """Convert per-token logprob dicts into an OpenAI-style LogProbs object.
+
+    `token_ids` and `id_logprobs` are walked in lockstep. Each token's text
+    offset is `initial_text_offset` plus the summed lengths of the token
+    strings that precede it.
+    """
+    result = LogProbs()
+    # Offset that the next token will be recorded at.
+    next_offset = initial_text_offset
+    for token_id, candidates in zip(token_ids, id_logprobs):
+        token = tokenizer.convert_ids_to_tokens(token_id)
+        result.tokens.append(token)
+        result.token_logprobs.append(candidates[token_id])
+        result.text_offset.append(next_offset)
+        next_offset += len(token)
+
+        result.top_logprobs.append({
+            tokenizer.convert_ids_to_tokens(candidate_id): logprob
+            for candidate_id, logprob in candidates.items()
+        })
+    return result
+
+
+@app.post("/v1/chat/completions")
+async def create_chat_completion(request: ChatCompletionRequest,
+                                 raw_request: Request):
+    """Completion API similar to OpenAI's API.
+
+    See  https://platform.openai.com/docs/api-reference/chat/create
+    for the API specification. This API mimics the OpenAI ChatCompletion API.
+
+    NOTE: Currently we do not support the following features:
+        - function_call (Users should implement this by themselves)
+        - logit_bias (to be supported by vLLM engine)
+
+    Args:
+        request: Parsed chat completion request body.
+        raw_request: The underlying HTTP request; polled for client
+            disconnects while a non-streaming result is being generated.
+
+    Returns:
+        A JSON error response when a check fails, a server-sent-event
+        StreamingResponse when `request.stream` is set, and otherwise a
+        ChatCompletionResponse.
+    """
+    logger.info(f"Received chat completion request: {request}")
+
+    error_check_ret = await check_model(request)
+    if error_check_ret is not None:
+        return error_check_ret
+
+    if request.logit_bias is not None and len(request.logit_bias) > 0:
+        # TODO: support logit_bias in vLLM engine.
+        return create_error_response(HTTPStatus.BAD_REQUEST,
+                                     "logit_bias is not currently supported")
+
+    prompt = await get_gen_prompt(request)
+    token_ids, error_check_ret = await check_length(request, prompt=prompt)
+    if error_check_ret is not None:
+        return error_check_ret
+
+    model_name = request.model
+    request_id = f"cmpl-{random_uuid()}"
+    # `created` is a wall-clock Unix timestamp, like the defaults in the
+    # response models; time.monotonic() has an arbitrary reference point.
+    created_time = int(time.time())
+    try:
+        spaces_between_special_tokens = request.spaces_between_special_tokens
+        sampling_params = SamplingParams(
+            n=request.n,
+            presence_penalty=request.presence_penalty,
+            frequency_penalty=request.frequency_penalty,
+            temperature=request.temperature,
+            top_p=request.top_p,
+            stop=request.stop,
+            stop_token_ids=request.stop_token_ids,
+            max_tokens=request.max_tokens,
+            best_of=request.best_of,
+            top_k=request.top_k,
+            ignore_eos=request.ignore_eos,
+            use_beam_search=request.use_beam_search,
+            skip_special_tokens=request.skip_special_tokens,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+        )
+    except ValueError as e:
+        # Invalid sampling parameters are reported as a client error.
+        return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
+
+    result_generator = engine.generate(prompt, sampling_params, request_id,
+                                       token_ids)
+
+    def create_stream_response_json(
+        index: int,
+        text: str,
+        finish_reason: Optional[str] = None,
+        usage: Optional[UsageInfo] = None,
+    ) -> str:
+        """Serialize one streamed chunk for choice `index` to JSON."""
+        choice_data = ChatCompletionResponseStreamChoice(
+            index=index,
+            delta=DeltaMessage(content=text),
+            finish_reason=finish_reason,
+        )
+        response = ChatCompletionStreamResponse(
+            id=request_id,
+            created=created_time,
+            model=model_name,
+            choices=[choice_data],
+        )
+        if usage is not None:
+            response.usage = usage
+        # exclude unset to leave details out of each sse
+        response_json = response.json(exclude_unset=True, ensure_ascii=False)
+
+        return response_json
+
+    async def completion_stream_generator() -> AsyncGenerator[str, None]:
+        """Yield server-sent events for the streaming response."""
+        # First chunk with role
+        for i in range(request.n):
+            choice_data = ChatCompletionResponseStreamChoice(
+                index=i,
+                delta=DeltaMessage(role="assistant"),
+                finish_reason=None,
+            )
+            chunk = ChatCompletionStreamResponse(id=request_id,
+                                                 choices=[choice_data],
+                                                 model=model_name)
+            data = chunk.json(exclude_unset=True, ensure_ascii=False)
+            yield f"data: {data}\n\n"
+
+        # Text already sent per choice; each event carries only the new
+        # suffix of the accumulated output text.
+        previous_texts = [""] * request.n
+        async for res in result_generator:
+            res: RequestOutput
+            for output in res.outputs:
+                i = output.index
+                delta_text = output.text[len(previous_texts[i]):]
+                previous_texts[i] = output.text
+                completion_tokens = len(output.token_ids)
+                response_json = create_stream_response_json(
+                    index=i,
+                    text=delta_text,
+                )
+                yield f"data: {response_json}\n\n"
+                if output.finish_reason is not None:
+                    # Closing chunk for this choice: empty text plus the
+                    # finish reason and the token usage.
+                    prompt_tokens = len(res.prompt_token_ids)
+                    final_usage = UsageInfo(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=prompt_tokens + completion_tokens,
+                    )
+                    response_json = create_stream_response_json(
+                        index=i,
+                        text="",
+                        finish_reason=output.finish_reason,
+                        usage=final_usage,
+                    )
+                    yield f"data: {response_json}\n\n"
+        yield "data: [DONE]\n\n"
+
+    # Streaming response
+    if request.stream:
+        return StreamingResponse(completion_stream_generator(),
+                                 media_type="text/event-stream")
+
+    # Non-streaming response
+    final_res: RequestOutput = None
+    async for res in result_generator:
+        if await raw_request.is_disconnected():
+            # Abort the request if the client disconnects.
+            await engine.abort(request_id)
+            return create_error_response(HTTPStatus.BAD_REQUEST,
+                                         "Client disconnected")
+        final_res = res
+    assert final_res is not None
+    choices = []
+    for output in final_res.outputs:
+        choice_data = ChatCompletionResponseChoice(
+            index=output.index,
+            message=ChatMessage(role="assistant", content=output.text),
+            finish_reason=output.finish_reason,
+        )
+        choices.append(choice_data)
+
+    num_prompt_tokens = len(final_res.prompt_token_ids)
+    num_generated_tokens = sum(
+        len(output.token_ids) for output in final_res.outputs)
+    usage = UsageInfo(
+        prompt_tokens=num_prompt_tokens,
+        completion_tokens=num_generated_tokens,
+        total_tokens=num_prompt_tokens + num_generated_tokens,
+    )
+    response = ChatCompletionResponse(
+        id=request_id,
+        created=created_time,
+        model=model_name,
+        choices=choices,
+        usage=usage,
+    )
+
+    # Every request with `stream` set has already returned above, so no
+    # single-event stream fallback is needed here.
+    return response
+
+
+@app.post("/v1/completions")
+async def create_completion(request: CompletionRequest, raw_request: Request):
+    """Completion API similar to OpenAI's API.
+
+    See https://platform.openai.com/docs/api-reference/completions/create
+    for the API specification. This API mimics the OpenAI Completion API.
+
+    NOTE: Currently we do not support the following features:
+        - echo (since the vLLM engine does not currently support
+          getting the logprobs of prompt tokens)
+        - suffix (the language models we currently support do not support
+          suffix)
+        - logit_bias (to be supported by vLLM engine)
+
+    Args:
+        request: Parsed completion request body. The prompt may be a string,
+            a list of token IDs, or a one-element list holding either.
+        raw_request: The underlying HTTP request; polled for client
+            disconnects while a non-streaming result is being generated.
+
+    Returns:
+        A JSON error response when a check fails, a server-sent-event
+        StreamingResponse when streaming was requested, and otherwise a
+        CompletionResponse.
+    """
+    logger.info(f"Received completion request: {request}")
+
+    error_check_ret = await check_model(request)
+    if error_check_ret is not None:
+        return error_check_ret
+
+    if request.echo:
+        # We do not support echo since the vLLM engine does not
+        # currently support getting the logprobs of prompt tokens.
+        return create_error_response(HTTPStatus.BAD_REQUEST,
+                                     "echo is not currently supported")
+
+    if request.suffix is not None:
+        # The language models we currently support do not support suffix.
+        return create_error_response(HTTPStatus.BAD_REQUEST,
+                                     "suffix is not currently supported")
+
+    if request.logit_bias is not None and len(request.logit_bias) > 0:
+        # TODO: support logit_bias in vLLM engine.
+        return create_error_response(HTTPStatus.BAD_REQUEST,
+                                     "logit_bias is not currently supported")
+
+    model_name = request.model
+    request_id = f"cmpl-{random_uuid()}"
+
+    # Work out whether the prompt is text or a list of token IDs.
+    use_token_ids = False
+    if isinstance(request.prompt, list):
+        if len(request.prompt) == 0:
+            return create_error_response(HTTPStatus.BAD_REQUEST,
+                                         "please provide at least one prompt")
+        first_element = request.prompt[0]
+        if isinstance(first_element, int):
+            use_token_ids = True
+            prompt = request.prompt
+        elif isinstance(first_element, (str, list)):
+            # TODO: handles multiple prompt case in list[list[int]]
+            if len(request.prompt) > 1:
+                return create_error_response(
+                    HTTPStatus.BAD_REQUEST,
+                    "multiple prompts in a batch is not currently supported")
+            use_token_ids = not isinstance(first_element, str)
+            prompt = request.prompt[0]
+    else:
+        prompt = request.prompt
+
+    if use_token_ids:
+        _, error_check_ret = await check_length(request, prompt_ids=prompt)
+    else:
+        token_ids, error_check_ret = await check_length(request, prompt=prompt)
+    if error_check_ret is not None:
+        return error_check_ret
+
+    # `created` is a wall-clock Unix timestamp, like the defaults in the
+    # response models; time.monotonic() has an arbitrary reference point.
+    created_time = int(time.time())
+    try:
+        spaces_between_special_tokens = request.spaces_between_special_tokens
+        sampling_params = SamplingParams(
+            n=request.n,
+            best_of=request.best_of,
+            presence_penalty=request.presence_penalty,
+            frequency_penalty=request.frequency_penalty,
+            temperature=request.temperature,
+            top_p=request.top_p,
+            top_k=request.top_k,
+            stop=request.stop,
+            stop_token_ids=request.stop_token_ids,
+            ignore_eos=request.ignore_eos,
+            max_tokens=request.max_tokens,
+            logprobs=request.logprobs,
+            use_beam_search=request.use_beam_search,
+            skip_special_tokens=request.skip_special_tokens,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+        )
+    except ValueError as e:
+        # Invalid sampling parameters are reported as a client error.
+        return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
+
+    if use_token_ids:
+        result_generator = engine.generate(None,
+                                           sampling_params,
+                                           request_id,
+                                           prompt_token_ids=prompt)
+    else:
+        result_generator = engine.generate(prompt, sampling_params, request_id,
+                                           token_ids)
+
+    # Similar to the OpenAI API, when n != best_of, we do not stream the
+    # results. In addition, we do not stream the results when use beam search.
+    stream = (request.stream
+              and (request.best_of is None or request.n == request.best_of)
+              and not request.use_beam_search)
+
+    def create_stream_response_json(
+        index: int,
+        text: str,
+        logprobs: Optional[LogProbs] = None,
+        finish_reason: Optional[str] = None,
+        usage: Optional[UsageInfo] = None,
+    ) -> str:
+        """Serialize one streamed chunk for choice `index` to JSON."""
+        choice_data = CompletionResponseStreamChoice(
+            index=index,
+            text=text,
+            logprobs=logprobs,
+            finish_reason=finish_reason,
+        )
+        response = CompletionStreamResponse(
+            id=request_id,
+            created=created_time,
+            model=model_name,
+            choices=[choice_data],
+        )
+        if usage is not None:
+            response.usage = usage
+        response_json = response.json(exclude_unset=True, ensure_ascii=False)
+
+        return response_json
+
+    async def completion_stream_generator() -> AsyncGenerator[str, None]:
+        """Yield server-sent events for the streaming response."""
+        # Text and token count already sent per choice; each event carries
+        # only what was generated since the previous one.
+        previous_texts = [""] * request.n
+        previous_num_tokens = [0] * request.n
+        async for res in result_generator:
+            res: RequestOutput
+            for output in res.outputs:
+                i = output.index
+                delta_text = output.text[len(previous_texts[i]):]
+                if request.logprobs is not None:
+                    logprobs = create_logprobs(
+                        output.token_ids[previous_num_tokens[i]:],
+                        output.logprobs[previous_num_tokens[i]:],
+                        len(previous_texts[i]))
+                else:
+                    logprobs = None
+                previous_texts[i] = output.text
+                previous_num_tokens[i] = len(output.token_ids)
+                response_json = create_stream_response_json(
+                    index=i,
+                    text=delta_text,
+                    logprobs=logprobs,
+                )
+                yield f"data: {response_json}\n\n"
+                if output.finish_reason is not None:
+                    # Closing chunk for this choice: empty text plus the
+                    # finish reason and the token usage.
+                    logprobs = (LogProbs()
+                                if request.logprobs is not None else None)
+                    prompt_tokens = len(res.prompt_token_ids)
+                    completion_tokens = len(output.token_ids)
+                    final_usage = UsageInfo(
+                        prompt_tokens=prompt_tokens,
+                        completion_tokens=completion_tokens,
+                        total_tokens=prompt_tokens + completion_tokens,
+                    )
+                    response_json = create_stream_response_json(
+                        index=i,
+                        text="",
+                        logprobs=logprobs,
+                        finish_reason=output.finish_reason,
+                        usage=final_usage,
+                    )
+                    yield f"data: {response_json}\n\n"
+        yield "data: [DONE]\n\n"
+
+    # Streaming response
+    if stream:
+        return StreamingResponse(completion_stream_generator(),
+                                 media_type="text/event-stream")
+
+    # Non-streaming response
+    final_res: RequestOutput = None
+    async for res in result_generator:
+        if await raw_request.is_disconnected():
+            # Abort the request if the client disconnects.
+            await engine.abort(request_id)
+            return create_error_response(HTTPStatus.BAD_REQUEST,
+                                         "Client disconnected")
+        final_res = res
+    assert final_res is not None
+    choices = []
+    for output in final_res.outputs:
+        if request.logprobs is not None:
+            logprobs = create_logprobs(output.token_ids, output.logprobs)
+        else:
+            logprobs = None
+        choice_data = CompletionResponseChoice(
+            index=output.index,
+            text=output.text,
+            logprobs=logprobs,
+            finish_reason=output.finish_reason,
+        )
+        choices.append(choice_data)
+
+    num_prompt_tokens = len(final_res.prompt_token_ids)
+    num_generated_tokens = sum(
+        len(output.token_ids) for output in final_res.outputs)
+    usage = UsageInfo(
+        prompt_tokens=num_prompt_tokens,
+        completion_tokens=num_generated_tokens,
+        total_tokens=num_prompt_tokens + num_generated_tokens,
+    )
+    response = CompletionResponse(
+        id=request_id,
+        created=created_time,
+        model=model_name,
+        choices=choices,
+        usage=usage,
+    )
+
+    if request.stream:
+        # When user requests streaming but we don't stream, we still need to
+        # return a streaming response with a single event.
+        response_json = response.json(ensure_ascii=False)
+
+        async def fake_stream_generator() -> AsyncGenerator[str, None]:
+            yield f"data: {response_json}\n\n"
+            yield "data: [DONE]\n\n"
+
+        return StreamingResponse(fake_stream_generator(),
+                                 media_type="text/event-stream")
+
+    return response
+
+
+if __name__ == "__main__":
+    # Script entry point: parse the server and engine options, build the
+    # engine and tokenizer, then serve `app` with uvicorn.
+    parser = argparse.ArgumentParser(
+        description="vLLM OpenAI-Compatible RESTful API server.")
+    parser.add_argument("--host", type=str, default=None, help="host name")
+    parser.add_argument("--port", type=int, default=8000, help="port number")
+    parser.add_argument("--allow-credentials",
+                        action="store_true",
+                        help="allow credentials")
+    # The three --allowed-* options are parsed with json.loads, so their
+    # values must be given as JSON on the command line.
+    parser.add_argument("--allowed-origins",
+                        type=json.loads,
+                        default=["*"],
+                        help="allowed origins")
+    parser.add_argument("--allowed-methods",
+                        type=json.loads,
+                        default=["*"],
+                        help="allowed methods")
+    parser.add_argument("--allowed-headers",
+                        type=json.loads,
+                        default=["*"],
+                        help="allowed headers")
+    parser.add_argument("--served-model-name",
+                        type=str,
+                        default=None,
+                        help="The model name used in the API. If not "
+                        "specified, the model name will be the same as "
+                        "the huggingface name.")
+
+    # Let the engine register its own command-line options on the parser.
+    parser = AsyncEngineArgs.add_cli_args(parser)
+    args = parser.parse_args()
+
+    app.add_middleware(
+        CORSMiddleware,
+        allow_origins=args.allowed_origins,
+        allow_credentials=args.allow_credentials,
+        allow_methods=args.allowed_methods,
+        allow_headers=args.allowed_headers,
+    )
+
+    logger.info(f"args: {args}")
+
+    # The assignments below bind the module-level names that the request
+    # handlers read: served_model, engine, max_model_len and tokenizer.
+    if args.served_model_name is not None:
+        served_model = args.served_model_name
+    else:
+        served_model = args.model
+
+    engine_args = AsyncEngineArgs.from_cli_args(args)
+    engine = AsyncLLMEngine.from_engine_args(engine_args)
+    # get_model_config() is a coroutine; run it to completion here, before
+    # uvicorn starts.
+    engine_model_config = asyncio.run(engine.get_model_config())
+    max_model_len = engine_model_config.max_model_len
+
+    # A separate tokenizer to map token IDs to strings.
+    tokenizer = get_tokenizer(
+        engine_model_config.tokenizer,
+        tokenizer_mode=engine_model_config.tokenizer_mode,
+        trust_remote_code=engine_model_config.trust_remote_code)
+
+    uvicorn.run(app,
+                host=args.host,
+                port=args.port,
+                log_level="info",
+                timeout_keep_alive=TIMEOUT_KEEP_ALIVE)
--- a/inference/vllm/vllm/entrypoints/openai/protocol.py
+++ b/inference/vllm/vllm/entrypoints/openai/protocol.py
+# Adapted from
+# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
+import time
+from typing import Dict, List, Literal, Optional, Union
+
+from pydantic import BaseModel, Field
+
+from vllm.utils import random_uuid
+
+
+class ErrorResponse(BaseModel):
+    # Error payload sent back to clients. `message` and `type` are required;
+    # `param` and `code` are optional and default to None.
+    object: str = "error"
+    message: str
+    type: str
+    param: Optional[str] = None
+    code: Optional[str] = None
+
+
+class ModelPermission(BaseModel):
+    # Permission entry attached to a ModelCard. `id` and `created` are
+    # generated per instance; every other field has a fixed default.
+    id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
+    object: str = "model_permission"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    allow_create_engine: bool = False
+    allow_sampling: bool = True
+    allow_logprobs: bool = True
+    allow_search_indices: bool = False
+    allow_view: bool = True
+    allow_fine_tuning: bool = False
+    organization: str = "*"
+    group: Optional[str] = None
+    # A boolean flag, annotated as such so the type matches the default.
+    is_blocking: bool = False
+
+
+class ModelCard(BaseModel):
+    # Description of one model. Only `id` is required; `created` defaults to
+    # the time the instance is built and `permission` to an empty list.
+    id: str
+    object: str = "model"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    owned_by: str = "vllm"
+    root: Optional[str] = None
+    parent: Optional[str] = None
+    permission: List[ModelPermission] = Field(default_factory=list)
+
+
+class ModelList(BaseModel):
+    # Container for a list of ModelCard entries; `data` defaults to empty.
+    object: str = "list"
+    data: List[ModelCard] = Field(default_factory=list)
+
+
+class UsageInfo(BaseModel):
+    # Token counts for a request; every counter defaults to 0.
+    prompt_tokens: int = 0
+    total_tokens: int = 0
+    completion_tokens: Optional[int] = 0
+
+
+class ChatCompletionRequest(BaseModel):
+    # Request body for chat completions. Only `model` and `messages` are
+    # required; everything else has a default.
+    model: str
+    # Either a ready-made prompt string or a list of message dicts.
+    messages: Union[str, List[Dict[str, str]]]
+    temperature: Optional[float] = 0.7
+    top_p: Optional[float] = 1.0
+    n: Optional[int] = 1
+    # No default limit here, unlike CompletionRequest which defaults to 16.
+    max_tokens: Optional[int] = None
+    stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
+    stream: Optional[bool] = False
+    presence_penalty: Optional[float] = 0.0
+    frequency_penalty: Optional[float] = 0.0
+    logit_bias: Optional[Dict[str, float]] = None
+    user: Optional[str] = None
+    # Additional parameters supported by vLLM
+    best_of: Optional[int] = None
+    top_k: Optional[int] = -1
+    ignore_eos: Optional[bool] = False
+    use_beam_search: Optional[bool] = False
+    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
+    skip_special_tokens: Optional[bool] = True
+    spaces_between_special_tokens: Optional[bool] = True
+
+
+class CompletionRequest(BaseModel):
+    # Request body for text completions. Only `model` and `prompt` are
+    # required; everything else has a default.
+    model: str
+    # a string, array of strings, array of tokens, or array of token arrays
+    prompt: Union[List[int], List[List[int]], str, List[str]]
+    suffix: Optional[str] = None
+    max_tokens: Optional[int] = 16
+    temperature: Optional[float] = 1.0
+    top_p: Optional[float] = 1.0
+    n: Optional[int] = 1
+    stream: Optional[bool] = False
+    logprobs: Optional[int] = None
+    echo: Optional[bool] = False
+    stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
+    presence_penalty: Optional[float] = 0.0
+    frequency_penalty: Optional[float] = 0.0
+    best_of: Optional[int] = None
+    logit_bias: Optional[Dict[str, float]] = None
+    user: Optional[str] = None
+    # Additional parameters supported by vLLM
+    top_k: Optional[int] = -1
+    ignore_eos: Optional[bool] = False
+    use_beam_search: Optional[bool] = False
+    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
+    skip_special_tokens: Optional[bool] = True
+    spaces_between_special_tokens: Optional[bool] = True
+
+
+class LogProbs(BaseModel):
+    # Four lists that all start empty and are appended to by the code that
+    # builds the logprobs.
+    text_offset: List[int] = Field(default_factory=list)
+    token_logprobs: List[Optional[float]] = Field(default_factory=list)
+    tokens: List[str] = Field(default_factory=list)
+    top_logprobs: List[Optional[Dict[str,
+                                     float]]] = Field(default_factory=list)
+
+
+class CompletionResponseChoice(BaseModel):
+    # One generated completion in a non-streaming response.
+    index: int
+    text: str
+    logprobs: Optional[LogProbs] = None
+    # Restricted to "stop", "length" or None by the Literal annotation.
+    finish_reason: Optional[Literal["stop", "length"]] = None
+
+
+class CompletionResponse(BaseModel):
+    # Full (non-streaming) completion response. `id` and `created` are
+    # generated when not supplied; `model`, `choices` and `usage` are required.
+    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[CompletionResponseChoice]
+    usage: UsageInfo
+
+
+class CompletionResponseStreamChoice(BaseModel):
+    # One choice inside a streamed completion chunk; it has the same fields
+    # as CompletionResponseChoice.
+    index: int
+    text: str
+    logprobs: Optional[LogProbs] = None
+    finish_reason: Optional[Literal["stop", "length"]] = None
+
+
+class CompletionStreamResponse(BaseModel):
+    # One chunk of a streamed completion. `id` and `created` are generated
+    # when not supplied; `model` and `choices` are required.
+    id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
+    object: str = "text_completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[CompletionResponseStreamChoice]
+    # Explicit None default, matching ChatCompletionStreamResponse, so the
+    # field stays optional regardless of how a bare Optional is treated.
+    usage: Optional[UsageInfo] = None
+
+
+class ChatMessage(BaseModel):
+    # A complete chat message; both fields are required.
+    role: str
+    content: str
+
+
+class ChatCompletionResponseChoice(BaseModel):
+    # One generated message in a non-streaming chat response.
+    index: int
+    message: ChatMessage
+    # Restricted to "stop", "length" or None by the Literal annotation.
+    finish_reason: Optional[Literal["stop", "length"]] = None
+
+
+class ChatCompletionResponse(BaseModel):
+    # Full (non-streaming) chat response. `id` and `created` are generated
+    # when not supplied; `model`, `choices` and `usage` are required.
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: str = "chat.completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[ChatCompletionResponseChoice]
+    usage: UsageInfo
+
+
+class DeltaMessage(BaseModel):
+    # Partial chat message for streamed chunks; either field may be absent.
+    role: Optional[str] = None
+    content: Optional[str] = None
+
+
+class ChatCompletionResponseStreamChoice(BaseModel):
+    # One choice inside a streamed chat chunk; `delta` carries the partial
+    # message for this chunk.
+    index: int
+    delta: DeltaMessage
+    finish_reason: Optional[Literal["stop", "length"]] = None
+
+
+class ChatCompletionStreamResponse(BaseModel):
+    # Response body for one streamed chat-completion message (object type
+    # "chat.completion.chunk").
+    # `id` and `created` are generated per instance when not supplied:
+    # a "chatcmpl-"-prefixed random_uuid() and the current time in whole
+    # seconds.
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: str = "chat.completion.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[ChatCompletionResponseStreamChoice]
+    # Optional; explicitly defaults to None.
+    usage: Optional[UsageInfo] = Field(
+        default=None, description="data about request and response")
--- a/inference/vllm/vllm/layernorm_ops.cpython-310-x86_64-linux-gnu.so
+++ b/inference/vllm/vllm/layernorm_ops.cpython-310-x86_64-linux-gnu.so
--- a/inference/vllm/vllm/logger.py
+++ b/inference/vllm/vllm/logger.py
+# Adapted from
+# https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py
+"""Logging configuration for vLLM."""
+import logging
+import sys
+
+# Record layout: level name, timestamp, "file:line]", then the message.
+_FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
+# strftime layout used for %(asctime)s: month-day hour:minute:second.
+_DATE_FORMAT = "%m-%d %H:%M:%S"
+
+
+class NewLineFormatter(logging.Formatter):
+    """Formatter that repeats the log prefix on every continuation line.
+
+    The record is formatted normally first. When the message is non-empty,
+    each "\n" in the formatted text is replaced by "\r\n" followed by the
+    text that precedes the message, so multi-line messages stay aligned.
+    """
+
+    def __init__(self, fmt, datefmt=None):
+        super().__init__(fmt, datefmt)
+
+    def format(self, record):
+        formatted = super().format(record)
+        if record.message == "":
+            # Nothing to locate the prefix with; return the text untouched.
+            return formatted
+        # Whatever comes before the first occurrence of the message is the
+        # prefix to repeat.
+        prefix, _, _ = formatted.partition(record.message)
+        return formatted.replace("\n", "\r\n" + prefix)
+
+
+# Logger named "vllm"; its level, handler and propagation are configured in
+# _setup_logger().
+_root_logger = logging.getLogger("vllm")
+# Shared handler; None until _setup_logger() creates it, then reused by
+# init_logger().
+_default_handler = None
+
+
+def _setup_logger():
+    """Configure the "vllm" logger and the shared stdout handler.
+
+    The logger itself accepts DEBUG and above, while the handler only emits
+    INFO and above. The handler is created on the first call only; its
+    formatter is replaced on every call.
+    """
+    global _default_handler
+    _root_logger.setLevel(logging.DEBUG)
+    # Setting this will avoid the message
+    # being propagated to the parent logger.
+    _root_logger.propagate = False
+    if _default_handler is None:
+        stdout_handler = logging.StreamHandler(sys.stdout)
+        stdout_handler.flush = sys.stdout.flush  # type: ignore
+        stdout_handler.setLevel(logging.INFO)
+        _root_logger.addHandler(stdout_handler)
+        _default_handler = stdout_handler
+    _default_handler.setFormatter(
+        NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT))
+
+
+# The logger is initialized when the module is imported.
+# A module body normally runs only once per process, on first import, so
+# this call is not repeated by later imports.
+# NOTE(review): that guarantee comes from Python's import machinery (module
+# cache plus import lock), not from the GIL as previously stated.
+_setup_logger()
+
+
+def init_logger(name: str):
+    """Return the logger called `name`, wired to the shared handler.
+
+    The logger is set to DEBUG, gets the module-level default handler
+    attached, and has propagation to parent loggers switched off, mirroring
+    the configuration applied to the "vllm" logger.
+    """
+    named_logger = logging.getLogger(name)
+    named_logger.propagate = False
+    named_logger.setLevel(logging.DEBUG)
+    named_logger.addHandler(_default_handler)
+    return named_logger
--- a/inference/vllm/vllm/model_executor/__init__.py
+++ b/inference/vllm/vllm/model_executor/__init__.py
+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.model_loader import get_model
+from vllm.model_executor.utils import set_random_seed
+
+# Names exported by `from vllm.model_executor import *`; each one is
+# imported above from its defining submodule.
+__all__ = [
+    "InputMetadata",
+    "get_model",
+    "set_random_seed",
+]
--- a/inference/vllm/vllm/model_executor/__pycache__/__init__.cpython-310.pyc
+++ b/inference/vllm/vllm/model_executor/__pycache__/__init__.cpython-310.pyc
--- a/inference/vllm/vllm/model_executor/__pycache__/input_metadata.cpython-310.pyc
+++ b/inference/vllm/vllm/model_executor/__pycache__/input_metadata.cpython-310.pyc
--- a/inference/vllm/vllm/model_executor/__pycache__/model_loader.cpython-310.pyc
+++ b/inference/vllm/vllm/model_executor/__pycache__/model_loader.cpython-310.pyc
--- a/inference/vllm/vllm/model_executor/__pycache__/utils.cpython-310.pyc
+++ b/inference/vllm/vllm/model_executor/__pycache__/utils.cpython-310.pyc
--- a/inference/vllm/vllm/model_executor/__pycache__/weight_utils.cpython-310.pyc
+++ b/inference/vllm/vllm/model_executor/__pycache__/weight_utils.cpython-310.pyc
--- a/inference/vllm/vllm/model_executor/input_metadata.py
+++ b/inference/vllm/vllm/model_executor/input_metadata.py
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from xformers.ops import AttentionBias
+
+from vllm.sampling_params import SamplingParams, SamplingType
+from vllm.sequence import SequenceData
+
+
+class InputMetadata:
+    """Metadata for input sequences. Used for PagedAttention.
+
+    Args:
+        seq_groups: List of (seq_ids, sampling_params).
+        seq_data: Seq_id -> SequenceData.
+        prompt_lens: Lengths of prompts.
+        slot_mapping: The address to write the new KV to of each token.
+        context_lens: the length of attention context for each generation token.
+        max_context_len: The maximum context length.
+        block_tables: The block tables. (Seq id -> list of physical block)
+        selected_token_indices: Tensor stored unchanged on the instance.
+            NOTE(review): presumably the positions whose outputs are
+            sampled; confirm against the caller.
+        categorized_sample_indices: SamplingType -> tensor mapping, stored
+            unchanged on the instance.
+        sliding_window: Optional window size. When given, `to_cache` is
+            built as an int32 index tensor on the device of `slot_mapping`;
+            when None, `to_cache` stays None.
+
+    Derived attributes:
+        max_prompt_len: Largest prompt length, or 0 without prompts.
+        num_prompts: Number of prompts.
+        num_prompt_tokens: num_prompts * max_prompt_len, i.e. the count
+            with every prompt taken at the maximum length.
+        num_generation_tokens: First dimension of `context_lens`.
+        max_num_blocks_per_seq: Second dimension of `block_tables`, or 0
+            when `block_tables` is empty.
+        attn_bias: Starts as None; set during the first attention op.
+    """
+
+    def __init__(
+        self,
+        seq_groups: List[Tuple[List[int], SamplingParams]],
+        seq_data: Dict[int, SequenceData],
+        prompt_lens: List[int],
+        slot_mapping: torch.Tensor,
+        context_lens: torch.Tensor,
+        max_context_len: int,
+        block_tables: torch.Tensor,
+        selected_token_indices: torch.Tensor,
+        categorized_sample_indices: Dict[SamplingType, torch.Tensor],
+        sliding_window: Optional[int] = None,
+    ) -> None:
+        self.seq_groups = seq_groups
+        self.seq_data = seq_data
+        self.prompt_lens = prompt_lens
+        self.slot_mapping = slot_mapping
+        self.context_lens = context_lens
+        self.max_context_len = max_context_len
+        self.block_tables = block_tables
+        self.selected_token_indices = selected_token_indices
+        self.categorized_sample_indices = categorized_sample_indices
+
+        self.max_prompt_len = max(prompt_lens) if prompt_lens else 0
+        self.to_cache = None
+        if sliding_window is not None:
+            # We need to keep the positions of sliding windows within
+            # the key / value tables, this is helpful to know which
+            # elements we need to cache.
+            to_cache, start_idx = [], 0
+            for prompt_len in self.prompt_lens:
+                # Keep only the last `sliding_window` positions of this
+                # prompt (all of them when the prompt is shorter).
+                to_cache.extend(
+                    range(
+                        start_idx + max(0, prompt_len - sliding_window),
+                        start_idx + prompt_len,
+                    ))
+                # Each prompt occupies max_prompt_len positions, so the
+                # next prompt starts one full stride later.
+                start_idx += self.max_prompt_len
+            # Every position after the prompt region is kept.
+            to_cache.extend(range(start_idx, slot_mapping.shape[0]))
+            self.to_cache = torch.tensor(to_cache,
+                                         dtype=torch.int32,
+                                         device=self.slot_mapping.device)
+
+        self.num_prompts = len(prompt_lens)
+        self.num_prompt_tokens = self.num_prompts * self.max_prompt_len
+        self.num_generation_tokens = context_lens.shape[0]
+        if block_tables.numel() > 0:
+            self.max_num_blocks_per_seq = block_tables.shape[1]
+        else:
+            self.max_num_blocks_per_seq = 0
+        # There must be one block-table row per generation token.
+        assert block_tables.shape[0] == self.num_generation_tokens
+
+        # Set during the execution of the first attention op.
+        self.attn_bias: Optional[AttentionBias] = None
+
+    def __repr__(self) -> str:
+        """Return a one-line summary of the most useful metadata fields."""
+        # Print only useful metadata. The single closing parenthesis sits
+        # after the last field so the output is balanced.
+        return (
+            f'InputMetadata('
+            f'num_prompt_tokens={self.num_prompt_tokens}, '
+            f'num_prompts={self.num_prompts}, '
+            f'prompt_lens={self.prompt_lens}, '
+            f'num_generation_tokens={self.num_generation_tokens}, '
+            f'context_lens={self.context_lens}, '
+            f'max_context_len={self.max_context_len}, '
+            f'max_num_blocks_per_seq={self.max_num_blocks_per_seq}, '
+            f'block_tables={self.block_tables}, '
+            f'selected_token_indices={self.selected_token_indices}, '
+            f'categorized_sample_indices={self.categorized_sample_indices}, '
+            f'slot_mapping={self.slot_mapping})')
--- a/inference/vllm/vllm/model_executor/layers/__init__.py
+++ b/inference/vllm/vllm/model_executor/layers/__init__.py
--- a/inference/vllm/vllm/model_executor/layers/__pycache__/__init__.cpython-310.pyc
+++ b/inference/vllm/vllm/model_executor/layers/__pycache__/__init__.cpython-310.pyc
--- a/inference/vllm/vllm/model_executor/layers/__pycache__/activation.cpython-310.pyc
+++ b/inference/vllm/vllm/model_executor/layers/__pycache__/activation.cpython-310.pyc
--- a/inference/vllm/vllm/model_executor/layers/__pycache__/attention.cpython-310.pyc
+++ b/inference/vllm/vllm/model_executor/layers/__pycache__/attention.cpython-310.pyc
--- a/inference/vllm/vllm/model_executor/layers/__pycache__/layernorm.cpython-310.pyc
+++ b/inference/vllm/vllm/model_executor/layers/__pycache__/layernorm.cpython-310.pyc
--- a/inference/vllm/vllm/model_executor/layers/__pycache__/linear.cpython-310.pyc
+++ b/inference/vllm/vllm/model_executor/layers/__pycache__/linear.cpython-310.pyc
--- a/inference/vllm/vllm/model_executor/layers/__pycache__/rotary_embedding.cpython-310.pyc
+++ b/inference/vllm/vllm/model_executor/layers/__pycache__/rotary_embedding.cpython-310.pyc
--- a/inference/vllm/vllm/model_executor/layers/__pycache__/sampler.cpython-310.pyc
+++ b/inference/vllm/vllm/model_executor/layers/__pycache__/sampler.cpython-310.pyc
--- a/inference/vllm/vllm/model_executor/layers/__pycache__/vocab_parallel_embedding.cpython-310.pyc
+++ b/inference/vllm/vllm/model_executor/layers/__pycache__/vocab_parallel_embedding.cpython-310.pyc