# Copyright (c) OpenMMLab. All rights reserved.
from typing import List


class SubCliServe(object):
    """Serve LLMs and interact on terminal or web UI."""

    def gradio(self,
               model_path_or_server: str,
               server_name: str = 'localhost',
               server_port: int = 6006,
               batch_size: int = 32,
               tp: int = 1,
               restful_api: bool = False):
        """Serve LLMs with web ui using gradio.

        Example 1:
            lmdeploy serve gradio ./workspace

        Example 2:
            lmdeploy serve gradio http://localhost:23333
            --server_name localhost
            --server_port 6006
            --restful_api True

        Example 3:
            lmdeploy serve gradio ${triton_server_ip_address}:33337

        Args:
            model_path_or_server (str): the path of the deployed model or the
                tritonserver URL or restful api URL. The former is for directly
                running service with gradio. The latter is for running with
                tritonserver by default. If the input URL is restful api.
                Please enable another flag `restful_api`.
            server_name (str): the ip address of the gradio server
            server_port (int): the port of the gradio server
            batch_size (int): the batch size for running Turbomind directly
            tp (int): the degree of tensor parallelism for Turbomind
            restful_api (bool): whether `model_path_or_server` is a restful
                api URL
        """
        from lmdeploy.serve.gradio.app import run
        run(model_path_or_server,
            server_name=server_name,
            server_port=server_port,
            batch_size=batch_size,
            tp=tp,
            restful_api=restful_api)
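
    # Usage sketch for the command above (kept as comments so the module stays
    # importable). The CLI invocations mirror the docstring examples; the
    # `./workspace` path and the URL are placeholders, not real deployments:
    #
    #   lmdeploy serve gradio ./workspace
    #   lmdeploy serve gradio http://localhost:23333 --restful_api True
    #
    # The same entry point can be driven programmatically with the import used
    # in the method body:
    #
    #   from lmdeploy.serve.gradio.app import run
    #   run('./workspace', server_name='localhost', server_port=6006)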

    def api_server(self,
                   model_path: str,
                   server_name: str = 'localhost',
                   server_port: int = 23333,
                   instance_num: int = 32,
                   tp: int = 1,
                   allow_origins: List[str] = ['*'],
                   allow_credentials: bool = True,
                   allow_methods: List[str] = ['*'],
                   allow_headers: List[str] = ['*']):
        """Serve LLMs with restful api using fastapi.

        Args:
            model_path (str): the path of the deployed model
            server_name (str): host ip for serving
            server_port (int): server port
            instance_num (int): the number of turbomind model instances
            tp (int): the degree of tensor parallelism
            allow_origins (List[str]): a list of allowed origins for CORS
            allow_credentials (bool): whether to allow credentials for CORS
            allow_methods (List[str]): a list of allowed HTTP methods for CORS
            allow_headers (List[str]): a list of allowed HTTP headers for CORS
        """
        from lmdeploy.serve.openai.api_server import main as run_api_server

        run_api_server(model_path,
                       server_name=server_name,
                       server_port=server_port,
                       instance_num=instance_num,
                       tp=tp,
                       allow_origins=allow_origins,
                       allow_credentials=allow_credentials,
                       allow_methods=allow_methods,
                       allow_headers=allow_headers)
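
    # Usage sketch (comments only). Assuming the subcommand name mirrors the
    # method name, as in the `gradio` examples above, a typical launch is:
    #
    #   lmdeploy serve api_server ./workspace --server_port 23333 --tp 1
    #
    # or, programmatically, with the same entry point the method body imports
    # (the `./workspace` path is a placeholder):
    #
    #   from lmdeploy.serve.openai.api_server import main as run_api_server
    #   run_api_server('./workspace', server_name='0.0.0.0', server_port=23333)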

    def api_client(self, restful_api_url: str, session_id: int = 0):
        """Interact with restful api server in terminal.

        Args:
            restful_api_url (str): the URL of the restful api server
            session_id (int): the unique identifier of a session
        """
        from lmdeploy.serve.openai.api_client import main as run_api_client
        run_api_client(restful_api_url, session_id=session_id)
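
    # Usage sketch (comments only). The URL is a placeholder pointing at an
    # api_server started as above; the subcommand form is assumed to mirror
    # the method name:
    #
    #   lmdeploy serve api_client http://localhost:23333
    #
    #   from lmdeploy.serve.openai.api_client import main as run_api_client
    #   run_api_client('http://localhost:23333', session_id=0)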

    def triton_client(self,
                      tritonserver_addr: str,
                      session_id: int = 1,
                      cap: str = 'chat',
                      stream_output: bool = True,
                      **kwargs):
        """Interact with Triton Server using gRPC protocol.

        Args:
            tritonserver_addr (str): the address of the triton inference
                server in "ip:port" format
            session_id (int): the unique identifier of a session
            cap (str): the capability of the model. For example, codellama
                supports capabilities among ['completion', 'infill',
                'instruct', 'python']
            stream_output (bool): whether to stream the output
            **kwargs (dict): other arguments for initializing model's
                chat template
        """

        from lmdeploy.serve.client import main as run_triton_client

        run_triton_client(
            tritonserver_addr,
            session_id=session_id,
            cap=cap,
            stream_output=stream_output,
            **kwargs,
        )
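
    # Usage sketch (comments only). The address below is a placeholder in the
    # same spirit as the `gradio` docstring; the subcommand form is assumed to
    # mirror the method name:
    #
    #   lmdeploy serve triton_client ${triton_server_ip_address}:33337 --cap chat
    #
    #   from lmdeploy.serve.client import main as run_triton_client
    #   run_triton_client('0.0.0.0:33337', session_id=1, cap='chat')


# A hedged sketch of how this subcommand class could be exposed on the command
# line; the real wiring lives elsewhere in the package, and `fire` is used
# here purely as an illustrative assumption:
#
#   import fire
#
#   if __name__ == '__main__':
#       fire.Fire(SubCliServe)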