Unverified Commit 5c9aa51a authored by AllentDan, committed by GitHub

Add api.py (#805)

* add api.py

* update serve function

* add model_name arg and provide examples

* docstring

* remove service_available

* type hint
parent 16b4b823
# Copyright (c) OpenMMLab. All rights reserved.
from lmdeploy.api import client, pipeline, serve
__all__ = ['pipeline', 'serve', 'client']
def bootstrap():
import os
import sys
......
# Copyright (c) OpenMMLab. All rights reserved.
import os
from typing import Optional
def pipeline(model_path: str,
model_name: Optional[str] = None,
instance_num: int = 32,
tp: int = 1,
log_level='ERROR',
**kwargs):
"""
Args:
model_path (str): the path of a model.
It could be one of the following options:
- i) A local directory path of a turbomind model which is
converted by the `lmdeploy convert` command or downloaded
from ii) and iii).
- ii) The model_id of a lmdeploy-quantized model hosted
inside a model repo on huggingface.co, such as
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when model_path is a pytorch model on
huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat", "baichuan-inc/Baichuan2-7B-Chat" and so on.
instance_num (int): the number of engine instances to be created
tp (int): the degree of tensor parallelism
log_level (str): the log level; one of [CRITICAL, ERROR, WARNING, INFO, DEBUG]
Examples:
>>> import lmdeploy
>>> pipe = lmdeploy.pipeline('InternLM/internlm-chat-7b-v1_1', 'internlm-chat-7b')
>>> response = pipe(['hi','say this is a test'])
>>> print(response)
""" # noqa E501
from lmdeploy.serve.async_engine import AsyncEngine
os.environ['TM_LOG_LEVEL'] = log_level
return AsyncEngine(model_path,
model_name=model_name,
instance_num=instance_num,
tp=tp,
**kwargs)
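# A minimal usage sketch of the pipeline entry point above. The generation kwargs
# (request_output_len, top_p) mirror AsyncEngine.batch_infer; the exact values and
# the model id below are illustrative assumptions, not recommended defaults.
# >>> import lmdeploy
# >>> pipe = lmdeploy.pipeline('InternLM/internlm-chat-7b', 'internlm-chat-7b')
# >>> print(pipe(['hi', 'say this is a test'], request_output_len=256, top_p=0.8))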
def serve(model_path: str,
model_name: Optional[str] = None,
server_name: str = '0.0.0.0',
server_port: int = 23333,
instance_num: int = 64,
tp: int = 1,
log_level: str = 'ERROR',
**kwargs):
"""This will run the api_server in a subprocess.
Args:
model_path (str): the path of a model.
It could be one of the following options:
- i) A local directory path of a turbomind model which is
converted by the `lmdeploy convert` command or downloaded
from ii) and iii).
- ii) The model_id of a lmdeploy-quantized model hosted
inside a model repo on huggingface.co, such as
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when model_path is a pytorch model on
huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on.
server_name (str): host ip for serving
server_port (int): server port
instance_num (int): number of instances of turbomind model
tp (int): the degree of tensor parallelism
log_level (str): the log level; one of [CRITICAL, ERROR, WARNING, INFO, DEBUG]
Returns:
APIClient: a client to chat with the launched api_server.
Examples:
>>> import lmdeploy
>>> client = lmdeploy.serve('InternLM/internlm-chat-7b-v1_1', 'internlm-chat-7b')
>>> for output in client.chat('hi', 1):
... print(output)
""" # noqa E501
import time
from multiprocessing import Process
from lmdeploy.serve.openai.api_client import APIClient
from lmdeploy.serve.openai.api_server import serve
task = Process(target=serve,
args=(model_path, ),
kwargs=dict(model_name=model_name,
server_name=server_name,
server_port=server_port,
instance_num=instance_num,
tp=tp,
log_level=log_level,
**kwargs))
task.start()
client = APIClient(f'http://{server_name}:{server_port}')
# Poll the server once per second until it answers the model-list request,
# then hand the ready client back to the caller.
while True:
time.sleep(1)
try:
client.available_models
return client
except: # noqa
pass
def client(api_server_url: str = 'http://0.0.0.0:23333', **kwargs):
"""
Args:
api_server_url (str): the address of the api_server in the form
'http://<ip>:<port>'
Returns:
APIClient: a client to interact with the api_server.
"""
from lmdeploy.serve.openai.api_client import APIClient
return APIClient(api_server_url, **kwargs)
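A minimal sketch of querying an already running api_server through the `client` helper above. The url, prompt and session id are illustrative; `available_models` and `chat` follow the usage shown in the `serve` docstring.

import lmdeploy

api_client = lmdeploy.client('http://0.0.0.0:23333')
print(api_client.available_models)        # models exposed by the server
for output in api_client.chat('hi', 1):   # (prompt, session_id)
    print(output)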
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List
from typing import List, Optional
class SubCliServe(object):
......@@ -46,6 +46,7 @@ class SubCliServe(object):
def api_server(self,
model_path: str,
model_name: Optional[str] = None,
server_name: str = '0.0.0.0',
server_port: int = 23333,
instance_num: int = 64,
......@@ -58,7 +59,21 @@ class SubCliServe(object):
"""Serve LLMs with restful api using fastapi.
Args:
model_path (str): the path of the deployed model
model_path (str): the path of a model.
It could be one of the following options:
- i) A local directory path of a turbomind model which is
converted by the `lmdeploy convert` command or
downloaded from ii) and iii).
- ii) The model_id of a lmdeploy-quantized model hosted
inside a model repo on huggingface.co, such as
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when model_path is a pytorch model on
huggingface.co, such as "InternLM/internlm-chat-7b"
server_name (str): host ip for serving
server_port (int): server port
instance_num (int): number of instances of turbomind model
......@@ -69,9 +84,10 @@ class SubCliServe(object):
allow_headers (List[str]): a list of allowed HTTP headers for CORS
kwargs (dict): extra params to init the api server
"""
from lmdeploy.serve.openai.api_server import main as run_api_server
from lmdeploy.serve.openai.api_server import serve as run_api_server
run_api_server(model_path,
model_name=model_name,
server_name=server_name,
server_port=server_port,
instance_num=instance_num,
......
......@@ -3,7 +3,7 @@ import asyncio
import dataclasses
import random
from contextlib import contextmanager
from typing import List, Literal, Optional
from typing import List, Literal, Optional, Union
@dataclasses.dataclass
......@@ -20,14 +20,35 @@ class AsyncEngine:
"""Async inference engine. Maintaining a bunch of tm_model instances.
Args:
model_path (str): the path of the deployed model
model_path (str): the path of a model.
It could be one of the following options:
- i) A local directory path of a turbomind model which is
converted by the `lmdeploy convert` command or downloaded
from ii) and iii).
- ii) The model_id of a lmdeploy-quantized model hosted
inside a model repo on huggingface.co, such as
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when model_path is a pytorch model on
huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on.
instance_num (int): instance numbers to be created
tp (int): tensor parallel
"""
def __init__(self, model_path, instance_num=32, tp=1, **kwargs) -> None:
def __init__(self,
model_path: str,
model_name: Optional[str] = None,
instance_num: int = 32,
tp: int = 1,
**kwargs) -> None:
from lmdeploy import turbomind as tm
self.tm_model = tm.TurboMind.from_pretrained(model_path,
model_name=model_name,
tp=tp,
**kwargs)
self.tokenizer = self.tm_model.tokenizer
......@@ -41,6 +62,42 @@ class AsyncEngine:
for i in range(instance_num):
self.gens_set.add(self.tm_model.create_instance())
def __call__(self,
prompts: List[str],
request_output_len=512,
top_k=40,
top_p=0.8,
temperature=0.8,
repetition_penalty=1.0,
ignore_eos=False,
do_preprocess=True,
**kwargs):
"""Inference a batch of prompts.
Args:
prompts (List[str]): a batch of prompts
request_output_len (int): output token nums
top_k (int): The number of the highest probability vocabulary
tokens to keep for top-k-filtering
top_p (float): If set to float < 1, only the smallest set of most
probable tokens with probabilities that add up to top_p or higher
are kept for generation.
temperature (float): to modulate the next token probability
repetition_penalty (float): The parameter for repetition penalty.
1.0 means no penalty
ignore_eos (bool): indicator for ignoring eos
do_preprocess (bool): whether to pre-process the messages.
"""
return self.batch_infer(prompts,
request_output_len=request_output_len,
top_k=top_k,
top_p=top_p,
temperature=temperature,
repetition_penalty=repetition_penalty,
ignore_eos=ignore_eos,
do_preprocess=do_preprocess,
**kwargs)
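# A minimal usage sketch of the __call__ shortcut above; the workspace path and
# sampling values below are illustrative assumptions, not project defaults.
# >>> engine = AsyncEngine('./workspace', instance_num=4, tp=1)
# >>> print(engine(['hi', 'say this is a test'], request_output_len=128))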
def stop_session(self, session_id: int):
"""Stop a session by a session_id."""
input_ids = [self.tm_model.eos_id]
......@@ -100,7 +157,7 @@ class AsyncEngine:
return self.gens_set.pop()
def batch_infer(self,
prompts: List[str],
prompts: Union[List[str], str],
request_output_len=512,
top_k=40,
top_p=0.8,
......@@ -112,7 +169,7 @@ class AsyncEngine:
"""Inference a batch of prompts.
Args:
prompts (List[str]): a batch of prompts
prompts (List[str] | str): a batch of prompts
request_output_len (int): the number of output tokens to generate
top_k (int): The number of the highest probability vocabulary
tokens to keep for top-k-filtering
......@@ -125,6 +182,8 @@ class AsyncEngine:
ignore_eos (bool): indicator for ignoring eos
do_preprocess (bool): whether to pre-process the messages.
"""
# Accept a single prompt by wrapping it in a list; the matching unwrap of the
# output happens right before returning.
input_str = isinstance(prompts, str)
prompts = [prompts] if input_str else prompts
assert isinstance(prompts, List), 'prompts should be a list'
batch_size = len(prompts)
outputs = [''] * batch_size
......@@ -154,6 +213,7 @@ class AsyncEngine:
*[_inner_call(i, generators[i]) for i in range(batch_size)])
self.loop.run_until_complete(gather())
outputs = outputs[0] if input_str else outputs
return outputs
async def generate(
......
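A short sketch of the new single-prompt path in `batch_infer`: passing a plain string now returns a plain string, while a list still returns a list. The engine construction below is illustrative.

from lmdeploy.serve.async_engine import AsyncEngine

engine = AsyncEngine('./workspace', instance_num=4, tp=1)  # illustrative path
single = engine.batch_infer('hi')             # str in  -> str out
batch = engine.batch_infer(['hi', 'hello'])   # list in -> list out
assert isinstance(single, str) and isinstance(batch, list)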
# Copyright (c) OpenMMLab. All rights reserved.
from threading import Lock
from typing import Sequence
from typing import Optional, Sequence
import gradio as gr
......@@ -115,6 +115,7 @@ async def cancel_local_func(state_chatbot: Sequence, cancel_btn: gr.Button,
def run_local(model_path: str,
model_name: Optional[str] = None,
server_name: str = 'localhost',
server_port: int = 6006,
batch_size: int = 4,
......@@ -123,13 +124,29 @@ def run_local(model_path: str,
"""chat with AI assistant through web ui.
Args:
model_path (str): the path of the deployed model
model_path (str): the path of a model.
It could be one of the following options:
- i) A local directory path of a turbomind model which is
converted by the `lmdeploy convert` command or downloaded
from ii) and iii).
- ii) The model_id of a lmdeploy-quantized model hosted
inside a model repo on huggingface.co, such as
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when model_path is a pytorch model on
huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on.
server_name (str): the ip address of gradio server
server_port (int): the port of gradio server
batch_size (int): batch size for running Turbomind directly
tp (int): the degree of tensor parallelism for Turbomind
"""
InterFace.async_engine = AsyncEngine(model_path=model_path,
model_name=model_name,
instance_num=batch_size,
tp=tp,
**kwargs)
......
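A minimal sketch of launching the gradio demo through `run_local` above, assuming the module path `lmdeploy.serve.gradio.app`; the model id, port and tp value are illustrative.

from lmdeploy.serve.gradio.app import run_local  # assumed module path

run_local('InternLM/internlm-chat-7b',
          model_name='internlm-chat-7b',
          server_name='localhost',
          server_port=6006,
          batch_size=4,
          tp=1)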
......@@ -476,7 +476,8 @@ async def chat_interactive_v1(request: GenerateRequest,
return JSONResponse(ret)
def main(model_path: str,
def serve(model_path: str,
model_name: Optional[str] = None,
server_name: str = '0.0.0.0',
server_port: int = 23333,
instance_num: int = 64,
......@@ -491,7 +492,21 @@ def main(model_path: str,
interface.
Args:
model_path (str): the path of the deployed model
model_path (str): the path of a model.
It could be one of the following options:
- i) A local directory path of a turbomind model which is
converted by the `lmdeploy convert` command or downloaded
from ii) and iii).
- ii) The model_id of a lmdeploy-quantized model hosted
inside a model repo on huggingface.co, such as
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when model_path is a pytorch model on
huggingface.co, such as "InternLM/internlm-chat-7b"
server_name (str): host ip for serving
server_port (int): server port
instance_num (int): number of instances of turbomind model
......@@ -514,6 +529,7 @@ def main(model_path: str,
)
VariableInterface.async_engine = AsyncEngine(model_path=model_path,
model_name=model_name,
instance_num=instance_num,
tp=tp,
**kwargs)
......@@ -526,4 +542,4 @@ def main(model_path: str,
if __name__ == '__main__':
import fire
fire.Fire(main)
fire.Fire(serve)
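A minimal sketch of starting the renamed `serve` entry point directly in Python, equivalent to what `fire.Fire(serve)` exposes on the command line; the workspace path and model name below are illustrative assumptions.

from lmdeploy.serve.openai.api_server import serve

# illustrative arguments; a converted turbomind workspace or a HF model id both work
serve('./workspace',
      model_name='internlm-chat-7b',
      server_name='0.0.0.0',
      server_port=23333,
      instance_num=64,
      tp=1)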
......@@ -43,7 +43,7 @@ def test_subcli_serve():
from lmdeploy.serve.client import main as run_triton_client
from lmdeploy.serve.gradio.app import run as run_gradio
from lmdeploy.serve.openai.api_client import main as run_api_client
from lmdeploy.serve.openai.api_server import main as run_api_server
from lmdeploy.serve.openai.api_server import serve as run_api_server
compare_func(SubCliServe.gradio, run_gradio)
compare_func(SubCliServe.api_server, run_api_server)
......