Unverified Commit 5c9aa51a authored by AllentDan, committed by GitHub

Add api.py (#805)

* add api.py

* update serve function

* add model_name arg and provide examples

* docstring

* remove service_available

* type hint
parent 16b4b823
lmdeploy/__init__.py

# Copyright (c) OpenMMLab. All rights reserved.
from lmdeploy.api import client, pipeline, serve

__all__ = ['pipeline', 'serve', 'client']


def bootstrap():
    import os
    import sys
......
lmdeploy/api.py

# Copyright (c) OpenMMLab. All rights reserved.
import os
from typing import Optional
def pipeline(model_path: str,
             model_name: Optional[str] = None,
             instance_num: int = 32,
             tp: int = 1,
             log_level: str = 'ERROR',
             **kwargs):
    """Build a model inference pipeline.

    Args:
        model_path (str): the path of a model. It could be one of the
            following options:
            - i) A local directory path of a turbomind model which is
              converted by the `lmdeploy convert` command or downloaded
              from ii) and iii).
            - ii) The model_id of an lmdeploy-quantized model hosted
              inside a model repo on huggingface.co, such as
              "InternLM/internlm-chat-20b-4bit",
              "lmdeploy/llama2-chat-70b-4bit", etc.
            - iii) The model_id of a model hosted inside a model repo
              on huggingface.co, such as "InternLM/internlm-chat-7b",
              "Qwen/Qwen-7B-Chat", "baichuan-inc/Baichuan2-7B-Chat"
              and so on.
        model_name (str): needed when model_path is a pytorch model on
            huggingface.co, such as "InternLM/internlm-chat-7b",
            "Qwen/Qwen-7B-Chat", "baichuan-inc/Baichuan2-7B-Chat" and so on.
        instance_num (int): the number of model instances to be created
        tp (int): the number of GPUs used in tensor parallelism
        log_level (str): log level, one of
            [CRITICAL, ERROR, WARNING, INFO, DEBUG]

    Examples:
        >>> import lmdeploy
        >>> pipe = lmdeploy.pipeline('InternLM/internlm-chat-7b-v1_1', 'internlm-chat-7b')
        >>> response = pipe(['hi', 'say this is a test'])
        >>> print(response)
    """  # noqa E501
    from lmdeploy.serve.async_engine import AsyncEngine
    os.environ['TM_LOG_LEVEL'] = log_level
    return AsyncEngine(model_path,
                       model_name=model_name,
                       instance_num=instance_num,
                       tp=tp,
                       **kwargs)
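
A hedged usage sketch beyond the docstring example above. It relies only on behavior introduced elsewhere in this patch (AsyncEngine.__call__ forwarding generation options to batch_infer, and batch_infer accepting a single string); the local workspace path is illustrative, not part of the patch.

    # Sketch: option i) from the docstring, a workspace produced by `lmdeploy convert`.
    import lmdeploy

    pipe = lmdeploy.pipeline('./workspace')   # illustrative local turbomind model
    # A single string is accepted; generation options are forwarded to batch_infer.
    print(pipe('hi', request_output_len=128, top_p=0.8, temperature=0.8))
    print(pipe(['hi', 'say this is a test']))  # a batch returns a list of outputs
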
def serve(model_path: str,
          model_name: Optional[str] = None,
          server_name: str = '0.0.0.0',
          server_port: int = 23333,
          instance_num: int = 64,
          tp: int = 1,
          log_level: str = 'ERROR',
          **kwargs):
    """Run the api_server in a background subprocess and return a client to it.

    Args:
        model_path (str): the path of a model. It could be one of the
            following options:
            - i) A local directory path of a turbomind model which is
              converted by the `lmdeploy convert` command or downloaded
              from ii) and iii).
            - ii) The model_id of an lmdeploy-quantized model hosted
              inside a model repo on huggingface.co, such as
              "InternLM/internlm-chat-20b-4bit",
              "lmdeploy/llama2-chat-70b-4bit", etc.
            - iii) The model_id of a model hosted inside a model repo
              on huggingface.co, such as "InternLM/internlm-chat-7b",
              "Qwen/Qwen-7B-Chat", "baichuan-inc/Baichuan2-7B-Chat"
              and so on.
        model_name (str): needed when model_path is a pytorch model on
            huggingface.co, such as "InternLM/internlm-chat-7b",
            "Qwen/Qwen-7B-Chat", "baichuan-inc/Baichuan2-7B-Chat" and so on.
        server_name (str): host ip for serving
        server_port (int): server port
        instance_num (int): the number of turbomind model instances
        tp (int): the number of GPUs used in tensor parallelism
        log_level (str): log level, one of
            [CRITICAL, ERROR, WARNING, INFO, DEBUG]

    Returns:
        APIClient: A client chatbot for LLaMA series models.

    Examples:
        >>> import lmdeploy
        >>> client = lmdeploy.serve('InternLM/internlm-chat-7b-v1_1', 'internlm-chat-7b')
        >>> for output in client.chat('hi', 1):
        ...     print(output)
    """  # noqa E501
    import time
    from multiprocessing import Process

    from lmdeploy.serve.openai.api_client import APIClient
    from lmdeploy.serve.openai.api_server import serve
    task = Process(target=serve,
                   args=(model_path, ),
                   kwargs=dict(model_name=model_name,
                               server_name=server_name,
                               server_port=server_port,
                               instance_num=instance_num,
                               tp=tp,
                               log_level=log_level,
                               **kwargs))
    task.start()
    client = APIClient(f'http://{server_name}:{server_port}')
    # Poll until the server is ready, i.e. it can report its model list.
    while True:
        time.sleep(1)
        try:
            client.available_models
            return client
        except:  # noqa
            pass
def client(api_server_url: str = 'http://0.0.0.0:23333', **kwargs):
    """Build a client to interact with a running api_server.

    Args:
        api_server_url (str): communicating address 'http://<ip>:<port>'
            of the api_server

    Returns:
        Chatbot for LLaMA series models with turbomind as the inference
        engine.
    """
    from lmdeploy.serve.openai.api_client import APIClient
    return APIClient(api_server_url, **kwargs)
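
A hedged end-to-end sketch of the client() helper, assuming an api_server is already listening on the default address; the URL and session id are illustrative:

    import lmdeploy

    api_client = lmdeploy.client('http://0.0.0.0:23333')
    print(api_client.available_models)         # the models the server reports
    for output in api_client.chat('hi', 1):    # session_id=1, as in the serve() example
        print(output)
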
lmdeploy/cli/serve.py

# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional


class SubCliServe(object):
...@@ -46,6 +46,7 @@ class SubCliServe(object):
    def api_server(self,
                   model_path: str,
                   model_name: Optional[str] = None,
                   server_name: str = '0.0.0.0',
                   server_port: int = 23333,
                   instance_num: int = 64,
...@@ -58,7 +59,21 @@ class SubCliServe(object):
        """Serve LLMs with restful api using fastapi.

        Args:
            model_path (str): the path of a model. It could be one of the
                following options:
                - i) A local directory path of a turbomind model which is
                  converted by the `lmdeploy convert` command or
                  downloaded from ii) and iii).
                - ii) The model_id of an lmdeploy-quantized model hosted
                  inside a model repo on huggingface.co, such as
                  "InternLM/internlm-chat-20b-4bit",
                  "lmdeploy/llama2-chat-70b-4bit", etc.
                - iii) The model_id of a model hosted inside a model repo
                  on huggingface.co, such as "InternLM/internlm-chat-7b",
                  "Qwen/Qwen-7B-Chat", "baichuan-inc/Baichuan2-7B-Chat"
                  and so on.
            model_name (str): needed when model_path is a pytorch model on
                huggingface.co, such as "InternLM/internlm-chat-7b"
            server_name (str): host ip for serving
            server_port (int): server port
            instance_num (int): number of instances of turbomind model
...@@ -69,9 +84,10 @@ class SubCliServe(object):
            allow_headers (List[str]): a list of allowed HTTP headers for CORS
            kwargs (dict): extra params to init the api server
        """
        from lmdeploy.serve.openai.api_server import serve as run_api_server
        run_api_server(model_path,
                       model_name=model_name,
                       server_name=server_name,
                       server_port=server_port,
                       instance_num=instance_num,
......
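
For context, a hedged sketch of what the api_server subcommand above forwards to after this change; the model id and name are illustrative placeholders, and the CORS options keep their defaults:

    from lmdeploy.serve.openai.api_server import serve as run_api_server

    # Roughly what `lmdeploy serve api_server <model_path> --model_name <name>` calls.
    run_api_server('InternLM/internlm-chat-7b',    # model_path, option iii)
                   model_name='internlm-chat-7b',  # needed for a pytorch model
                   server_name='0.0.0.0',
                   server_port=23333,
                   instance_num=64,
                   tp=1)
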
lmdeploy/serve/async_engine.py

...@@ -3,7 +3,7 @@ import asyncio
import dataclasses
import random
from contextlib import contextmanager
from typing import List, Literal, Optional, Union


@dataclasses.dataclass
...@@ -20,14 +20,35 @@ class AsyncEngine:
    """Async inference engine. Maintains a bunch of tm_model instances.

    Args:
        model_path (str): the path of a model. It could be one of the
            following options:
            - i) A local directory path of a turbomind model which is
              converted by the `lmdeploy convert` command or downloaded
              from ii) and iii).
            - ii) The model_id of an lmdeploy-quantized model hosted
              inside a model repo on huggingface.co, such as
              "InternLM/internlm-chat-20b-4bit",
              "lmdeploy/llama2-chat-70b-4bit", etc.
            - iii) The model_id of a model hosted inside a model repo
              on huggingface.co, such as "InternLM/internlm-chat-7b",
              "Qwen/Qwen-7B-Chat", "baichuan-inc/Baichuan2-7B-Chat"
              and so on.
        model_name (str): needed when model_path is a pytorch model on
            huggingface.co, such as "InternLM/internlm-chat-7b",
            "Qwen/Qwen-7B-Chat", "baichuan-inc/Baichuan2-7B-Chat" and so on.
        instance_num (int): the number of model instances to be created
        tp (int): the number of GPUs used in tensor parallelism
    """

    def __init__(self,
                 model_path: str,
                 model_name: Optional[str] = None,
                 instance_num: int = 32,
                 tp: int = 1,
                 **kwargs) -> None:
        from lmdeploy import turbomind as tm
        self.tm_model = tm.TurboMind.from_pretrained(model_path,
                                                     model_name=model_name,
                                                     tp=tp,
                                                     **kwargs)
        self.tokenizer = self.tm_model.tokenizer
...@@ -41,6 +62,42 @@ class AsyncEngine:
        for i in range(instance_num):
            self.gens_set.add(self.tm_model.create_instance())
    def __call__(self,
                 prompts: List[str],
                 request_output_len=512,
                 top_k=40,
                 top_p=0.8,
                 temperature=0.8,
                 repetition_penalty=1.0,
                 ignore_eos=False,
                 do_preprocess=True,
                 **kwargs):
        """Run inference on a batch of prompts.

        Args:
            prompts (List[str]): a batch of prompts
            request_output_len (int): the number of output tokens
            top_k (int): the number of the highest-probability vocabulary
                tokens to keep for top-k filtering
            top_p (float): if set to a float < 1, only the smallest set of
                the most probable tokens whose probabilities add up to
                top_p or higher are kept for generation
            temperature (float): used to modulate the next-token probability
            repetition_penalty (float): the parameter for repetition penalty;
                1.0 means no penalty
            ignore_eos (bool): indicator for ignoring eos
            do_preprocess (bool): whether to pre-process the messages
        """
        return self.batch_infer(prompts,
                                request_output_len=request_output_len,
                                top_k=top_k,
                                top_p=top_p,
                                temperature=temperature,
                                repetition_penalty=repetition_penalty,
                                ignore_eos=ignore_eos,
                                do_preprocess=do_preprocess,
                                **kwargs)
    def stop_session(self, session_id: int):
        """Stop a session by a session_id."""
        input_ids = [self.tm_model.eos_id]
...@@ -100,7 +157,7 @@ class AsyncEngine:
        return self.gens_set.pop()

    def batch_infer(self,
                    prompts: Union[List[str], str],
                    request_output_len=512,
                    top_k=40,
                    top_p=0.8,
...@@ -112,7 +169,7 @@ class AsyncEngine:
        """Run inference on a batch of prompts.

        Args:
            prompts (List[str] | str): a batch of prompts or a single prompt
            request_output_len (int): the number of output tokens
            top_k (int): the number of the highest-probability vocabulary
                tokens to keep for top-k filtering
...@@ -125,6 +182,8 @@ class AsyncEngine:
            ignore_eos (bool): indicator for ignoring eos
            do_preprocess (bool): whether to pre-process the messages
        """
        input_str = isinstance(prompts, str)
        prompts = [prompts] if input_str else prompts
        assert isinstance(prompts, List), 'prompts should be a list'
        batch_size = len(prompts)
        outputs = [''] * batch_size
...@@ -154,6 +213,7 @@ class AsyncEngine:
            *[_inner_call(i, generators[i]) for i in range(batch_size)])
        self.loop.run_until_complete(gather())
        # A string input yields a single string instead of a one-element list.
        outputs = outputs[0] if input_str else outputs
        return outputs

    async def generate(
......
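
A hedged sketch of constructing AsyncEngine directly with the new model_name argument, mirroring what lmdeploy.pipeline() does under the hood; the repo id is illustrative:

    from lmdeploy.serve.async_engine import AsyncEngine

    engine = AsyncEngine('InternLM/internlm-chat-7b',    # option iii) above
                         model_name='internlm-chat-7b',  # needed for pytorch models
                         instance_num=4,
                         tp=1)
    print(engine(['hi', 'say this is a test']))  # __call__ delegates to batch_infer
    print(engine('hi'))                          # a plain string now returns a string
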
lmdeploy/serve/gradio/app.py

# Copyright (c) OpenMMLab. All rights reserved.
from threading import Lock
from typing import Optional, Sequence

import gradio as gr
...@@ -115,6 +115,7 @@ async def cancel_local_func(state_chatbot: Sequence, cancel_btn: gr.Button,


def run_local(model_path: str,
              model_name: Optional[str] = None,
              server_name: str = 'localhost',
              server_port: int = 6006,
              batch_size: int = 4,
...@@ -123,13 +124,29 @@ def run_local(model_path: str,
    """Chat with an AI assistant through a web UI.

    Args:
        model_path (str): the path of a model. It could be one of the
            following options:
            - i) A local directory path of a turbomind model which is
              converted by the `lmdeploy convert` command or downloaded
              from ii) and iii).
            - ii) The model_id of an lmdeploy-quantized model hosted
              inside a model repo on huggingface.co, such as
              "InternLM/internlm-chat-20b-4bit",
              "lmdeploy/llama2-chat-70b-4bit", etc.
            - iii) The model_id of a model hosted inside a model repo
              on huggingface.co, such as "InternLM/internlm-chat-7b",
              "Qwen/Qwen-7B-Chat", "baichuan-inc/Baichuan2-7B-Chat"
              and so on.
        model_name (str): needed when model_path is a pytorch model on
            huggingface.co, such as "InternLM/internlm-chat-7b",
            "Qwen/Qwen-7B-Chat", "baichuan-inc/Baichuan2-7B-Chat" and so on.
        server_name (str): the ip address of the gradio server
        server_port (int): the port of the gradio server
        batch_size (int): batch size for running Turbomind directly
        tp (int): tensor parallelism for Turbomind
    """
    InterFace.async_engine = AsyncEngine(model_path=model_path,
                                         model_name=model_name,
                                         instance_num=batch_size,
                                         tp=tp,
                                         **kwargs)
......
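
A hedged sketch of launching the web UI programmatically with the new model_name argument; the repo id, host, and port are illustrative:

    from lmdeploy.serve.gradio.app import run_local

    run_local('InternLM/internlm-chat-7b',    # or a local turbomind workspace
              model_name='internlm-chat-7b',  # needed for a pytorch model
              server_name='localhost',
              server_port=6006,
              batch_size=4,
              tp=1)
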
lmdeploy/serve/openai/api_server.py

...@@ -476,22 +476,37 @@ async def chat_interactive_v1(request: GenerateRequest,
    return JSONResponse(ret)


def serve(model_path: str,
          model_name: Optional[str] = None,
          server_name: str = '0.0.0.0',
          server_port: int = 23333,
          instance_num: int = 64,
          tp: int = 1,
          allow_origins: List[str] = ['*'],
          allow_credentials: bool = True,
          allow_methods: List[str] = ['*'],
          allow_headers: List[str] = ['*'],
          log_level: str = 'ERROR',
          **kwargs):
    """An example to perform model inference through the command line
    interface.

    Args:
        model_path (str): the path of a model. It could be one of the
            following options:
            - i) A local directory path of a turbomind model which is
              converted by the `lmdeploy convert` command or downloaded
              from ii) and iii).
            - ii) The model_id of an lmdeploy-quantized model hosted
              inside a model repo on huggingface.co, such as
              "InternLM/internlm-chat-20b-4bit",
              "lmdeploy/llama2-chat-70b-4bit", etc.
            - iii) The model_id of a model hosted inside a model repo
              on huggingface.co, such as "InternLM/internlm-chat-7b",
              "Qwen/Qwen-7B-Chat", "baichuan-inc/Baichuan2-7B-Chat"
              and so on.
        model_name (str): needed when model_path is a pytorch model on
            huggingface.co, such as "InternLM/internlm-chat-7b"
        server_name (str): host ip for serving
        server_port (int): server port
        instance_num (int): number of instances of turbomind model
...@@ -514,6 +529,7 @@ def main(model_path: str,
    )
    VariableInterface.async_engine = AsyncEngine(model_path=model_path,
                                                 model_name=model_name,
                                                 instance_num=instance_num,
                                                 tp=tp,
                                                 **kwargs)
...@@ -526,4 +542,4 @@ def main(model_path: str,

if __name__ == '__main__':
    import fire

    fire.Fire(serve)
tests/test_lmdeploy/test_cli.py

...@@ -43,7 +43,7 @@ def test_subcli_serve():
    from lmdeploy.serve.client import main as run_triton_client
    from lmdeploy.serve.gradio.app import run as run_gradio
    from lmdeploy.serve.openai.api_client import main as run_api_client
    from lmdeploy.serve.openai.api_server import serve as run_api_server

    compare_func(SubCliServe.gradio, run_gradio)
    compare_func(SubCliServe.api_server, run_api_server)
......
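
compare_func in this test presumably verifies that each CLI method exposes the same parameters as the server entry point it wraps, which is why the import had to follow the main -> serve rename. A hedged sketch of such a check, under that assumption only:

    import inspect

    def compare_func(cli_method, entry_point):
        # Assumed semantics: the CLI method (ignoring `self`) and the wrapped
        # entry point should accept the same named parameters.
        cli_params = [p for p in inspect.signature(cli_method).parameters if p != 'self']
        entry_params = list(inspect.signature(entry_point).parameters)
        assert cli_params == entry_params
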