Unverified Commit 5c9aa51a authored by AllentDan, committed by GitHub

Add api.py (#805)

* add api.py

* update serve function

* add model_name arg and provide examples

* docstring

* remove service_available

* type hint
parent 16b4b823
# Copyright (c) OpenMMLab. All rights reserved.
from lmdeploy.api import client, pipeline, serve
__all__ = ['pipeline', 'serve', 'client']
def bootstrap():
import os
import sys
......
# Copyright (c) OpenMMLab. All rights reserved.
import os
from typing import Optional
def pipeline(model_path: str,
model_name: Optional[str] = None,
instance_num: int = 32,
tp: int = 1,
log_level='ERROR',
**kwargs):
"""
Args:
model_path (str): the path of a model.
It could be one of the following options:
- i) A local directory path of a turbomind model which is
converted by the `lmdeploy convert` command or downloaded
from ii) and iii).
- ii) The model_id of a lmdeploy-quantized model hosted
inside a model repo on huggingface.co, such as
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when model_path is a pytorch model on
huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat", "baichuan-inc/Baichuan2-7B-Chat" and so on.
instance_num (int): the number of engine instances to be created
tp (int): the degree of tensor parallelism
log_level (str): the log level; one of [CRITICAL, ERROR, WARNING, INFO, DEBUG]
Examples:
>>> import lmdeploy
>>> pipe = lmdeploy.pipeline('InternLM/internlm-chat-7b-v1_1', 'internlm-chat-7b')
>>> response = pipe(['hi','say this is a test'])
>>> print(response)
""" # noqa E501
from lmdeploy.serve.async_engine import AsyncEngine
os.environ['TM_LOG_LEVEL'] = log_level
return AsyncEngine(model_path,
model_name=model_name,
instance_num=instance_num,
tp=tp,
**kwargs)
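# A minimal usage sketch of the pipeline entry point above. The generation kwargs
# (request_output_len, top_p) mirror AsyncEngine.batch_infer; the exact values and
# the model id below are illustrative assumptions, not recommended defaults.
# >>> import lmdeploy
# >>> pipe = lmdeploy.pipeline('InternLM/internlm-chat-7b', 'internlm-chat-7b')
# >>> print(pipe(['hi', 'say this is a test'], request_output_len=256, top_p=0.8))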
def serve(model_path: str,
model_name: Optional[str] = None,
server_name: str = '0.0.0.0',
server_port: int = 23333,
instance_num: int = 64,
tp: int = 1,
log_level: str = 'ERROR',
**kwargs):
"""This will run the api_server in a subprocess.
Args:
model_path (str): the path of a model.
It could be one of the following options:
- i) A local directory path of a turbomind model which is
converted by the `lmdeploy convert` command or downloaded
from ii) and iii).
- ii) The model_id of a lmdeploy-quantized model hosted
inside a model repo on huggingface.co, such as
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when model_path is a pytorch model on
huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on.
server_name (str): host ip for serving
server_port (int): server port
instance_num (int): number of instances of turbomind model
tp (int): the degree of tensor parallelism
log_level (str): the log level; one of [CRITICAL, ERROR, WARNING, INFO, DEBUG]
Returns:
APIClient: a client to chat with the launched api_server.
Examples:
>>> import lmdeploy
>>> client = lmdeploy.serve('InternLM/internlm-chat-7b-v1_1', 'internlm-chat-7b')
>>> for output in client.chat('hi', 1):
... print(output)
""" # noqa E501
import time
from multiprocessing import Process
from lmdeploy.serve.openai.api_client import APIClient
from lmdeploy.serve.openai.api_server import serve
task = Process(target=serve,
args=(model_path, ),
kwargs=dict(model_name=model_name,
server_name=server_name,
server_port=server_port,
instance_num=instance_num,
tp=tp,
log_level=log_level,
**kwargs))
task.start()
client = APIClient(f'http://{server_name}:{server_port}')
# Poll the server once per second until it answers the model-list request,
# then hand the ready client back to the caller.
while True:
time.sleep(1)
try:
client.available_models
return client
except: # noqa
pass
def client(api_server_url: str = 'http://0.0.0.0:23333', **kwargs):
"""
Args:
api_server_url (str): the address of the api_server in the form
'http://<ip>:<port>'
Returns:
APIClient: a client to interact with the api_server.
"""
from lmdeploy.serve.openai.api_client import APIClient
return APIClient(api_server_url, **kwargs)
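A minimal sketch of querying an already running api_server through the `client` helper above. The url, prompt and session id are illustrative; `available_models` and `chat` follow the usage shown in the `serve` docstring.

import lmdeploy

api_client = lmdeploy.client('http://0.0.0.0:23333')
print(api_client.available_models)        # models exposed by the server
for output in api_client.chat('hi', 1):   # (prompt, session_id)
    print(output)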
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List
from typing import List, Optional
class SubCliServe(object):
......@@ -46,6 +46,7 @@ class SubCliServe(object):
def api_server(self,
model_path: str,
model_name: Optional[str] = None,
server_name: str = '0.0.0.0',
server_port: int = 23333,
instance_num: int = 64,
......@@ -58,7 +59,21 @@ class SubCliServe(object):
"""Serve LLMs with restful api using fastapi.
Args:
model_path (str): the path of the deployed model
model_path (str): the path of a model.
It could be one of the following options:
- i) A local directory path of a turbomind model which is
converted by the `lmdeploy convert` command or
downloaded from ii) and iii).
- ii) The model_id of a lmdeploy-quantized model hosted
inside a model repo on huggingface.co, such as
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when model_path is a pytorch model on
huggingface.co, such as "InternLM/internlm-chat-7b"
server_name (str): host ip for serving
server_port (int): server port
instance_num (int): number of instances of turbomind model
......@@ -69,9 +84,10 @@ class SubCliServe(object):
allow_headers (List[str]): a list of allowed HTTP headers for CORS
kwargs (dict): extra params to init the api server
"""
from lmdeploy.serve.openai.api_server import main as run_api_server
from lmdeploy.serve.openai.api_server import serve as run_api_server
run_api_server(model_path,
model_name=model_name,
server_name=server_name,
server_port=server_port,
instance_num=instance_num,
......
......@@ -3,7 +3,7 @@ import asyncio
import dataclasses
import random
from contextlib import contextmanager
from typing import List, Literal, Optional
from typing import List, Literal, Optional, Union
@dataclasses.dataclass
......@@ -20,14 +20,35 @@ class AsyncEngine:
"""Async inference engine. Maintaining a bunch of tm_model instances.
Args:
model_path (str): the path of the deployed model
model_path (str): the path of a model.
It could be one of the following options:
- i) A local directory path of a turbomind model which is
converted by the `lmdeploy convert` command or downloaded
from ii) and iii).
- ii) The model_id of a lmdeploy-quantized model hosted
inside a model repo on huggingface.co, such as
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when model_path is a pytorch model on
huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on.
instance_num (int): instance numbers to be created
tp (int): tensor parallel
"""
def __init__(self, model_path, instance_num=32, tp=1, **kwargs) -> None:
def __init__(self,
model_path: str,
model_name: Optional[str] = None,
instance_num: int = 32,
tp: int = 1,
**kwargs) -> None:
from lmdeploy import turbomind as tm
self.tm_model = tm.TurboMind.from_pretrained(model_path,
model_name=model_name,
tp=tp,
**kwargs)
self.tokenizer = self.tm_model.tokenizer
......@@ -41,6 +62,42 @@ class AsyncEngine:
for i in range(instance_num):
self.gens_set.add(self.tm_model.create_instance())
def __call__(self,
prompts: List[str],
request_output_len=512,
top_k=40,
top_p=0.8,
temperature=0.8,
repetition_penalty=1.0,
ignore_eos=False,
do_preprocess=True,
**kwargs):
"""Inference a batch of prompts.
Args:
prompts (List[str]): a batch of prompts
request_output_len (int): output token nums
top_k (int): The number of the highest probability vocabulary
tokens to keep for top-k-filtering
top_p (float): If set to float < 1, only the smallest set of most
probable tokens with probabilities that add up to top_p or higher
are kept for generation.
temperature (float): to modulate the next token probability
repetition_penalty (float): The parameter for repetition penalty.
1.0 means no penalty
ignore_eos (bool): indicator for ignoring eos
do_preprocess (bool): whether to pre-process the messages.
"""
return self.batch_infer(prompts,
request_output_len=request_output_len,
top_k=top_k,
top_p=top_p,
temperature=temperature,
repetition_penalty=repetition_penalty,
ignore_eos=ignore_eos,
do_preprocess=do_preprocess,
**kwargs)
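# A minimal usage sketch of the __call__ shortcut above; the workspace path and
# sampling values below are illustrative assumptions, not project defaults.
# >>> engine = AsyncEngine('./workspace', instance_num=4, tp=1)
# >>> print(engine(['hi', 'say this is a test'], request_output_len=128))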
def stop_session(self, session_id: int):
"""Stop a session by a session_id."""
input_ids = [self.tm_model.eos_id]
......@@ -100,7 +157,7 @@ class AsyncEngine:
return self.gens_set.pop()
def batch_infer(self,
prompts: List[str],
prompts: Union[List[str], str],
request_output_len=512,
top_k=40,
top_p=0.8,
......@@ -112,7 +169,7 @@ class AsyncEngine:
"""Inference a batch of prompts.
Args:
prompts (List[str]): a batch of prompts
prompts (List[str] | str): a batch of prompts
request_output_len (int): the number of output tokens to generate
top_k (int): The number of the highest probability vocabulary
tokens to keep for top-k-filtering
......@@ -125,6 +182,8 @@ class AsyncEngine:
ignore_eos (bool): indicator for ignoring eos
do_preprocess (bool): whether to pre-process the messages.
"""
# Accept a single prompt by wrapping it in a list; the matching unwrap of the
# output happens right before returning.
input_str = isinstance(prompts, str)
prompts = [prompts] if input_str else prompts
assert isinstance(prompts, List), 'prompts should be a list'
batch_size = len(prompts)
outputs = [''] * batch_size
......@@ -154,6 +213,7 @@ class AsyncEngine:
*[_inner_call(i, generators[i]) for i in range(batch_size)])
self.loop.run_until_complete(gather())
outputs = outputs[0] if input_str else outputs
return outputs
async def generate(
......
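A short sketch of the new single-prompt path in `batch_infer`: passing a plain string now returns a plain string, while a list still returns a list. The engine construction below is illustrative.

from lmdeploy.serve.async_engine import AsyncEngine

engine = AsyncEngine('./workspace', instance_num=4, tp=1)  # illustrative path
single = engine.batch_infer('hi')             # str in  -> str out
batch = engine.batch_infer(['hi', 'hello'])   # list in -> list out
assert isinstance(single, str) and isinstance(batch, list)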
# Copyright (c) OpenMMLab. All rights reserved.
from threading import Lock
from typing import Sequence
from typing import Optional, Sequence
import gradio as gr
......@@ -115,6 +115,7 @@ async def cancel_local_func(state_chatbot: Sequence, cancel_btn: gr.Button,
def run_local(model_path: str,
model_name: Optional[str] = None,
server_name: str = 'localhost',
server_port: int = 6006,
batch_size: int = 4,
......@@ -123,13 +124,29 @@ def run_local(model_path: str,
"""chat with AI assistant through web ui.
Args:
model_path (str): the path of the deployed model
model_path (str): the path of a model.
It could be one of the following options:
- i) A local directory path of a turbomind model which is
converted by the `lmdeploy convert` command or downloaded
from ii) and iii).
- ii) The model_id of a lmdeploy-quantized model hosted
inside a model repo on huggingface.co, such as
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when model_path is a pytorch model on
huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on.
server_name (str): the ip address of gradio server
server_port (int): the port of gradio server
batch_size (int): batch size for running Turbomind directly
tp (int): the degree of tensor parallelism for Turbomind
"""
InterFace.async_engine = AsyncEngine(model_path=model_path,
model_name=model_name,
instance_num=batch_size,
tp=tp,
**kwargs)
......
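A minimal sketch of launching the gradio demo through `run_local` above, assuming the module path `lmdeploy.serve.gradio.app`; the model id, port and tp value are illustrative.

from lmdeploy.serve.gradio.app import run_local  # assumed module path

run_local('InternLM/internlm-chat-7b',
          model_name='internlm-chat-7b',
          server_name='localhost',
          server_port=6006,
          batch_size=4,
          tp=1)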
......@@ -476,7 +476,8 @@ async def chat_interactive_v1(request: GenerateRequest,
return JSONResponse(ret)
def main(model_path: str,
def serve(model_path: str,
model_name: Optional[str] = None,
server_name: str = '0.0.0.0',
server_port: int = 23333,
instance_num: int = 64,
......@@ -491,7 +492,21 @@ def main(model_path: str,
interface.
Args:
model_path (str): the path of the deployed model
model_path (str): the path of a model.
It could be one of the following options:
- i) A local directory path of a turbomind model which is
converted by the `lmdeploy convert` command or downloaded
from ii) and iii).
- ii) The model_id of a lmdeploy-quantized model hosted
inside a model repo on huggingface.co, such as
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when model_path is a pytorch model on
huggingface.co, such as "InternLM/internlm-chat-7b"
server_name (str): host ip for serving
server_port (int): server port
instance_num (int): number of instances of turbomind model
......@@ -514,6 +529,7 @@ def main(model_path: str,
)
VariableInterface.async_engine = AsyncEngine(model_path=model_path,
model_name=model_name,
instance_num=instance_num,
tp=tp,
**kwargs)
......@@ -526,4 +542,4 @@ def main(model_path: str,
if __name__ == '__main__':
import fire
fire.Fire(main)
fire.Fire(serve)
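A minimal sketch of starting the renamed `serve` entry point directly in Python, equivalent to what `fire.Fire(serve)` exposes on the command line; the workspace path and model name below are illustrative assumptions.

from lmdeploy.serve.openai.api_server import serve

# illustrative arguments; a converted turbomind workspace or a HF model id both work
serve('./workspace',
      model_name='internlm-chat-7b',
      server_name='0.0.0.0',
      server_port=23333,
      instance_num=64,
      tp=1)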
......@@ -43,7 +43,7 @@ def test_subcli_serve():
from lmdeploy.serve.client import main as run_triton_client
from lmdeploy.serve.gradio.app import run as run_gradio
from lmdeploy.serve.openai.api_client import main as run_api_client
from lmdeploy.serve.openai.api_server import main as run_api_server
from lmdeploy.serve.openai.api_server import serve as run_api_server
compare_func(SubCliServe.gradio, run_gradio)
compare_func(SubCliServe.api_server, run_api_server)
......