Commit 704137ae authored by chenzk
parents ecaf86cf 268abe98
-# Baichuan-13B_fastllm
+# Baichuan-13B
## Paper
@@ -33,7 +33,7 @@ The overall Baichuan model is based on the standard Transformer architecture and, like LLaMA, adopts the same
The inference Docker image can be pulled from SourceFind (光源) as follows:
```
-docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest
+docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk23.10.1-py38
```
### Start the Container
@@ -43,7 +43,7 @@ docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk
```
# <container_name>: custom container name
# <project_path>: path to the current project
-docker run -it --name=<container_name> -v <project_path>:/work -w /work --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --shm-size=16G --group-add 39 image.sourcefind.cn:5000/dcu/admin/base/pytorch:1.13.1-centos7.6-dtk-23.04-py38-latest /bin/bash
+docker run -it --name=<container_name> -v <project_path>:/work -w /work --privileged -v /opt/hyhal:/opt/hyhal --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --cap-add=SYS_PTRACE --ipc=host --network host --shm-size=16G --group-add video image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk23.10.1-py38 /bin/bash
```
### Load the Environment
@@ -51,7 +51,7 @@ docker run -it --name=<container_name> -v <project_path>:/work -w /work --device
After entering the container, run the following command to load the runtime environment variables:
```
-source /opt/dtk-23.04/cuda/env.sh
+source /opt/dtk/cuda/env.sh
```
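
Once the variables are loaded, a quick sanity check (a sketch, assuming the container's bundled DTK build of PyTorch) is to confirm that the DCU devices are visible; they are addressed through the CUDA-style API, which is why the demo code in this project uses `torch.cuda` and `cuda:N` device names:
```
import torch

# on the DTK build of PyTorch, DCU devices are reported via the CUDA API
print(torch.cuda.is_available(), torch.cuda.device_count())
```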
### Installation
@@ -99,6 +99,15 @@ python cli_demo.py -p baichuan-13b-fp16.bin
# Simple web UI; install streamlit-chat first, and map the Streamlit port to the external network when starting the container
streamlit run web_demo.py baichuan-13b-fp16.bin
# Example api_server implementing the OpenAI-style interface:
# First enter api_server_demo and install the required dependencies:
cd api_server_demo
pip install -r requirements.txt
# Run the api_server service; use -p to specify the converted model file. Client code can follow the openai-client.py implementation:
python fastllm-openai.py -p ../baichuan-13b-fp16.bin
# To test the service's concurrency performance, use openai-client.py: modify its prompt and concurrencys variables, then run:
python openai-client.py
```
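
As a quick reference for client code, a minimal non-streaming request could look like the sketch below; it assumes the server started by fastllm-openai.py is still listening on its default address 127.0.0.1:8100 and reuses the `openai==0.28` client style and model name from openai-client.py:
```
import openai

# the demo server in fastllm-openai.py listens on 127.0.0.1:8100 by default
openai.api_base = "http://127.0.0.1:8100/v1"
openai.api_key = "none"  # the demo server does not check the key

resp = openai.ChatCompletion.create(
    model="baichuan2-fastllm",  # must match model_list in fastllm-openai.py
    messages=[{"role": "user", "content": "你好"}],
    temperature=0.1,
    stream=False,
)
print(resp.choices[0].message.content)
```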
### Inference Performance Test
......
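# api_server_demo/fastllm-openai.py: OpenAI-compatible API server for the converted model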
# coding=utf-8
# Implements an OpenAI-format chat-completions API (https://platform.openai.com/docs/api-reference/chat)
# for a fastllm-converted model; adapted from the ChatGLM3-6B demo server.
# Usage: python fastllm-openai.py -p <converted model file>
# Visit http://localhost:8100/docs for the auto-generated API docs.
import time
import json
import torch
import uvicorn
import argparse
from pydantic import BaseModel, Field
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
from typing import Any, Dict, List, Literal, Optional, Union
#from transformers import AutoTokenizer, AutoModel
from sse_starlette.sse import ServerSentEvent, EventSourceResponse
from fastllm_pytools import llm
@asynccontextmanager
async def lifespan(app: FastAPI): # collects GPU memory
yield
global device_map
if torch.cuda.is_available():
for device in device_map:
with torch.cuda.device(device):
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
app = FastAPI(lifespan=lifespan)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
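# The Pydantic models below mirror the OpenAI chat-completions schema: model listing, chat messages, usage accounting, and streaming deltas.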
class ModelCard(BaseModel):
id: str
object: str = "model"
created: int = Field(default_factory=lambda: int(time.time()))
owned_by: str = "owner"
root: Optional[str] = None
parent: Optional[str] = None
permission: Optional[list] = None
class ModelList(BaseModel):
object: str = "list"
data: List[ModelCard] = []
class ChatMessage(BaseModel):
role: Literal["user", "assistant", "system"]
content: str
class Usage(BaseModel):
    prompt_tokens: Optional[int] = None
    total_tokens: Optional[int] = None
    completion_tokens: Optional[int] = None
class DeltaMessage(BaseModel):
role: Optional[Literal["user", "assistant", "system"]] = None
content: Optional[str] = None
class ChatCompletionRequest(BaseModel):
model: str
messages: List[ChatMessage]
temperature: Optional[float] = None
top_p: Optional[float] = None
max_length: Optional[int] = None
stream: Optional[bool] = False
class ChatCompletionResponseChoice(BaseModel):
index: int
message: ChatMessage
finish_reason: Literal["stop", "length"]
class ChatCompletionResponseStreamChoice(BaseModel):
index: int
delta: DeltaMessage
finish_reason: Optional[Literal["stop", "length"]]
class ChatCompletionResponse(BaseModel):
id: str
object: Literal["chat.completion", "chat.completion.chunk"]
created: Optional[int] = Field(default_factory=lambda: int(time.time()))
model: str
choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
usage: Usage = None
@app.get("/v1/models", response_model=ModelList)
def list_models():
    global model_list
    return ModelList(data=[ModelCard(id=name) for name in model_list])
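# POST /v1/chat/completions: rebuilds (query, history) from the OpenAI-style message list,
# then answers either in a single response or, when stream=True, as server-sent events.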
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
def create_chat_completion(request: ChatCompletionRequest):
if request.model not in model_list:
raise HTTPException(status_code=400, detail="Invalid Model Name")
global model
id = "chatcmpl-A"
if request.messages[-1].role != "user":
raise HTTPException(status_code=400, detail="Invalid request")
query = request.messages[-1].content
if request.max_length is not None:
max_length = request.max_length
else:
max_length = 1024
if request.temperature is not None:
temperature = request.temperature
else:
temperature = 0.1
if request.top_p is not None:
top_p = request.top_p
else:
top_p = 0.8
prev_messages = request.messages[:-1]
# print(prev_messages)
if len(prev_messages) > 0 and prev_messages[0].role == "system":
query = prev_messages.pop(0).content + query
history = []
if len(prev_messages) % 2 == 0:
for i in range(0, len(prev_messages), 2):
if prev_messages[i].role == "user" and prev_messages[i+1].role == "assistant":
history.append([prev_messages[i].content, prev_messages[i+1].content])
if request.stream:
generate = predict(id=id, query=query, history=history, max_length=max_length, top_p = top_p, temperature = temperature, model_id = request.model)
return EventSourceResponse(generate, media_type="text/event-stream")
response = model.response(query=query, history=history, max_length=max_length, top_p = top_p, temperature = temperature)
choice_data = ChatCompletionResponseChoice(
index=0,
message=ChatMessage(role="assistant", content=response),
finish_reason="stop"
)
prompt_tokens = len(model.tokenizer_encode_string(query))
completion_tokens = len(model.tokenizer_encode_string(response))
usage = Usage(
prompt_tokens = prompt_tokens,
completion_tokens = completion_tokens,
total_tokens = prompt_tokens+completion_tokens,
)
return ChatCompletionResponse(id=id ,model=request.model, choices=[choice_data], object="chat.completion", usage=usage)
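# Streaming generator: yields OpenAI-style chat.completion.chunk payloads for SSE, ending with a [DONE] sentinel.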
def predict(id: str, query: str, history: List[List[str]], model_id: str, max_length: int, top_p: float, temperature: float):
global model
creat_time = int(time.time())
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(role="assistant"),
finish_reason=None
)
chunk = ChatCompletionResponse(id=id, created=creat_time, model=model_id, choices=[choice_data], object="chat.completion.chunk")
    # yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))  # pydantic no longer supports dumps_kwargs such as ensure_ascii since 1.8.0; see https://github.com/THUDM/ChatGLM2-6B/issues/308
yield json.dumps(chunk.model_dump(exclude_unset=True), ensure_ascii=False)
for new_response in model.stream_response(query=query, history=history, max_length=max_length, top_p = top_p, temperature = temperature):
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(content=new_response),
finish_reason=None
)
chunk = ChatCompletionResponse(id=id, created=creat_time, model=model_id, choices=[choice_data], object="chat.completion.chunk")
#yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))
yield json.dumps(chunk.model_dump(exclude_unset=True), ensure_ascii=False)
choice_data = ChatCompletionResponseStreamChoice(
index=0,
delta=DeltaMessage(),
finish_reason="stop"
)
chunk = ChatCompletionResponse(id=id, created=creat_time, model=model_id, choices=[choice_data], object="chat.completion.chunk")
#yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))
yield json.dumps(chunk.model_dump(exclude_unset=True), ensure_ascii=False)
yield '[DONE]'
def args_parser():
parser = argparse.ArgumentParser(description = 'baichuan2_chat_demo')
    parser.add_argument('-p', '--path', type = str, default = "/model", help = 'path to the converted model file')
    parser.add_argument('-g', '--gpus', type = str, default = "0", help = 'GPU card(s) to run on, e.g. "0,1"')
args = parser.parse_args()
return args
if __name__ == "__main__":
args = args_parser()
global model_list
model_list = ["baichuan2-fastllm"]
global device_map
device_map = ["cuda:"+num for num in args.gpus.split(',')]
llm.set_device_map(device_map)
model = llm.model(args.path)
uvicorn.run(app, host='127.0.0.1', port=8100)
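# api_server_demo/openai-client.py: streaming client and simple concurrency benchmark for the server above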
import openai
import time
import threading
import queue
from concurrent.futures import ThreadPoolExecutor, as_completed
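# Each worker issues one streaming chat request and returns (output_tokens, generated_text, tokens_per_second).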
def stream_chat_request(model, messages, temperature, max_length, stream, index):
openai.api_base = "http://127.0.0.1:8100/v1"
openai.api_key = "none"
output_tokens = 0
ret = ""
t0 = time.time()
result = openai.ChatCompletion.create(model=model,messages=messages, temperature=temperature, max_length=max_length, stream=stream)
for chunk in result:
# print(chunk)
output_tokens += 1
if hasattr(chunk.choices[0].delta, "content"):
if (index == 0):
print(chunk.choices[0].delta.content, end="", flush=True)
ret += chunk.choices[0].delta.content
t1 = time.time()
# print("\ntoken/s: {:.2f}, output_tokens: {}".format(output_tokens/(t1-t0),output_tokens))
result = output_tokens, ret, output_tokens/(t1-t0)
return result
if __name__ == "__main__":
prompt = "满江红全文"
concurrencys = [1]
temperature = 0.1
max_length = 4096
stream = True
prompts = [prompt]
model="baichuan2-fastllm"
messages=[{"role": "user", "content": "你好"}]
pool = ThreadPoolExecutor(max_workers=32)
for i in range(len(concurrencys)):
cur_prompts = prompts * concurrencys[i]
token_count = 0
threads = []
t0 = time.time()
        for index, prompt in enumerate(cur_prompts):
            # build a fresh message list per request so the worker threads don't share state
            messages = [{"role": "user", "content": prompt}]
            t = pool.submit(stream_chat_request, model, messages, temperature, max_length, stream, index)
t.index = index
threads.append(t)
for future in as_completed(threads):
result = future.result()
print(future.index)
print(result)
print("\n")
token_count += result[0]
t1 = time.time()
print("\n---------------------------------------------\n")
print("\nconcurrency: {}".format(concurrencys[i]))
print("\ntotal use: {:.2f}".format(t1-t0))
print("\ntoken/s: {:.2f}, token_count: {}".format(token_count/(t1-t0),token_count))
print("\n---------------------------------------------\n")
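# api_server_demo/requirements.txt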
uvicorn==0.23.2
pydantic==2.5.1
fastapi==0.103.1
sse_starlette
openai==0.28
@@ -26,6 +26,9 @@ def create(model,
exit(0);
# 0.1 model info
# if model.config.model_type == "chatglm" and model.config.transformers_version == "4.30.2":
# model.config.model_type = "chatglm3"
# print("model.config.model_type: chatglm3!")
modelInfo = model.config.__dict__
if model.generation_config is not None:
modelInfo.update(model.generation_config.__dict__)
@@ -48,6 +51,12 @@ def create(model,
if modelInfo["chat_format"] == "chatml":
modelInfo["im_end_id"] = tokenizer.im_end_id
modelInfo["im_start_id"] = tokenizer.im_start_id
if (modelInfo["model_type"] == "chatglm" and hasattr(tokenizer, "build_chat_input")):
# chatglm3
modelInfo["pre_prompt"] = "";
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|user|>")) + ">\n");
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|assistant|>")) + ">");
modelInfo["history_sep"] = "";
weight_type_dict = {};
......
@@ -80,6 +80,8 @@ def tofile(exportPath,
fo.write(struct.pack('i', 2))
# 0.1 model info
#if model.config.model_type == "chatglm" and model.config.transformers_version == "4.30.2":
# model.config.model_type = "chatglm3"
modelInfo = model.config.__dict__
if model.generation_config is not None:
modelInfo.update(model.generation_config.__dict__)
@@ -112,6 +114,13 @@
if modelInfo["chat_format"] == "chatml":
modelInfo["im_end_id"] = tokenizer.im_end_id
modelInfo["im_start_id"] = tokenizer.im_start_id
if (modelInfo["model_type"] == "chatglm" and hasattr(tokenizer, "build_chat_input")):
print("chatglm3")
# chatglm3
modelInfo["pre_prompt"] = "";
modelInfo["user_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|user|>")) + ">\n");
modelInfo["bot_role"] = ("<FLM_FIX_TOKEN_" + str(tokenizer.get_command("<|assistant|>")) + ">");
modelInfo["history_sep"] = "";
    modelInfo["tokenizer_use_score"] = "1"  # tokenization carries scores
......