chenych / chat_demo · Commits · cefce2e9

Commit cefce2e9, authored Jul 15, 2024 by Rayyyyy (parent 241da631)

    update

Showing 10 changed files with 107 additions and 88 deletions (+107 -88):
README.md                                              +19 -17
config.ini                                              +3 -2
docker/Dockerfile                                       +1 -0
dtk-cuda.zip                                            +0 -0
faiss-1.7.2_dtk24.04_gitb7348e7df780-py3-none-any.whl   +0 -0
llm_service/feature_database.py                         +5 -5
llm_service/inferencer.py                              +27 -13
llm_service/worker.py                                   +2 -1
server_start.py                                        +50 -50
技服-接口文档.docx                                        +0 -0

README.md

@@ -5,9 +5,11 @@
 ### Docker (method 1)
 Adjust the -v path, docker_name, and imageID to match your environment.
 ```bash
-docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk23.10.1-py38
+docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310
 docker run -it -v /path/your_code_data/:/path/your_code_data/ --shm-size=80G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash
+# Load the runtime environment variables
+unzip dtk-cuda.zip
+mv ./cuda /opt/dtk/
+source /opt/dtk/cuda/env.sh
 # Download the fastllm library
 git clone http://developer.hpccube.com/codes/OpenDAS/fastllm.git

@@ -21,32 +23,32 @@ make -j
 cd tools  # now inside fastllm/build/tools
 python setup.py install
 pip install transformers==4.28.0 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
 pip install accelerate sentencepiece mdtex2html gradio rouge_chinese nltk jieba datasets protobuf peft==0.5.0 pydantic==1.10.9 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
 cd /path/of/chat_demo
+pip install faiss-1.7.2_dtk24.04_gitb7348e7df780-py3-none-any.whl
 pip install -r requirements.txt
 ```

 ### Dockerfile (method 2)
 ```
-docker build -t chatglm2:latest .
-docker run -dit --network=host --name=chatglm2 --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 chatglm2:latest
-docker exec -it chatglm2 /bin/bash
+docker build -t chat_demo:latest .
+docker run -dit --network=host --name=chat_demo --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 chat_demo:latest
+docker exec -it chat_demo /bin/bash
 ```

 ### Conda (method 3)
-1. Create a conda virtual environment:
-```
-conda create -n chatglm python=3.8
-# the remaining steps are the same as Docker (method 1) above
-```
-2. The toolkits and deep-learning libraries this project needs for DCU cards can all be downloaded from the [光合](https://developer.hpccube.com/tool/) developer community.
-- [DTK 23.04](https://cancon.hpccube.com:65024/1/main/DTK-23.04.1)
-- [Pytorch 1.13.1](https://cancon.hpccube.com:65024/4/main/pytorch/dtk23.04)
-- [Deepspeed 0.9.2](https://cancon.hpccube.com:65024/4/main/deepspeed/dtk23.04)
+The toolkits and deep-learning libraries this project needs for DCU cards can all be downloaded from the [光合](https://developer.hpccube.com/tool/) developer community.
+```bash
+DTK driver: dtk24.04
+python: python3.10
+torch: 2.1.0
+```
+`Tips: the dtk driver, python, deepspeed, and other tool versions above must correspond to each other exactly.`
 3. Install the other dependencies per requirements.txt:
 ```
 pip install faiss-1.7.2_dtk24.04_gitb7348e7df780-py3-none-any.whl
 pip install -r requirements.txt
 ```
 \ No newline at end of file
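
To go with the Tips in the README hunk above, a quick version-pairing sanity check; this is a minimal sketch that only assumes a working torch install (whether the dtk24.04 tag shows up in torch.__version__ depends on the build and is an assumption):

```python
# Sanity-check the interpreter/torch pairing called out in the README Tips.
import sys
import torch

print("python:", sys.version.split()[0])  # expect 3.10.x
print("torch:", torch.__version__)        # expect 2.1.0 (dtk24.04 build)
```
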
config.ini

@@ -5,10 +5,11 @@ mem_threshold=50
 dcu_threshold=100
 [feature_database]
-reject_throttle=0.6165309870679363
+reject_throttle=0.82
 embedding_model_path=/path/to/your/text2vec-large-chinese
 reranker_model_path=/path/to/your/bce-reranker-base_v1
 [llm]
 local_llm_path=/path/to/your/internlm-chat-7b
 use_vllm=False
+stream_chat=False
 \ No newline at end of file
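
For orientation, a minimal sketch of how these keys are consumed, mirroring the worker.py diff below (the config path is illustrative):

```python
import configparser

config = configparser.ConfigParser()
config.read('config.ini')  # illustrative path

# reject_throttle is stored as text and cast where it is used.
reject_throttle = float(config['feature_database']['reject_throttle'])  # 0.82 after this commit
use_vllm = config.getboolean('llm', 'use_vllm')
stream_chat = config.getboolean('llm', 'stream_chat')  # key added in this commit
```
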
docker/Dockerfile  (new file, mode 100644)

+FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310
 \ No newline at end of file
dtk-cuda.zip  (new file, mode 100644)

File added
faiss-1.7.2_dtk24.04_gitb7348e7df780-py3-none-any.whl  (new file, mode 100644)

File added
llm_service/feature_database.py

@@ -461,16 +461,16 @@ def parse_args():
         description='Feature store for processing directories.')
     parser.add_argument('--work_dir',
                         type=str,
-                        default='/ai/work_dir',
+                        default='/home/chat_demo/work_dir/',
                         help='Custom work directory.')
     parser.add_argument('--repo_dir',
                         type=str,
-                        default='',
+                        default='/home/chat_demo/work_dir/jifu/original',
                         help='Directory of files to read.')
     parser.add_argument('--config_path',
-                        default='/home/AI_project/chat_demo/config.ini',
+                        default='/home/chat_demo/config.ini',
                         help='Path to the config file.')
     args = parser.parse_args()
     return args

@@ -511,12 +511,12 @@ if __name__ == '__main__':
     # with open(os.path.join(args.work_dir, 'sample', 'negative.json')) as f:
     #     negative_sample = json.load(f)
-    with open(os.path.join(args.work_dir, 'sample', 'positive.txt'), 'r', encoding='utf-8') as file:
+    with open(os.path.join(args.work_dir, 'jifu', 'positive.txt'), 'r', encoding='utf-8') as file:
         positive_sample = []
         for line in file:
             positive_sample.append(line.strip())
-    with open(os.path.join(args.work_dir, 'sample', 'negative.txt'), 'r', encoding='utf-8') as file:
+    with open(os.path.join(args.work_dir, 'jifu', 'negative.txt'), 'r', encoding='utf-8') as file:
         negative_sample = []
         for line in file:
             negative_sample.append(line.strip())
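
The sample files now live under the jifu subdirectory instead of sample; a standalone sketch of the same read loop (paths illustrative):

```python
import os

work_dir = '/home/chat_demo/work_dir/'  # new default from the diff above

# One query per line, stripped, exactly as the __main__ block reads them.
with open(os.path.join(work_dir, 'jifu', 'positive.txt'), 'r', encoding='utf-8') as file:
    positive_sample = [line.strip() for line in file]
with open(os.path.join(work_dir, 'jifu', 'negative.txt'), 'r', encoding='utf-8') as file:
    negative_sample = [line.strip() for line in file]
```
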
llm_service/inferencer.py

@@ -23,34 +23,45 @@ def build_history_messages(prompt, history, system: str = None):
 class InferenceWrapper:
-    def __init__(self, model_path: str, use_vllm: bool, stream_chat: bool):
+    def __init__(self, model_path: str, use_vllm: bool, stream_chat: bool, tensor_parallel_size: int):
         self.use_vllm = use_vllm
         self.stream_chat = stream_chat
         # huggingface
         self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-        self.model = AutoModelForCausalLM.from_pretrained(model_path,
-                                                          trust_remote_code=True,
-                                                          device_map='auto',
-                                                          torch_dtype=torch.bfloat16).eval()
+        # self.model = AutoModelForCausalLM.from_pretrained(model_path,
+        #                                                   trust_remote_code=True,
+        #                                                   torch_dtype=torch.float16).cuda().eval()
+        model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()
+        self.model = model.eval()
         if self.use_vllm:
-            try:
-                # fastllm
-                from fastllm_pytools import llm
+            ## vllm
+            # from vllm import LLM, SamplingParams
+            #
+            # self.sampling_params = SamplingParams(temperature=1, top_p=0.95)
+            # self.llm = LLM(model=model_path,
+            #                trust_remote_code=True,
+            #                enforce_eager=True,
+            #                tensor_parallel_size=tensor_parallel_size)
+            ## fastllm
+            from fastllm_pytools import llm
+            try:
                 if self.stream_chat:
                     # fastllm streaming initialization
                     self.model = llm.model(model_path)
                 else:
-                    self.model = llm.from_hf(self.model, self.tokenizer, dtype="float16").cuda()
+                    self.model = llm.from_hf(self.model, self.tokenizer, dtype="float16")
             except Exception as e:
                 logger.error(f"fastllm initial failed, {e}")

     def chat(self, prompt: str, history=[]):
-        '''Q&A.'''
+        '''Single-turn Q&A.'''
         output_text = ''
         try:
             if self.use_vllm:
                 output_text = self.model.response(prompt)
             else:
                 output_text, _ = self.model.chat(self.tokenizer,
...

@@ -87,13 +98,15 @@ class LLMInference:
                  model_path: str,
                  tensor_parallel_size: int,
                  device: str = 'cuda',
-                 use_vllm: bool = False
+                 use_vllm: bool = False,
+                 stream_chat: bool = False
                  ) -> None:
         self.device = device
-        self.inference = InferenceWrapper(model_path,
-                                          use_vllm=use_vllm)
+        self.inference = InferenceWrapper(model_path=model_path,
+                                          use_vllm=use_vllm,
+                                          stream_chat=stream_chat,
+                                          tensor_parallel_size=tensor_parallel_size)

     def generate_response(self, prompt, history=[]):
...

@@ -126,6 +139,7 @@ def llm_inference(args):
     use_vllm = config.getboolean('llm', 'use_vllm')
     inference_wrapper = InferenceWrapper(model_path,
                                          use_vllm=use_vllm,
+                                         tensor_parallel_size=1,
                                          stream_chat=args.stream_chat)

     async def inference(request):
         start = time.time()
...

@@ -161,7 +175,7 @@ def parse_args():
         description='Feature store for processing directories.')
     parser.add_argument('--config_path',
-                        default='/path/of/config.ini',
+                        default='../config.ini',
                         help='Path to the config file.')
     parser.add_argument('--query',
...

@@ -170,7 +184,7 @@ def parse_args():
     parser.add_argument('--DCU_ID',
                         type=str,
-                        default='0',
+                        default='1',
                         help='DCU card IDs, comma-separated, e.g. "0,1,2".')
     parser.add_argument('--stream_chat',
...
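
A minimal usage sketch of the updated constructor; the model path is a placeholder, and the fastllm branch is only exercised when use_vllm is true:

```python
from llm_service.inferencer import InferenceWrapper

# tensor_parallel_size is now an explicit constructor argument.
wrapper = InferenceWrapper(model_path='/path/to/your/internlm-chat-7b',
                           use_vllm=False,   # plain HuggingFace inference
                           stream_chat=False,
                           tensor_parallel_size=1)
reply = wrapper.chat('你好')  # single-turn Q&A
```
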
llm_service/worker.py

@@ -14,11 +14,12 @@ class ChatAgent:
         reject_throttle = float(config['feature_database']['reject_throttle'])
         local_llm_path = config['llm']['local_llm_path']
         use_vllm = config.getboolean('llm', 'use_vllm')
+        stream_chat = config.getboolean('llm', 'stream_chat')
         self.retriever = CacheRetriever(self.embedding_model_path,
                                         self.reranker_model_path).get(
                                             reject_throttle=reject_throttle,
                                             work_dir=self.work_dir)
-        self.llm_server = LLMInference(local_llm_path, tensor_parallel_size, use_vllm=use_vllm)
+        self.llm_server = LLMInference(model_path=local_llm_path, tensor_parallel_size=tensor_parallel_size, use_vllm=use_vllm, stream_chat=stream_chat)

     def generate_prompt(self, history_pair,
...
server_start.py

@@ -13,6 +13,56 @@ divisible_by_32 = [1, 2, 4, 8, 16, 32]
 recv_file_path = "%s/upload"

+def auto_select_dcu(config):
+    # Read the thresholds from the config file
+    mem_threshold = config.getint('default', 'mem_threshold')
+    dcu_threshold = config.getint('default', 'dcu_threshold')
+    # Get DCU usage
+    process = subprocess.Popen(
+        "hy-smi | grep '^[0-9]' | awk '{print $1,$6,$7}' | sed 's/%//g'",
+        shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    output, error = process.communicate()
+    result = output.decode().strip().split('\n')
+    if not result:
+        raise Exception("There is no dcu on this node.")
+    dcu_map = {}
+    for line in result:
+        dcu_info = line.split()
+        if int(dcu_info[1]) >= mem_threshold or int(dcu_info[2]) >= dcu_threshold:
+            logger.debug("filter dcu:%s, which mem usage (%s) and dcu usage (%s) above the threshold."
+                         % (dcu_info[0], dcu_info[1], dcu_info[2]))
+            continue
+        logger.debug("dcu id:%s, mem usage: %s dcu usage: %s"
+                     % (dcu_info[0], dcu_info[1], dcu_info[2]))
+        dcu_map[dcu_info[0]] = [int(dcu_info[1]), int(dcu_info[2])]
+    # The selected dcu count must be one of the values in divisible_by_32.
+    # TODO: temporarily use 40% of the available count
+    count = round(len(dcu_map.keys()) * 0.4)
+    if not count:
+        logger.error("There is no available dcu device, can not start the service.")
+        raise Exception("There is no available dcu device, can not start the service.")
+    insert_index = bisect.bisect_left(divisible_by_32, count)
+    if insert_index > 0 and count != divisible_by_32[insert_index]:
+        index = insert_index - 1
+    elif count == divisible_by_32[insert_index]:
+        index = insert_index
+    else:
+        index = 0
+    select_count = divisible_by_32[index]
+    # Rank devices by combined memory and dcu usage.
+    dcu_mem_use_rank = [item[0] for item in dcu_map.values()]
+    dcu_use_rank = [item[1] for item in dcu_map.values()]
+    # Calculate the final ranking
+    final_rank = [(name, dcu_mem_use_rank[i] + dcu_use_rank[i])
+                  for i, name in enumerate(dcu_map.keys())]
+    sorted_rank = sorted(final_rank, key=lambda x: x[1])
+    sorted_dcu_ids = [item[0] for item in sorted_rank]
+    select_dcu_ids = sorted_dcu_ids[:select_count]
+    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(map(str, select_dcu_ids))
+    logger.info(f"Set environment variable CUDA_VISIBLE_DEVICES to {select_dcu_ids}")
+    return select_dcu_ids
+
 def workflow(args):
     config = configparser.ConfigParser()
     config.read(args.config_path)
...
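
To make the selection policy concrete, a small self-contained replay of it on hand-written hy-smi fields (device id, memory %, utilization %); the sample numbers are invented:

```python
import bisect

divisible_by_32 = [1, 2, 4, 8, 16, 32]
mem_threshold, dcu_threshold = 50, 100  # defaults from config.ini

# Invented sample rows in the shape the hy-smi pipeline above produces.
rows = ["0 12 3", "1 80 40", "2 5 1", "3 30 20"]

dcu_map = {}
for row in rows:
    dev, mem, use = row.split()
    if int(mem) >= mem_threshold or int(use) >= dcu_threshold:
        continue  # device 1 is filtered (80% memory usage)
    dcu_map[dev] = [int(mem), int(use)]

count = round(len(dcu_map) * 0.4)  # 40% of 3 available devices -> 1
insert_index = bisect.bisect_left(divisible_by_32, count)
if insert_index > 0 and count != divisible_by_32[insert_index]:
    index = insert_index - 1
elif count == divisible_by_32[insert_index]:
    index = insert_index
else:
    index = 0
select_count = divisible_by_32[index]  # 1

# Least-loaded first: rank devices by mem% + util%, then take select_count.
ranked = sorted(dcu_map, key=lambda d: dcu_map[d][0] + dcu_map[d][1])
print(ranked[:select_count])  # ['2']
```
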
@@ -63,56 +113,6 @@ def workflow(args):
     web.run_app(app, host='0.0.0.0', port=bind_port)

-def auto_select_dcu(config):
-    # ... 50-line body elided: identical to the copy added above workflow(),
-    # except for a stale commented-out line
-    # ("#insert_index = bisect.bisect_left(divisible_by_32, len(dcu_map.keys()))")
-    # that was dropped in the move.

 def parse_args():
     parser = argparse.ArgumentParser(description='Start all services.')
     parser.add_argument('--config_path',
...
技服-接口文档.docx  (new file, mode 100644)

File added