Commit cefce2e9 authored by Rayyyyy

update

parent 241da631
@@ -5,9 +5,11 @@
### Docker (Option 1)
Adjust the -v path, docker_name, and imageID to match your environment.
```bash
-docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk23.10.1-py38
+docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310
docker run -it -v /path/your_code_data/:/path/your_code_data/ --shm-size=80G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash
# Load the runtime environment variables
+unzip dtk-cuda.zip
+mv ./cuda /opt/dtk/
source /opt/dtk/cuda/env.sh
# Download the fastllm library
git clone http://developer.hpccube.com/codes/OpenDAS/fastllm.git
@@ -21,32 +23,32 @@ make -j
cd tools  # now inside fastllm/build/tools
python setup.py install
pip install transformers==4.28.0 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
pip install accelerate sentencepiece mdtex2html gradio rouge_chinese nltk jieba datasets protobuf peft==0.5.0 pydantic==1.10.9 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
+cd /path/of/chat_demo
+pip install faiss-1.7.2_dtk24.04_gitb7348e7df780-py3-none-any.whl
+pip install -r requirements.txt
```
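After the environment is loaded, a quick sanity check helps confirm that torch and fastllm were installed against the DCU stack. This is an editorial sketch, not part of the commit; it assumes only the packages installed by the steps above:
```python
# Editorial sanity check for the environment built above (not part of the commit).
import torch

print(torch.__version__)           # expect 2.1.0 for dtk24.04
print(torch.cuda.is_available())   # DCUs surface through the CUDA/HIP device API
print(torch.cuda.device_count())   # number of visible DCU devices

# Raises ImportError if `python setup.py install` in fastllm/build/tools failed.
from fastllm_pytools import llm
```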
### Dockerfile (Option 2)
```
-docker build -t chatglm2:latest .
-docker run -dit --network=host --name=chatglm2 --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 chatglm2:latest
-docker exec -it chatglm2 /bin/bash
+docker build -t chat_demo:latest .
+docker run -dit --network=host --name=chat_demo --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 chat_demo:latest
+docker exec -it chat_demo /bin/bash
```
### Conda (Option 3)
+1. Create a conda virtual environment:
+```
+conda create -n chatglm python=3.10
+# the remaining steps are the same as Docker (Option 1) above
+```
-The toolkits and deep-learning libraries this project needs for DCU cards can all be downloaded from the [光合](https://developer.hpccube.com/tool/) developer community.
-- [DTK 23.04](https://cancon.hpccube.com:65024/1/main/DTK-23.04.1)
-- [Pytorch 1.13.1](https://cancon.hpccube.com:65024/4/main/pytorch/dtk23.04)
-- [Deepspeed 0.9.2](https://cancon.hpccube.com:65024/4/main/deepspeed/dtk23.04)
-Tips: the dtk driver, python, deepspeed and other tool versions above must correspond strictly one-to-one.
+2. The toolkits and deep-learning libraries this project needs for DCU cards can all be downloaded from the [光合](https://developer.hpccube.com/tool/) developer community.
+```bash
+DTK driver: dtk24.04
+python: python3.10
+torch: 2.1.0
+```
+`Tips: the dtk driver, python, torch and other tool versions above must correspond strictly one-to-one.`
+3. Install the remaining dependencies per requirements.txt:
+```
+pip install faiss-1.7.2_dtk24.04_gitb7348e7df780-py3-none-any.whl
+pip install -r requirements.txt
+```
\ No newline at end of file
@@ -5,10 +5,11 @@ mem_threshold=50
dcu_threshold=100
[feature_database]
-reject_throttle=0.6165309870679363
+reject_throttle=0.82
embedding_model_path=/path/to/your/text2vec-large-chinese
reranker_model_path=/path/to/your/bce-reranker-base_v1
[llm]
local_llm_path=/path/to/your/internlm-chat-7b
-use_vllm=False
\ No newline at end of file
+use_vllm=False
+stream_chat=False
\ No newline at end of file
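For orientation, this is how the keys above are consumed elsewhere in this commit (ChatAgent reads them with configparser); the snippet below is a minimal standalone sketch using the same section and key names:
```python
import configparser

config = configparser.ConfigParser()
config.read('config.ini')  # the file shown above

# [feature_database] keys
reject_throttle = float(config['feature_database']['reject_throttle'])  # 0.82
embedding_model_path = config['feature_database']['embedding_model_path']

# [llm] keys, including the stream_chat flag this commit introduces
local_llm_path = config['llm']['local_llm_path']
use_vllm = config.getboolean('llm', 'use_vllm')        # parses "False" -> False
stream_chat = config.getboolean('llm', 'stream_chat')
```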
+FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310
\ No newline at end of file
File added
@@ -461,16 +461,16 @@ def parse_args():
        description='Feature store for processing directories.')
    parser.add_argument('--work_dir',
                        type=str,
-                        default='/ai/work_dir',
+                        default='/home/chat_demo/work_dir/',
                        help='Working directory.')
    parser.add_argument(
        '--repo_dir',
        type=str,
-        default='',
+        default='/home/chat_demo/work_dir/jifu/original',
        help='Directory of files to ingest.')
    parser.add_argument(
        '--config_path',
-        default='/home/AI_project/chat_demo/config.ini',
+        default='/home/chat_demo/config.ini',
        help='Path to config.ini.')
    args = parser.parse_args()
    return args
@@ -511,12 +511,12 @@ if __name__ == '__main__':
    # with open(os.path.join(args.work_dir, 'sample', 'negative.json')) as f:
    #     negative_sample = json.load(f)
-    with open(os.path.join(args.work_dir, 'sample', 'positive.txt'), 'r', encoding='utf-8') as file:
+    with open(os.path.join(args.work_dir, 'jifu', 'positive.txt'), 'r', encoding='utf-8') as file:
        positive_sample = []
        for line in file:
            positive_sample.append(line.strip())
-    with open(os.path.join(args.work_dir, 'sample', 'negative.txt'), 'r', encoding='utf-8') as file:
+    with open(os.path.join(args.work_dir, 'jifu', 'negative.txt'), 'r', encoding='utf-8') as file:
        negative_sample = []
        for line in file:
            negative_sample.append(line.strip())
@@ -23,34 +23,45 @@ def build_history_messages(prompt, history, system: str = None):
class InferenceWrapper:
-    def __init__(self, model_path: str, use_vllm: bool, stream_chat: bool):
+    def __init__(self, model_path: str, use_vllm: bool, stream_chat: bool, tensor_parallel_size: int):
        self.use_vllm = use_vllm
        self.stream_chat = stream_chat
        # huggingface
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-        self.model = AutoModelForCausalLM.from_pretrained(model_path,
-                                                          trust_remote_code=True,
-                                                          device_map='auto',
-                                                          torch_dtype=torch.bfloat16).eval()
+        # self.model = AutoModelForCausalLM.from_pretrained(model_path,
+        #                                                   trust_remote_code=True,
+        #                                                   torch_dtype=torch.float16).cuda().eval()
+        model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()
+        self.model = model.eval()
        if self.use_vllm:
            ## vllm
            # from vllm import LLM, SamplingParams
            #
            # self.sampling_params = SamplingParams(temperature=1, top_p=0.95)
            # self.llm = LLM(model=model_path,
            #                trust_remote_code=True,
            #                enforce_eager=True,
            #                tensor_parallel_size=tensor_parallel_size)
            ## fastllm
-            from fastllm_pytools import llm
-            self.model = llm.from_hf(self.model, self.tokenizer, dtype="float16").cuda()
+            try:
+                # fastllm
+                from fastllm_pytools import llm
+                if self.stream_chat:
+                    # fastllm streaming initialization
+                    self.model = llm.model(model_path)
+                else:
+                    self.model = llm.from_hf(self.model, self.tokenizer, dtype="float16")
+            except Exception as e:
+                logger.error(f"fastllm initial failed, {e}")
    def chat(self, prompt: str, history=[]):
-        '''Q&A'''
+        '''Single-turn Q&A'''
        output_text = ''
        try:
            if self.use_vllm:
                output_text = self.model.response(prompt)
            else:
                output_text, _ = self.model.chat(self.tokenizer,
@@ -87,13 +98,15 @@ class LLMInference:
                 model_path: str,
                 tensor_parallel_size: int,
                 device: str = 'cuda',
-                 use_vllm: bool = False
+                 use_vllm: bool = False,
+                 stream_chat: bool = False
                 ) -> None:
        self.device = device
-        self.inference = InferenceWrapper(model_path,
+        self.inference = InferenceWrapper(model_path=model_path,
                                          use_vllm=use_vllm,
+                                          stream_chat=stream_chat,
                                          tensor_parallel_size=tensor_parallel_size)

    def generate_response(self, prompt, history=[]):
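Taken together, the new signatures thread stream_chat and tensor_parallel_size from the config down into the wrapper. A hedged usage sketch (editorial, not part of the diff; the model path is a placeholder, while the argument names are exactly those shown above):
```python
# Editorial usage sketch; the model path is a placeholder.
wrapper = InferenceWrapper(model_path='/path/to/your/internlm-chat-7b',
                           use_vllm=False,
                           stream_chat=False,
                           tensor_parallel_size=1)
answer = wrapper.chat('你好')  # single-turn Q&A, returns the generated text

# Or through the higher-level server class, mirroring how ChatAgent calls it below:
server = LLMInference(model_path='/path/to/your/internlm-chat-7b',
                      tensor_parallel_size=1,
                      use_vllm=False,
                      stream_chat=False)
```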
@@ -126,6 +139,7 @@ def llm_inference(args):
    use_vllm = config.getboolean('llm', 'use_vllm')
    inference_wrapper = InferenceWrapper(model_path,
                                         use_vllm=use_vllm,
+                                         tensor_parallel_size=1,
                                         stream_chat=args.stream_chat)

    async def inference(request):
        start = time.time()
@@ -161,7 +175,7 @@ def parse_args():
        description='Feature store for processing directories.')
    parser.add_argument(
        '--config_path',
-        default='/path/of/config.ini',
+        default='../config.ini',
        help='Path to config.ini.')
    parser.add_argument(
        '--query',
@@ -170,7 +184,7 @@ def parse_args():
    parser.add_argument(
        '--DCU_ID',
        type=str,
-        default='0',
+        default='1',
        help='DCU card IDs, separated by commas, e.g. "0,1,2"')
    parser.add_argument(
        '--stream_chat',
@@ -14,11 +14,12 @@ class ChatAgent:
        reject_throttle = float(config['feature_database']['reject_throttle'])
        local_llm_path = config['llm']['local_llm_path']
        use_vllm = config.getboolean('llm', 'use_vllm')
+        stream_chat = config.getboolean('llm', 'stream_chat')
        self.retriever = CacheRetriever(self.embedding_model_path,
                                        self.reranker_model_path).get(reject_throttle=reject_throttle,
                                                                      work_dir=self.work_dir)
-        self.llm_server = LLMInference(local_llm_path, tensor_parallel_size, use_vllm=use_vllm)
+        self.llm_server = LLMInference(model_path=local_llm_path, tensor_parallel_size=tensor_parallel_size, use_vllm=use_vllm, stream_chat=stream_chat)

    def generate_prompt(self,
                        history_pair,
@@ -13,6 +13,56 @@ divisible_by_32 = [1, 2, 4, 8, 16, 32]
recv_file_path = "%s/upload"

+def auto_select_dcu(config):
+    # Read thresholds from the config file
+    mem_threshold = config.getint('default', 'mem_threshold')
+    dcu_threshold = config.getint('default', 'dcu_threshold')
+    # Get per-device memory and utilization from hy-smi
+    process = subprocess.Popen("hy-smi | grep '^[0-9]' | awk '{print $1,$6,$7}' | sed 's/%//g'", shell=True,
+                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    output, error = process.communicate()
+    result = output.decode().strip().split('\n')
+    if not result or not result[0]:  # split('\n') on empty output yields ['']
+        raise Exception("There is no dcu on this node.")
+    dcu_map = {}
+    for line in result:
+        dcu_info = line.split()
+        if int(dcu_info[1]) >= mem_threshold or int(dcu_info[2]) >= dcu_threshold:
+            logger.debug("filtered dcu %s: mem usage (%s) or dcu usage (%s) is above the threshold." % (dcu_info[0],
+                                                                                                        dcu_info[1],
+                                                                                                        dcu_info[2]))
+            continue
+        logger.debug("dcu id: %s, mem usage: %s, dcu usage: %s" % (dcu_info[0], dcu_info[1], dcu_info[2]))
+        dcu_map[dcu_info[0]] = [int(dcu_info[1]), int(dcu_info[2])]
+    # The selected dcu count must be a divisor of 32 (1, 2, 4, 8, 16 or 32).
+    # TODO temporarily use 40% of the available count
+    count = round(len(dcu_map.keys()) * 0.4)
+    if not count:
+        logger.error("There is no available dcu device, can not start the service.")
+        raise Exception("There is no available dcu device, can not start the service.")
+    insert_index = bisect.bisect_left(divisible_by_32, count)
+    if insert_index == len(divisible_by_32):  # count > 32: clamp to the largest size
+        index = insert_index - 1
+    elif count == divisible_by_32[insert_index]:
+        index = insert_index
+    elif insert_index > 0:
+        index = insert_index - 1
+    else:
+        index = 0
+    select_count = divisible_by_32[index]
+    # Rank devices by combined memory and dcu usage.
+    dcu_mem_use_rank = [item[0] for item in dcu_map.values()]
+    dcu_use_rank = [item[1] for item in dcu_map.values()]
+    # Calculate the final ranking
+    final_rank = [(name, dcu_mem_use_rank[i] + dcu_use_rank[i]) for i, name in enumerate(dcu_map.keys())]
+    sorted_rank = sorted(final_rank, key=lambda x: x[1])
+    sorted_dcu_ids = [item[0] for item in sorted_rank]
+    select_dcu_ids = sorted_dcu_ids[:select_count]
+    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(map(str, select_dcu_ids))
+    logger.info(f"Set environment variable CUDA_VISIBLE_DEVICES to {select_dcu_ids}")
+    return select_dcu_ids
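The size-snapping logic is easiest to see with concrete numbers. A standalone worked example (editorial, with hypothetical device counts, mirroring the bisect logic above):
```python
import bisect

divisible_by_32 = [1, 2, 4, 8, 16, 32]  # allowed selection sizes

def snap(count):
    """Mirror of the index selection above: snap count down to an allowed size."""
    i = bisect.bisect_left(divisible_by_32, count)
    if i == len(divisible_by_32):
        return divisible_by_32[-1]
    if count == divisible_by_32[i]:
        return divisible_by_32[i]
    return divisible_by_32[i - 1] if i > 0 else divisible_by_32[0]

# Hypothetical node: 10 idle DCUs -> round(10 * 0.4) = 4, already an allowed size.
print(snap(round(10 * 0.4)))  # 4
# 7 idle DCUs -> round(7 * 0.4) = 3, not allowed, snapped down to 2.
print(snap(round(7 * 0.4)))   # 2
```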
def workflow(args):
    config = configparser.ConfigParser()
    config.read(args.config_path)
@@ -63,56 +113,6 @@ def workflow(args):
    web.run_app(app, host='0.0.0.0', port=bind_port)
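Once workflow() is serving, the model can be queried over HTTP. The route, port, and payload shape below are editorial assumptions (the diff shows only the aiohttp run_app call and an async inference(request) handler), so treat this purely as a sketch:
```python
# Editorial sketch: route '/inference', port 8888, and the JSON payload shape
# are assumptions; only host 0.0.0.0 and the aiohttp server appear in the diff.
import json
import urllib.request

payload = json.dumps({'query': '你好'}).encode('utf-8')
req = urllib.request.Request('http://127.0.0.1:8888/inference', data=payload,
                             headers={'Content-Type': 'application/json'})
with urllib.request.urlopen(req) as resp:
    print(resp.read().decode('utf-8'))
```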
-def auto_select_dcu(config):
-    # ... (body identical to the copy added above, apart from one stale
-    # commented-out bisect line; the function now lives ahead of workflow())
def parse_args():
    parser = argparse.ArgumentParser(description='Start all services.')
    parser.add_argument('--config_path',