Commit cefce2e9 authored by Rayyyyy

update

parent 241da631
@@ -5,9 +5,11 @@
### Docker (Option 1)
Adjust the -v path, docker_name, and imageID to match your environment.
```bash
-docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk23.10.1-py38
+docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310
docker run -it -v /path/your_code_data/:/path/your_code_data/ --shm-size=80G --privileged=true --device=/dev/kfd --device=/dev/dri/ --group-add video --name docker_name imageID bash
# Load the runtime environment variables
+unzip dtk-cuda.zip
+mv ./cuda /opt/dtk/
source /opt/dtk/cuda/env.sh
# Download the fastllm library
git clone http://developer.hpccube.com/codes/OpenDAS/fastllm.git
@@ -21,32 +23,32 @@ make -j
cd tools  # now inside fastllm/build/tools
python setup.py install
pip install transformers==4.28.0 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
pip install accelerate sentencepiece mdtex2html gradio rouge_chinese nltk jieba datasets protobuf peft==0.5.0 pydantic==1.10.9 -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
+cd /path/of/chat_demo
+pip install faiss-1.7.2_dtk24.04_gitb7348e7df780-py3-none-any.whl
+pip install -r requirements.txt
```
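After the environment is loaded, a quick sanity check helps confirm that torch and fastllm were installed against the DCU stack. This is an editorial sketch, not part of the commit; it assumes only the packages installed by the steps above:
```python
# Editorial sanity check for the environment built above (not part of the commit).
import torch

print(torch.__version__)           # expect 2.1.0 for dtk24.04
print(torch.cuda.is_available())   # DCUs surface through the CUDA/HIP device API
print(torch.cuda.device_count())   # number of visible DCU devices

# Raises ImportError if `python setup.py install` in fastllm/build/tools failed.
from fastllm_pytools import llm
```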
### Dockerfile (Option 2)
```
-docker build -t chatglm2:latest .
-docker run -dit --network=host --name=chatglm2 --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 chatglm2:latest
-docker exec -it chatglm2 /bin/bash
+docker build -t chat_demo:latest .
+docker run -dit --network=host --name=chat_demo --privileged --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size=16G --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -u root --ulimit stack=-1:-1 --ulimit memlock=-1:-1 chat_demo:latest
+docker exec -it chat_demo /bin/bash
```
### Conda (Option 3)
+1. Create a conda virtual environment:
+```
+conda create -n chatglm python=3.10
+# the remaining steps are the same as Docker (Option 1) above
+```
-The toolkits and deep-learning libraries this project needs for DCU cards can all be downloaded from the [光合](https://developer.hpccube.com/tool/) developer community.
-- [DTK 23.04](https://cancon.hpccube.com:65024/1/main/DTK-23.04.1)
-- [Pytorch 1.13.1](https://cancon.hpccube.com:65024/4/main/pytorch/dtk23.04)
-- [Deepspeed 0.9.2](https://cancon.hpccube.com:65024/4/main/deepspeed/dtk23.04)
-Tips: the dtk driver, python, deepspeed and other tool versions above must correspond strictly one-to-one.
+2. The toolkits and deep-learning libraries this project needs for DCU cards can all be downloaded from the [光合](https://developer.hpccube.com/tool/) developer community.
+```bash
+DTK driver: dtk24.04
+python: python3.10
+torch: 2.1.0
+```
+`Tips: the dtk driver, python, torch and other tool versions above must correspond strictly one-to-one.`
+3. Install the remaining dependencies per requirements.txt:
+```
+pip install faiss-1.7.2_dtk24.04_gitb7348e7df780-py3-none-any.whl
+pip install -r requirements.txt
+```
\ No newline at end of file
@@ -5,10 +5,11 @@ mem_threshold=50
dcu_threshold=100
[feature_database]
-reject_throttle=0.6165309870679363
+reject_throttle=0.82
embedding_model_path=/path/to/your/text2vec-large-chinese
reranker_model_path=/path/to/your/bce-reranker-base_v1
[llm]
local_llm_path=/path/to/your/internlm-chat-7b
-use_vllm=False
\ No newline at end of file
+use_vllm=False
+stream_chat=False
\ No newline at end of file
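For orientation, this is how the keys above are consumed elsewhere in this commit (ChatAgent reads them with configparser); the snippet below is a minimal standalone sketch using the same section and key names:
```python
import configparser

config = configparser.ConfigParser()
config.read('config.ini')  # the file shown above

# [feature_database] keys
reject_throttle = float(config['feature_database']['reject_throttle'])  # 0.82
embedding_model_path = config['feature_database']['embedding_model_path']

# [llm] keys, including the stream_chat flag this commit introduces
local_llm_path = config['llm']['local_llm_path']
use_vllm = config.getboolean('llm', 'use_vllm')        # parses "False" -> False
stream_chat = config.getboolean('llm', 'stream_chat')
```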
+FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk24.04-py310
\ No newline at end of file
File added
@@ -461,16 +461,16 @@ def parse_args():
        description='Feature store for processing directories.')
    parser.add_argument('--work_dir',
                        type=str,
-                        default='/ai/work_dir',
+                        default='/home/chat_demo/work_dir/',
                        help='Working directory.')
    parser.add_argument(
        '--repo_dir',
        type=str,
-        default='',
+        default='/home/chat_demo/work_dir/jifu/original',
        help='Directory of files to ingest.')
    parser.add_argument(
        '--config_path',
-        default='/home/AI_project/chat_demo/config.ini',
+        default='/home/chat_demo/config.ini',
        help='Path to config.ini.')
    args = parser.parse_args()
    return args
@@ -511,12 +511,12 @@ if __name__ == '__main__':
    # with open(os.path.join(args.work_dir, 'sample', 'negative.json')) as f:
    #     negative_sample = json.load(f)
-    with open(os.path.join(args.work_dir, 'sample', 'positive.txt'), 'r', encoding='utf-8') as file:
+    with open(os.path.join(args.work_dir, 'jifu', 'positive.txt'), 'r', encoding='utf-8') as file:
        positive_sample = []
        for line in file:
            positive_sample.append(line.strip())
-    with open(os.path.join(args.work_dir, 'sample', 'negative.txt'), 'r', encoding='utf-8') as file:
+    with open(os.path.join(args.work_dir, 'jifu', 'negative.txt'), 'r', encoding='utf-8') as file:
        negative_sample = []
        for line in file:
            negative_sample.append(line.strip())
@@ -23,34 +23,45 @@ def build_history_messages(prompt, history, system: str = None):
class InferenceWrapper:
-    def __init__(self, model_path: str, use_vllm: bool, stream_chat: bool):
+    def __init__(self, model_path: str, use_vllm: bool, stream_chat: bool, tensor_parallel_size: int):
        self.use_vllm = use_vllm
        self.stream_chat = stream_chat
        # huggingface
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-        self.model = AutoModelForCausalLM.from_pretrained(model_path,
-                                                          trust_remote_code=True,
-                                                          device_map='auto',
-                                                          torch_dtype=torch.bfloat16).eval()
+        # self.model = AutoModelForCausalLM.from_pretrained(model_path,
+        #                                                   trust_remote_code=True,
+        #                                                   torch_dtype=torch.float16).cuda().eval()
+        model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()
+        self.model = model.eval()
        if self.use_vllm:
            ## vllm
            # from vllm import LLM, SamplingParams
            #
            # self.sampling_params = SamplingParams(temperature=1, top_p=0.95)
            # self.llm = LLM(model=model_path,
            #                trust_remote_code=True,
            #                enforce_eager=True,
            #                tensor_parallel_size=tensor_parallel_size)
            ## fastllm
-            from fastllm_pytools import llm
-            self.model = llm.from_hf(self.model, self.tokenizer, dtype="float16").cuda()
+            try:
+                # fastllm
+                from fastllm_pytools import llm
+                if self.stream_chat:
+                    # fastllm streaming initialization
+                    self.model = llm.model(model_path)
+                else:
+                    self.model = llm.from_hf(self.model, self.tokenizer, dtype="float16")
+            except Exception as e:
+                logger.error(f"fastllm initial failed, {e}")
    def chat(self, prompt: str, history=[]):
-        '''Q&A'''
+        '''Single-turn Q&A'''
        output_text = ''
        try:
            if self.use_vllm:
                output_text = self.model.response(prompt)
            else:
                output_text, _ = self.model.chat(self.tokenizer,
@@ -87,13 +98,15 @@ class LLMInference:
                 model_path: str,
                 tensor_parallel_size: int,
                 device: str = 'cuda',
-                 use_vllm: bool = False
+                 use_vllm: bool = False,
+                 stream_chat: bool = False
                 ) -> None:
        self.device = device
-        self.inference = InferenceWrapper(model_path,
+        self.inference = InferenceWrapper(model_path=model_path,
                                          use_vllm=use_vllm,
+                                          stream_chat=stream_chat,
                                          tensor_parallel_size=tensor_parallel_size)

    def generate_response(self, prompt, history=[]):
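Taken together, the new signatures thread stream_chat and tensor_parallel_size from the config down into the wrapper. A hedged usage sketch (editorial, not part of the diff; the model path is a placeholder, while the argument names are exactly those shown above):
```python
# Editorial usage sketch; the model path is a placeholder.
wrapper = InferenceWrapper(model_path='/path/to/your/internlm-chat-7b',
                           use_vllm=False,
                           stream_chat=False,
                           tensor_parallel_size=1)
answer = wrapper.chat('你好')  # single-turn Q&A, returns the generated text

# Or through the higher-level server class, mirroring how ChatAgent calls it below:
server = LLMInference(model_path='/path/to/your/internlm-chat-7b',
                      tensor_parallel_size=1,
                      use_vllm=False,
                      stream_chat=False)
```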
@@ -126,6 +139,7 @@ def llm_inference(args):
    use_vllm = config.getboolean('llm', 'use_vllm')
    inference_wrapper = InferenceWrapper(model_path,
                                         use_vllm=use_vllm,
+                                         tensor_parallel_size=1,
                                         stream_chat=args.stream_chat)

    async def inference(request):
        start = time.time()
@@ -161,7 +175,7 @@ def parse_args():
        description='Feature store for processing directories.')
    parser.add_argument(
        '--config_path',
-        default='/path/of/config.ini',
+        default='../config.ini',
        help='Path to config.ini.')
    parser.add_argument(
        '--query',
@@ -170,7 +184,7 @@ def parse_args():
    parser.add_argument(
        '--DCU_ID',
        type=str,
-        default='0',
+        default='1',
        help='DCU card IDs, separated by commas, e.g. "0,1,2"')
    parser.add_argument(
        '--stream_chat',
@@ -14,11 +14,12 @@ class ChatAgent:
        reject_throttle = float(config['feature_database']['reject_throttle'])
        local_llm_path = config['llm']['local_llm_path']
        use_vllm = config.getboolean('llm', 'use_vllm')
+        stream_chat = config.getboolean('llm', 'stream_chat')
        self.retriever = CacheRetriever(self.embedding_model_path,
                                        self.reranker_model_path).get(reject_throttle=reject_throttle,
                                                                      work_dir=self.work_dir)
-        self.llm_server = LLMInference(local_llm_path, tensor_parallel_size, use_vllm=use_vllm)
+        self.llm_server = LLMInference(model_path=local_llm_path, tensor_parallel_size=tensor_parallel_size, use_vllm=use_vllm, stream_chat=stream_chat)

    def generate_prompt(self,
                        history_pair,
@@ -13,6 +13,56 @@ divisible_by_32 = [1, 2, 4, 8, 16, 32]
recv_file_path = "%s/upload"

+def auto_select_dcu(config):
+    # Read thresholds from the config file
+    mem_threshold = config.getint('default', 'mem_threshold')
+    dcu_threshold = config.getint('default', 'dcu_threshold')
+    # Get per-device memory and utilization from hy-smi
+    process = subprocess.Popen("hy-smi | grep '^[0-9]' | awk '{print $1,$6,$7}' | sed 's/%//g'", shell=True,
+                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    output, error = process.communicate()
+    result = output.decode().strip().split('\n')
+    if not result or not result[0]:  # split('\n') on empty output yields ['']
+        raise Exception("There is no dcu on this node.")
+    dcu_map = {}
+    for line in result:
+        dcu_info = line.split()
+        if int(dcu_info[1]) >= mem_threshold or int(dcu_info[2]) >= dcu_threshold:
+            logger.debug("filtered dcu %s: mem usage (%s) or dcu usage (%s) is above the threshold." % (dcu_info[0],
+                                                                                                        dcu_info[1],
+                                                                                                        dcu_info[2]))
+            continue
+        logger.debug("dcu id: %s, mem usage: %s, dcu usage: %s" % (dcu_info[0], dcu_info[1], dcu_info[2]))
+        dcu_map[dcu_info[0]] = [int(dcu_info[1]), int(dcu_info[2])]
+    # The selected dcu count must be a divisor of 32 (1, 2, 4, 8, 16 or 32).
+    # TODO temporarily use 40% of the available count
+    count = round(len(dcu_map.keys()) * 0.4)
+    if not count:
+        logger.error("There is no available dcu device, can not start the service.")
+        raise Exception("There is no available dcu device, can not start the service.")
+    insert_index = bisect.bisect_left(divisible_by_32, count)
+    if insert_index == len(divisible_by_32):  # count > 32: clamp to the largest size
+        index = insert_index - 1
+    elif count == divisible_by_32[insert_index]:
+        index = insert_index
+    elif insert_index > 0:
+        index = insert_index - 1
+    else:
+        index = 0
+    select_count = divisible_by_32[index]
+    # Rank devices by combined memory and dcu usage.
+    dcu_mem_use_rank = [item[0] for item in dcu_map.values()]
+    dcu_use_rank = [item[1] for item in dcu_map.values()]
+    # Calculate the final ranking
+    final_rank = [(name, dcu_mem_use_rank[i] + dcu_use_rank[i]) for i, name in enumerate(dcu_map.keys())]
+    sorted_rank = sorted(final_rank, key=lambda x: x[1])
+    sorted_dcu_ids = [item[0] for item in sorted_rank]
+    select_dcu_ids = sorted_dcu_ids[:select_count]
+    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(map(str, select_dcu_ids))
+    logger.info(f"Set environment variable CUDA_VISIBLE_DEVICES to {select_dcu_ids}")
+    return select_dcu_ids
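The size-snapping logic is easiest to see with concrete numbers. A standalone worked example (editorial, with hypothetical device counts, mirroring the bisect logic above):
```python
import bisect

divisible_by_32 = [1, 2, 4, 8, 16, 32]  # allowed selection sizes

def snap(count):
    """Mirror of the index selection above: snap count down to an allowed size."""
    i = bisect.bisect_left(divisible_by_32, count)
    if i == len(divisible_by_32):
        return divisible_by_32[-1]
    if count == divisible_by_32[i]:
        return divisible_by_32[i]
    return divisible_by_32[i - 1] if i > 0 else divisible_by_32[0]

# Hypothetical node: 10 idle DCUs -> round(10 * 0.4) = 4, already an allowed size.
print(snap(round(10 * 0.4)))  # 4
# 7 idle DCUs -> round(7 * 0.4) = 3, not allowed, snapped down to 2.
print(snap(round(7 * 0.4)))   # 2
```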
def workflow(args):
    config = configparser.ConfigParser()
    config.read(args.config_path)
@@ -63,56 +113,6 @@ def workflow(args):
    web.run_app(app, host='0.0.0.0', port=bind_port)
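Once workflow() is serving, the model can be queried over HTTP. The route, port, and payload shape below are editorial assumptions (the diff shows only the aiohttp run_app call and an async inference(request) handler), so treat this purely as a sketch:
```python
# Editorial sketch: route '/inference', port 8888, and the JSON payload shape
# are assumptions; only host 0.0.0.0 and the aiohttp server appear in the diff.
import json
import urllib.request

payload = json.dumps({'query': '你好'}).encode('utf-8')
req = urllib.request.Request('http://127.0.0.1:8888/inference', data=payload,
                             headers={'Content-Type': 'application/json'})
with urllib.request.urlopen(req) as resp:
    print(resp.read().decode('utf-8'))
```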
-def auto_select_dcu(config):
-    # ... (body identical to the copy added above, apart from one stale
-    # commented-out bisect line; the function now lives ahead of workflow())
def parse_args():
    parser = argparse.ArgumentParser(description='Start all services.')
    parser.add_argument('--config_path',