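'''Local LLM inference service.

Loads a HuggingFace chat model (optionally accelerated with fastllm) and
exposes it over an aiohttp HTTP endpoint at POST /inference.
'''
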
import time
import os
import configparser
import argparse
from multiprocessing import Value
from aiohttp import web
import torch
from loguru import logger

from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel


def build_history_messages(prompt, history, system: str = None):
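    '''Build an OpenAI-style messages list from a system prompt, (user, assistant) history pairs, and the new user prompt.'''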
    history_messages = []
    if system is not None and len(system) > 0:
        history_messages.append({'role': 'system', 'content': system})
    for item in history:
        history_messages.append({'role': 'user', 'content': item[0]})
        history_messages.append({'role': 'assistant', 'content': item[1]})
    history_messages.append({'role': 'user', 'content': prompt})
    return history_messages


class InferenceWrapper:

    def __init__(self, model_path: str, use_vllm: bool, stream_chat: bool, tensor_parallel_size: int):
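        '''Load the tokenizer and model; when use_vllm is set, switch to the fastllm backend.'''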
        self.use_vllm = use_vllm
        self.stream_chat = stream_chat
        # huggingface
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        # self.model = AutoModelForCausalLM.from_pretrained(model_path,
        #                                                   trust_remote_code=True,
        #                                                   torch_dtype=torch.float16).cuda().eval()
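        # Load the checkpoint in float16 and move it to the GPU; eval() disables dropout for inference.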
        model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()
        self.model = model.eval()

        if self.use_vllm:
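            # NOTE: despite the flag name, the active accelerated backend below is fastllm;
            # the original vllm setup is kept commented out for reference.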
            ## vllm
            # from vllm import LLM, SamplingParams
            #
            # self.sampling_params = SamplingParams(temperature=1, top_p=0.95)
            # self.llm = LLM(model=model_path,
            #                trust_remote_code=True,
            #                enforce_eager=True,
            #                tensor_parallel_size=tensor_parallel_size)
            ## fastllm
            from fastllm_pytools import llm
            try:
                if self.stream_chat:
                    # fastllm streaming initialization
                    self.model = llm.model(model_path)
                else:
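                    # Convert the already-loaded HuggingFace model to fastllm format for non-streaming use.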
                    self.model = llm.from_hf(self.model, self.tokenizer, dtype="float16")

            except Exception as e:
                logger.error(f"fastllm initial failed, {e}")


    def chat(self, prompt: str, history=[]):
        '''Single-turn Q&A.'''
        output_text = ''
        try:
            if self.use_vllm:
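                # fastllm backend (see __init__): response() returns the full reply in one call.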

                output_text = self.model.response(prompt)
            else:
                output_text, _ = self.model.chat(self.tokenizer,
                                                 prompt,
                                                 history,
                                                 do_sample=False)
        except Exception as e:
            logger.error(f"chat inference failed, {e}")
        return output_text


    def chat_stream(self, prompt: str, history=[]):
        '''Streaming chat service.'''
        if self.use_vllm:
            # fastllm backend: stream_response() yields response chunks as they are generated.
            # Note: the caller-provided history is not forwarded on this path.
            for response in self.model.stream_response(prompt, history=[]):
                yield response
        else:
            # HuggingFace
            current_length = 0
            past_key_values = None
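            # stream_chat() returns the accumulated response each step; yield only the newly generated suffix.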
            for response, _, past_key_values in self.model.stream_chat(self.tokenizer, prompt, history=history,
                                                                past_key_values=past_key_values,
                                                                return_past_key_values=True):
                output_text = response[current_length:]

                yield output_text
                current_length = len(response)


class LLMInference:
    def __init__(self,
                 model_path: str,
                 tensor_parallel_size: int,
                 device: str = 'cuda',
                 use_vllm: bool = False,
                 stream_chat: bool = False
                 ) -> None:

        self.device = device

        self.inference = InferenceWrapper(model_path=model_path,
                                          use_vllm=use_vllm,
                                          stream_chat=stream_chat,
                                          tensor_parallel_size=tensor_parallel_size)

    def generate_response(self, prompt, history=[]):
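        '''Run one chat completion and return (output_text, error); errors are logged and returned rather than raised.'''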
        output_text = ''
        error = ''
        time_tokenizer = time.time()

        try:
            output_text = self.inference.chat(prompt, history)

        except Exception as e:
            error = str(e)
            logger.error(error)

        time_finish = time.time()

        logger.debug('output_text:{} \ntimecost {} '.format(output_text,
            time_finish - time_tokenizer))

        return output_text, error


def llm_inference(args):
    '''Start a web server that receives HTTP requests and generates responses via the local LLM inference service.'''
    config = configparser.ConfigParser()
    config.read(args.config_path)

    bind_port = int(config['default']['bind_port'])
    model_path = config['llm']['local_llm_path']
    use_vllm = config.getboolean('llm', 'use_vllm')
    inference_wrapper = InferenceWrapper(model_path,
                                         use_vllm=use_vllm,
                                         tensor_parallel_size=1,
                                         stream_chat=args.stream_chat)
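
    # POST /inference handler: expects a JSON body with "prompt" and "history" keys;
    # "history" is forwarded to the model as-is.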
    async def inference(request):
        start = time.time()
        input_json = await request.json()

        prompt = input_json['prompt']
        history = input_json['history']
        if args.stream_chat:
            # chat_stream() yields chunks; join them here because web.json_response
            # cannot serialize a generator directly.
            text = ''.join(inference_wrapper.chat_stream(prompt=prompt, history=history))
        else:
            text = inference_wrapper.chat(prompt=prompt, history=history)
        end = time.time()
        logger.debug('question: {} answer: {} \ntimecost {} '.format(prompt, text, end - start))
        return web.json_response({'text': text})

    app = web.Application()
    app.add_routes([web.post('/inference', inference)])
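    # Example request (assuming the server runs locally and bind_port is set to 8888 in config.ini):
    #   curl -X POST http://127.0.0.1:8888/inference \
    #        -H 'Content-Type: application/json' \
    #        -d '{"prompt": "Hello", "history": []}'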
    web.run_app(app, host='0.0.0.0', port=bind_port)


def set_envs(dcu_ids):
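    '''Restrict the visible accelerator cards by setting CUDA_VISIBLE_DEVICES to the given comma-separated IDs.'''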
    try:
        os.environ["CUDA_VISIBLE_DEVICES"] = dcu_ids
        logger.info(f"Set environment variable CUDA_VISIBLE_DEVICES to {dcu_ids}")
    except Exception as e:
        logger.error(f"{e}, but got {dcu_ids}")
        raise ValueError(f"{e}")


def parse_args():
    '''Parse command-line arguments.'''
    parser = argparse.ArgumentParser(
        description='Local LLM inference service.')
    parser.add_argument(
        '--config_path',
        default='../config.ini',
        help='Path to the config file.')
    parser.add_argument(
        '--query',
        default=['请问下产品的服务器保修或保修政策?'],
        help='Question to ask.')
    parser.add_argument(
        '--DCU_ID',
        type=str,
        default='1',
        help='DCU card IDs to use, separated by commas, e.g. "0,1,2"')
    parser.add_argument(
        '--stream_chat',
        action='store_true',
        help='Enable streaming chat mode')
    args = parser.parse_args()
    return args


def main():
    args = parse_args()
    set_envs(args.DCU_ID)
    llm_inference(args)


if __name__ == '__main__':
    main()