Commit 073c3410 authored by Rayyyyy

Fix bugs

parent 7e2f06a3
@@ -12,4 +12,5 @@ reranker_model_path=/path/to/your/bce-reranker-base_v1
[llm]
local_llm_path=/path/to/your/internlm-chat-7b
use_vllm=False
stream_chat=False
\ No newline at end of file
stream_chat=False
tensor_parallel_size=1
\ No newline at end of file
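
For context, the hunk above adds a tensor_parallel_size key to the [llm] section alongside the existing use_vllm and stream_chat flags. A minimal sketch of reading that section with Python's configparser, mirroring the getboolean/getint calls that appear later in this diff (the config file name and the fallback defaults are assumptions, not part of this commit):

import configparser

# Sketch only: read the [llm] section updated by this commit.
# Key names come from the diff; 'config.ini' and the fallbacks are assumptions.
config = configparser.ConfigParser()
config.read('config.ini')

model_path = config['llm']['local_llm_path']
use_vllm = config.getboolean('llm', 'use_vllm', fallback=False)
stream_chat = config.getboolean('llm', 'stream_chat', fallback=False)
tensor_parallel_size = config.getint('llm', 'tensor_parallel_size', fallback=1)
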
@@ -68,7 +68,7 @@ class LLMInference:
def __init__(self,
model,
tokenzier,
tokenizer,
device: str = 'cuda',
use_vllm: bool = False,
stream_chat: bool = False
@@ -76,7 +76,7 @@ class LLMInference:
self.device = device
self.model = model
self.tokenzier = tokenzier
self.tokenizer = tokenizer
self.use_vllm = use_vllm
self.stream_chat = stream_chat
@@ -170,16 +170,14 @@ class LLMInference:
def init_model(model_path, use_vllm=False, tp_size=1):
## init models
# huggingface
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True,device_map="auto").half().cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
if use_vllm:
try:
# vllm
from vllm import LLM, SamplingParams
tokenizer = SamplingParams(temperature=1,
sampling_params = SamplingParams(temperature=1,
top_p=0.95,
max_tokens=1024,
stop_token_ids=[tokenizer.eos_token_id])
@@ -189,10 +187,13 @@ def init_model(model_path, use_vllm=False, tp_size=1):
enforce_eager=True,
dtype="float16",
tensor_parallel_size=tp_size)
return model, sampling_params
except Exception as e:
logger.error(f"fastllm initial failed, {e}")
return model, tokenizer
logger.error(f"vllm initial failed, {e}")
else:
# huggingface
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).half().cuda().eval()
return model, tokenizer
def llm_inference(args):
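
A hedged usage sketch of the corrected init_model / LLMInference wiring, following the calling code shown in the hunks below; the model path and query string are placeholders, and chat() is assumed to take the prompt directly as it does in infer_test:

# Sketch only: wire init_model into LLMInference the way the callers in this diff do.
model_path = '/path/to/your/internlm-chat-7b'   # placeholder path from the sample config
model, tokenizer = init_model(model_path, use_vllm=False, tp_size=1)

inference = LLMInference(model,
                         tokenizer,
                         use_vllm=False,
                         stream_chat=False)
output_text = inference.chat('hello, please introduce yourself')  # placeholder query
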
@@ -204,13 +205,13 @@ def llm_inference(args):
model_path = config['llm']['local_llm_path']
tensor_parallel_size = config.getint('llm', 'tensor_parallel_size')
use_vllm = config.getboolean('llm', 'use_vllm')
print("inference")
stream_chat = config.getboolean('llm', 'stream_chat')
logger.info(f"Get params: model_path {model_path}, use_vllm {use_vllm}, tensor_parallel_size {tensor_parallel_size}, stream_chat {stream_chat}")
model, tokenzier = init_model(model_path, use_vllm, tensor_parallel_size)
inference = LLMInference(model,
tokenzier,
use_vllm=use_vllm,
tensor_parallel_size=tensor_parallel_size,
stream_chat=args.stream_chat)
async def inference(request):
start = time.time()
@@ -239,21 +240,19 @@ def infer_test(args):
use_vllm = config.getboolean('llm', 'use_vllm')
tensor_parallel_size = config.getint('llm', 'tensor_parallel_size')
stream_chat = config.getboolean('llm', 'stream_chat')
logger.info(f"Get params: model_path {model_path}, use_vllm {use_vllm}, tensor_parallel_size {tensor_parallel_size}, stream_chat {stream_chat}")
model, tokenzier = init_model(model_path, use_vllm, tensor_parallel_size)
inference = LLMInference(model,
tokenzier,
use_vllm=use_vllm,
tensor_parallel_size=tensor_parallel_size,
stream_chat=stream_chat)
# prompt = "hello,please introduce yourself..."
prompt = 'How to clear the CMOS configuration on a 65N32-US motherboard'
tokenzier,
use_vllm=use_vllm,
stream_chat=stream_chat)
time_first = time.time()
output_text = inference.chat(prompt)
output_text = inference.chat(args.query)
time_second = time.time()
logger.debug('Question: {} Answer: {} \ntimecost {} '.format(
prompt, output_text, time_second - time_first))
args.query, output_text, time_second - time_first))
def set_envs(dcu_ids):
@@ -282,10 +281,6 @@ def parse_args():
type=str,
default='0,1',
help='Set the DCU card IDs, separated by commas, e.g. "0,1,2"')
parser.add_argument(
'--stream_chat',
action='store_true',
help='Enable streaming chat mode')
args = parser.parse_args()
return args
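
Since the --stream_chat flag is dropped here in favour of the stream_chat entry in the config file, the remaining parser reduces to roughly the sketch below; the '--dcu_ids' flag name is an assumption inferred from set_envs(dcu_ids), and any other arguments (e.g. a query flag used by infer_test) fall outside this hunk:

import argparse

def parse_args():
    # Sketch of the parser after this commit; '--dcu_ids' is an assumed flag name.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--dcu_ids',
        type=str,
        default='0,1',
        help='Set the DCU card IDs, separated by commas, e.g. "0,1,2"')
    return parser.parse_args()
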