"vscode:/vscode.git/clone" did not exist on "6c58bb59bed875752fe3cb90edc499da7bb72957"
Commit 073c3410 authored by Rayyyyy

Fix bugs

parent 7e2f06a3
@@ -13,3 +13,4 @@ reranker_model_path=/path/to/your/bce-reranker-base_v1
 local_llm_path=/path/to/your/internlm-chat-7b
 use_vllm=False
 stream_chat=False
+tensor_parallel_size=1
\ No newline at end of file
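The new tensor_parallel_size key is read later in this commit with config.getint('llm', 'tensor_parallel_size'), which suggests the file is an INI-style config parsed with Python's configparser and an [llm] section. A minimal sketch of that read path, assuming a file name of config.ini (the file name is not shown in this diff):

# Sketch only: how the [llm] settings above appear to be consumed.
# The file name "config.ini" is an assumption; the keys and the 'llm'
# section name come from the config.getint/getboolean calls in this commit.
import configparser

config = configparser.ConfigParser()
config.read('config.ini', encoding='utf-8')

model_path = config['llm']['local_llm_path']
use_vllm = config.getboolean('llm', 'use_vllm')                       # "False" -> False
stream_chat = config.getboolean('llm', 'stream_chat')
tensor_parallel_size = config.getint('llm', 'tensor_parallel_size')   # "1" -> 1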
@@ -68,7 +68,7 @@ class LLMInference:
     def __init__(self,
                  model,
-                 tokenzier,
+                 tokenizer,
                  device: str = 'cuda',
                  use_vllm: bool = False,
                  stream_chat: bool = False
@@ -76,7 +76,7 @@ class LLMInference:
         self.device = device
         self.model = model
-        self.tokenzier = tokenzier
+        self.tokenizer = tokenizer
         self.use_vllm = use_vllm
         self.stream_chat = stream_chat
@@ -170,16 +170,14 @@ class LLMInference:
 def init_model(model_path, use_vllm=False, tp_size=1):
     ## init models
-    # huggingface
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True,device_map="auto").half().cuda().eval()
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if use_vllm:
         try:
             # vllm
             from vllm import LLM, SamplingParams
-            tokenizer = SamplingParams(temperature=1,
+            sampling_params = SamplingParams(temperature=1,
                                        top_p=0.95,
                                        max_tokens=1024,
                                        stop_token_ids=[tokenizer.eos_token_id])
@@ -189,9 +187,12 @@ def init_model(model_path, use_vllm=False, tp_size=1):
                         enforce_eager=True,
                         dtype="float16",
                         tensor_parallel_size=tp_size)
+            return model, sampling_params
         except Exception as e:
-            logger.error(f"fastllm initial failed, {e}")
+            logger.error(f"vllm initial failed, {e}")
+    else:
+        # huggingface
+        model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).half().cuda().eval()
     return model, tokenizer
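With this change, init_model returns (model, sampling_params) on the vLLM path and (model, tokenizer) on the HuggingFace path, so the second value the caller stores means different things depending on use_vllm. A sketch of how a chat method might branch on that; the vLLM generate() call is the standard vLLM API, while the HuggingFace model.chat(tokenizer, query) helper is an assumption based on internlm-chat-7b's trust_remote_code interface and is not shown in this diff:

# Sketch only, not part of this commit.
def chat(self, prompt: str) -> str:
    if self.use_vllm:
        # On the vLLM path the second init_model return value is a
        # SamplingParams object, even though the caller names it "tokenzier".
        outputs = self.model.generate([prompt], self.tokenizer)
        return outputs[0].outputs[0].text
    # HuggingFace path: internlm-chat models expose chat() via
    # trust_remote_code; signature assumed as chat(tokenizer, query).
    response, _history = self.model.chat(self.tokenizer, prompt)
    return response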
@@ -204,13 +205,13 @@ def llm_inference(args):
     model_path = config['llm']['local_llm_path']
     tensor_parallel_size = config.getint('llm', 'tensor_parallel_size')
     use_vllm = config.getboolean('llm', 'use_vllm')
-    print("inference")
+    stream_chat = config.getboolean('llm', 'stream_chat')
+    logger.info(f"Get params: model_path {model_path}, use_vllm {use_vllm}, tensor_parallel_size {tensor_parallel_size}, stream_chat {stream_chat}")
     model, tokenzier = init_model(model_path, use_vllm, tensor_parallel_size)
     inference = LLMInference(model,
                              tokenzier,
                              use_vllm=use_vllm,
-                             tensor_parallel_size=tensor_parallel_size,
                              stream_chat=args.stream_chat)
     async def inference(request):
         start = time.time()
@@ -239,21 +240,19 @@ def infer_test(args):
     use_vllm = config.getboolean('llm', 'use_vllm')
     tensor_parallel_size = config.getint('llm', 'tensor_parallel_size')
     stream_chat = config.getboolean('llm', 'stream_chat')
+    logger.info(f"Get params: model_path {model_path}, use_vllm {use_vllm}, tensor_parallel_size {tensor_parallel_size}, stream_chat {stream_chat}")
     model, tokenzier = init_model(model_path, use_vllm, tensor_parallel_size)
     inference = LLMInference(model,
                              tokenzier,
                              use_vllm=use_vllm,
-                             tensor_parallel_size=tensor_parallel_size,
                              stream_chat=stream_chat)
-    # prompt = "hello,please introduce yourself..."
-    prompt ='65N32-US主板清除CMOS配置的方法'
     time_first = time.time()
-    output_text = inference.chat(prompt)
+    output_text = inference.chat(args.query)
     time_second = time.time()
     logger.debug('问题:{} 回答:{} \ntimecost {} '.format(
-        prompt, output_text, time_second - time_first))
+        args.query, output_text, time_second - time_first))

 def set_envs(dcu_ids):
@@ -282,10 +281,6 @@ def parse_args():
         type=str,
         default='0,1',
         help='设置DCU卡号,卡号之间用英文逗号隔开,输入样例:"0,1,2"')
-    parser.add_argument(
-        '--stream_chat',
-        action='store_true',
-        help='启用流式对话方式')
     args = parser.parse_args()
     return args
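infer_test above now reads args.query, but the visible hunks only remove --stream_chat from parse_args; a matching --query option is presumably defined outside the shown context. A hypothetical sketch of such an argument (the option name, default, and help text are assumptions, not taken from this diff; the default echoes the removed "hello, please introduce yourself" test prompt):

# Hypothetical argparse option backing the args.query reads in infer_test;
# not shown in the visible hunks of this commit.
parser.add_argument(
    '--query',
    type=str,
    default='hello, please introduce yourself',
    help='test question sent to the model by infer_test')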