chenych / chat_demo

Commit 073c3410, authored Aug 01, 2024 by Rayyyyy

Fix bugs

parent 7e2f06a3
Showing 2 changed files with 20 additions and 24 deletions:
config.ini (+2 -1)
llm_service/inferencer.py (+18 -23)
config.ini

@@ -13,3 +13,4 @@ reranker_model_path=/path/to/your/bce-reranker-base_v1
 local_llm_path=/path/to/your/internlm-chat-7b
 use_vllm=False
-stream_chat=False
+stream_chat=False
+tensor_parallel_size=1
\ No newline at end of file
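
The added tensor_parallel_size key is consumed in llm_service/inferencer.py through configparser (see the hunks below). A minimal sketch of how the [llm] section is read, assuming the section name and file location (the diff only shows the lookups themselves):

    import configparser

    # Sketch: parse config.ini the way inferencer.py does below.
    config = configparser.ConfigParser()
    config.read('config.ini')  # file location is an assumption

    model_path = config['llm']['local_llm_path']
    use_vllm = config.getboolean('llm', 'use_vllm')        # "False" -> False
    stream_chat = config.getboolean('llm', 'stream_chat')
    tensor_parallel_size = config.getint('llm', 'tensor_parallel_size')  # "1" -> 1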
llm_service/inferencer.py

@@ -68,7 +68,7 @@ class LLMInference:
     def __init__(self,
                  model,
-                 tokenzier,
+                 tokenizer,
                  device: str = 'cuda',
                  use_vllm: bool = False,
                  stream_chat: bool = False
@@ -76,7 +76,7 @@ class LLMInference:
         self.device = device
         self.model = model
-        self.tokenzier = tokenzier
+        self.tokenizer = tokenizer
         self.use_vllm = use_vllm
         self.stream_chat = stream_chat
@@ -170,16 +170,14 @@ class LLMInference:
 def init_model(model_path, use_vllm=False, tp_size=1):
     ## init models
-    # huggingface
-    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
-    model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, device_map="auto").half().cuda().eval()
+    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     if use_vllm:
         try:
             # vllm
             from vllm import LLM, SamplingParams
-            tokenizer = SamplingParams(temperature=1,
+            sampling_params = SamplingParams(temperature=1,
                                        top_p=0.95,
                                        max_tokens=1024,
                                        stop_token_ids=[tokenizer.eos_token_id])
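
This hunk carries the main bug fix: the old code rebound the name tokenizer to the SamplingParams object, clobbering the Hugging Face tokenizer, so sampling_params was never defined and callers received a SamplingParams where they expected a tokenizer. A minimal sketch of the corrected vllm path, with placeholder path and prompt (not the repository's exact code):

    from transformers import AutoTokenizer
    from vllm import LLM, SamplingParams

    model_path = '/path/to/your/internlm-chat-7b'  # placeholder, as in config.ini

    # Keep the tokenizer and the sampling configuration under separate names.
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    sampling_params = SamplingParams(temperature=1,
                                     top_p=0.95,
                                     max_tokens=1024,
                                     stop_token_ids=[tokenizer.eos_token_id])
    model = LLM(model_path,
                trust_remote_code=True,
                enforce_eager=True,
                dtype='float16',
                tensor_parallel_size=1)
    outputs = model.generate(['Hello'], sampling_params)
    print(outputs[0].outputs[0].text)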
@@ -189,9 +187,12 @@ def init_model(model_path, use_vllm=False, tp_size=1):
                         enforce_eager=True,
                         dtype="float16",
                         tensor_parallel_size=tp_size)
+            return model, sampling_params
         except Exception as e:
-            logger.error(f"fastllm initial failed, {e}")
+            logger.error(f"vllm initial failed, {e}")
+    else:
+        # huggingface
+        model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).half().cuda().eval()
     return model, tokenizer
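
With this restructuring the Hugging Face weights are loaded only in the else branch, so the vllm path no longer instantiates the model twice. Note that the second element of the return value now depends on the backend; a hedged usage sketch (variable names are illustrative):

    # use_vllm=False: second element is the Hugging Face tokenizer.
    model, tokenizer = init_model(model_path, use_vllm=False, tp_size=1)

    # use_vllm=True: second element is a vllm SamplingParams object.
    engine, sampling_params = init_model(model_path, use_vllm=True, tp_size=2)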
@@ -204,13 +205,13 @@ def llm_inference(args):
     model_path = config['llm']['local_llm_path']
     tensor_parallel_size = config.getint('llm', 'tensor_parallel_size')
     use_vllm = config.getboolean('llm', 'use_vllm')
-    print("inference")
+    stream_chat = config.getboolean('llm', 'stream_chat')
+    logger.info(f"Get params: model_path {model_path}, use_vllm {use_vllm}, tensor_parallel_size {tensor_parallel_size}, stream_chat {stream_chat}")
     model, tokenzier = init_model(model_path, use_vllm, tensor_parallel_size)
     inference = LLMInference(model,
                              tokenzier,
                              use_vllm=use_vllm,
-                             tensor_parallel_size=tensor_parallel_size,
                              stream_chat=args.stream_chat)

     async def inference(request):
         start = time.time()
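
Dropping tensor_parallel_size= from the LLMInference(...) call matches the constructor in the first hunk, which accepts no such parameter, so the old call raised TypeError; parallelism is now configured only through init_model's tp_size. A minimal repro of the mismatch, using stub arguments rather than real models:

    class LLMInference:
        # Signature as shown in the first hunk of this diff.
        def __init__(self, model, tokenizer, device='cuda',
                     use_vllm=False, stream_chat=False):
            self.model, self.tokenizer = model, tokenizer

    # TypeError: __init__() got an unexpected keyword argument 'tensor_parallel_size'
    LLMInference(object(), object(),
                 use_vllm=False,
                 tensor_parallel_size=1,
                 stream_chat=False)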
@@ -239,21 +240,19 @@ def infer_test(args):
     use_vllm = config.getboolean('llm', 'use_vllm')
     tensor_parallel_size = config.getint('llm', 'tensor_parallel_size')
     stream_chat = config.getboolean('llm', 'stream_chat')
+    logger.info(f"Get params: model_path {model_path}, use_vllm {use_vllm}, tensor_parallel_size {tensor_parallel_size}, stream_chat {stream_chat}")
     model, tokenzier = init_model(model_path, use_vllm, tensor_parallel_size)
     inference = LLMInference(model,
                              tokenzier,
                              use_vllm=use_vllm,
-                             tensor_parallel_size=tensor_parallel_size,
                              stream_chat=stream_chat)
-    # prompt = "hello,please introduce yourself..."
-    prompt = '65N32-US主板清除CMOS配置的方法'
     time_first = time.time()
-    output_text = inference.chat(prompt)
+    output_text = inference.chat(args.query)
     time_second = time.time()
     logger.debug('问题:{} 回答:{}\ntimecost {} '.format(
-        prompt, output_text, time_second - time_first))
+        args.query, output_text, time_second - time_first))

 def set_envs(dcu_ids):
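
infer_test now benchmarks the prompt supplied on the command line (args.query, presumably defined elsewhere in parse_args) instead of the hard-coded question about clearing the CMOS settings on a 65N32-US motherboard. The timing pattern, lifted into a small helper sketch:

    import time

    def timed_chat(inference, query):
        # Wall-clock latency of one chat() call, mirroring the
        # time_first/time_second bookkeeping in the diff.
        t0 = time.time()
        output_text = inference.chat(query)
        return output_text, time.time() - t0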
@@ -282,10 +281,6 @@ def parse_args():
                         type=str,
                         default='0,1',
                         help='设置DCU卡号,卡号之间用英文逗号隔开,输入样例:"0,1,2"')
-    parser.add_argument('--stream_chat',
-                        action='store_true',
-                        help='启用流式对话方式')
     args = parser.parse_args()
     return args
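
One wrinkle: --stream_chat is removed from parse_args here, yet the llm_inference hunk above still passes stream_chat=args.stream_chat. Unless the flag is re-added elsewhere in the file, that access now fails; a minimal repro with a stub parser (not the repository's full argument list):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--dcu_ids', type=str, default='0,1')
    args = parser.parse_args([])
    args.stream_chat  # AttributeError: 'Namespace' object has no attribute 'stream_chat'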