Commit 15d1cc2e (unverified)
Authored Nov 03, 2023 by AllentDan; committed by GitHub on Nov 03, 2023

update turbomind session_len with model.session_len (#634)

Parent: 994027ff
Showing 3 changed files with 25 additions and 21 deletions:

  lmdeploy/serve/async_engine.py    (+3, -4)
  lmdeploy/turbomind/chat.py        (+6, -4)
  lmdeploy/turbomind/turbomind.py   (+16, -13)
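In short: before this commit, TurboMind took session_len from config.ini (falling back to a hard-coded 2048), while AsyncEngine and chat.py each built their own chat template via MODELS.get(...). After it, TurboMind constructs the template itself from forwarded **kwargs and adopts its session_len, and the callers reuse that single template. A minimal sketch of the resulting behavior, assuming lmdeploy at this revision is installed and a converted TurboMind workspace exists at ./workspace (both assumptions, not part of the commit):

```python
# Sketch: chat-template kwargs such as session_len now flow through
# TurboMind(**kwargs) into MODELS.get(model_name)(**kwargs), and TurboMind
# copies the result back via self.session_len = self.model.session_len.
from lmdeploy import turbomind as tm

tm_model = tm.TurboMind('./workspace',      # assumed workspace path
                        eos_id=2,
                        tp=1,
                        session_len=4096)   # forwarded to the chat template
print(tm_model.session_len)                 # 4096, no longer read from config.ini
```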
lmdeploy/serve/async_engine.py

@@ -6,8 +6,6 @@ import random
 from contextlib import contextmanager
 from typing import List, Literal, Optional
 
-from lmdeploy.model import MODELS, BaseModel
-
 
 @dataclasses.dataclass
 class GenOut:
@@ -36,13 +34,14 @@ class AsyncEngine:
         tokenizer = Tokenizer(tokenizer_model_path)
         self.tm_model = tm.TurboMind(model_path,
                                      eos_id=tokenizer.eos_token_id,
-                                     tp=tp)
+                                     tp=tp,
+                                     **kwargs)
         self.tokenizer = tokenizer
         self.generators = [
             self.tm_model.create_instance() for i in range(instance_num)
         ]
         self.instance_num = instance_num
-        self.model: BaseModel = MODELS.get(self.tm_model.model_name)(**kwargs)
+        self.model = self.tm_model.model
         self.available = [True] * instance_num
         self.starts = [None] * instance_num
         self.steps = {}
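The net effect in AsyncEngine: the chat template is no longer constructed a second time. The engine forwards its extra kwargs into TurboMind and aliases the template TurboMind built, so the two can no longer disagree about session_len or stop words. A hedged usage sketch (AsyncEngine's full signature is not shown in this diff; the instance_num value and workspace path below are assumptions):

```python
# Sketch: AsyncEngine now shares one template object with TurboMind.
from lmdeploy.serve.async_engine import AsyncEngine

engine = AsyncEngine(model_path='./workspace',  # assumed converted model dir
                     instance_num=2,            # assumed parameter value
                     tp=1,
                     session_len=4096)          # reaches TurboMind via **kwargs
assert engine.model is engine.tm_model.model    # same template, not a copy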
lmdeploy/turbomind/chat.py

@@ -4,8 +4,6 @@ import os
 import os.path as osp
 import random
 
-from lmdeploy.model import MODELS
-
 os.environ['TM_LOG_LEVEL'] = 'ERROR'
 
 
@@ -90,14 +88,18 @@ def main(model_path,
     tokenizer_model_path = osp.join(model_path, 'triton_models',
                                     'tokenizer')
     tokenizer = Tokenizer(tokenizer_model_path)
-    tm_model = tm.TurboMind(model_path, eos_id=tokenizer.eos_token_id, tp=tp)
+    tm_model = tm.TurboMind(model_path,
+                            eos_id=tokenizer.eos_token_id,
+                            tp=tp,
+                            capability=cap,
+                            **kwargs)
     generator = tm_model.create_instance()
 
     nth_round = 1
     step = 0
     seed = random.getrandbits(64)
     model_name = tm_model.model_name
-    model = MODELS.get(model_name)(capability=cap, **kwargs)
+    model = tm_model.model
 
     print(f'session {session_id}')
     while True:
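chat.py follows the same pattern: capability=cap and the remaining **kwargs now go into tm.TurboMind(...), and the template is read back from tm_model.model, so any extra keyword passed to main() configures the template and the engine in one place. A hypothetical direct call (the cap parameter name is taken from the diff's capability=cap; the workspace path is an assumption):

```python
# Sketch: extra kwargs to main() now configure both the chat template
# and the TurboMind engine consistently.
from lmdeploy.turbomind.chat import main

main('./workspace',      # assumed workspace path
     cap='chat',         # capability, as in the diff's `capability=cap`
     session_len=4096)   # lands in **kwargs and reaches TurboMind
```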
lmdeploy/turbomind/turbomind.py

@@ -13,7 +13,7 @@ import torch
 from torch.nn.utils.rnn import pad_sequence
 
 import lmdeploy
-from lmdeploy.model import MODELS
+from lmdeploy.model import MODELS, BaseModel
 from lmdeploy.tokenizer import Tokenizer
 from lmdeploy.utils import get_logger
 
@@ -78,7 +78,11 @@ class TurboMind:
         tp (int): tensor parallel
     """
 
-    def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1):
+    def __init__(self,
+                 model_path: str,
+                 eos_id: int = 2,
+                 tp: int = 1,
+                 **kwargs):
         self.eos_id = eos_id
 
         # TODO: support mpi
@@ -88,7 +92,6 @@ class TurboMind:
         # read meta from model path
         assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'
         self.gpu_count = tp
-        self.session_len = 2048
         data_type = 'fp16'
         ini_path = osp.join(model_path, 'triton_models/weights/config.ini')
         with open(ini_path, 'r') as f:
@@ -102,18 +105,18 @@ class TurboMind:
         if len(section_name) > 0:
             tp_cfg = parser.getint(section_name, 'tensor_para_size')
-            self.session_len = parser.getint(section_name, 'session_len')
             if tp_cfg != 1 and tp_cfg != tp:
                 get_logger('turbomind').info(
                     f'found tp={tp_cfg} in config.ini.')
                 self.gpu_count = tp_cfg
         self.model_name = parser.get(section_name, 'model_name')
         data_type = parser.get(section_name, 'weight_type')
 
-        model = MODELS.get(self.model_name)()
+        self.model: BaseModel = MODELS.get(self.model_name)(**kwargs)
+        self.session_len = self.model.session_len
         tokenizer_model_path = osp.join(model_path, 'triton_models',
                                         'tokenizer')
         tokenizer = Tokenizer(tokenizer_model_path)
-        self.stop_words = _stop_words(model.stop_words, tokenizer)
+        self.stop_words = _stop_words(self.model.stop_words, tokenizer)
 
         # params
         self.node_id = node_id
@@ -122,17 +125,17 @@ class TurboMind:
 
         # create model
         weight_dir = osp.join(model_path, 'triton_models', 'weights')
-        model = _tm.AbstractTransformerModel.create_llama_model(
+        model_comm = _tm.AbstractTransformerModel.create_llama_model(
             weight_dir, tensor_para_size=self.gpu_count, data_type=data_type)
-        self.model = model
-        self.nccl_params = model.create_nccl_params(self.node_id)
+        self.model_comm = model_comm
+        self.nccl_params = model_comm.create_nccl_params(self.node_id)
         torch.cuda.synchronize()
 
         # create weight
         def _create_weight(device_id):
             with cuda_ctx(device_id):
                 rank = self.node_id * self.gpu_count + device_id
-                model.create_shared_weights(device_id, rank)
+                model_comm.create_shared_weights(device_id, rank)
 
         threads = []
         for device_id in range(self.gpu_count):
@@ -161,7 +164,7 @@ class TurboMindInstance:
         cuda_stream_id(int): identity of a cuda stream
     """
 
-    def __init__(self, tm_model, cuda_stream_id=0):
+    def __init__(self, tm_model: TurboMind, cuda_stream_id: int = 0):
         self.tm_model = tm_model
         self.cuda_stream_id = cuda_stream_id
 
@@ -175,7 +178,7 @@ class TurboMindInstance:
         self.session_len = tm_model.session_len
 
         self.nccl_params = tm_model.nccl_params
-        self.instance_comm = tm_model.model.create_instance_comm(
+        self.instance_comm = tm_model.model_comm.create_instance_comm(
             self.gpu_count)
 
         # create model instances
@@ -196,7 +199,7 @@ class TurboMindInstance:
     def _create_model_instance(self, device_id, model_insts):
         with cuda_ctx(device_id):
             rank = self.node_id * self.gpu_count + device_id
-            model_inst = self.tm_model.model.create_model_instance(
+            model_inst = self.tm_model.model_comm.create_model_instance(
                 device_id, rank, self.cuda_stream_id, self.nccl_params)
             model_insts[device_id] = model_inst
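Two things happen in turbomind.py. First, session_len now comes from the chat template instead of config.ini, so the template is the single source of truth. Second, the C++ communicator object is renamed from self.model to self.model_comm, freeing self.model for the BaseModel chat template that AsyncEngine and chat.py now read; the last three hunks update TurboMindInstance to call create_instance_comm and create_model_instance on model_comm accordingly. A minimal sketch of the template side, assuming 'llama' is a registered key in lmdeploy.model.MODELS (an assumption; the exact registry names are not shown in this diff):

```python
# Sketch of what TurboMind.__init__ now does with the template (simplified;
# config.ini parsing and GPU setup omitted).
from lmdeploy.model import MODELS

model = MODELS.get('llama')(session_len=4096)  # 'llama' is an assumed registry key
session_len = model.session_len                # TurboMind stores this on self
stop_words = model.stop_words                  # fed to _stop_words(...), as before
print(session_len, stop_words)
```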