Unverified commit 15d1cc2e, authored by AllentDan, committed by GitHub.

update turbomind session_len with model.session_len (#634)

parent 994027ff
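
In one sentence: `TurboMind` stops hard-coding `self.session_len = 2048` (and stops reading it from `config.ini`) and instead instantiates the registered chat-model template once, taking `session_len` and `stop_words` from it; `AsyncEngine` and the chat CLI then reuse that single template instead of doing their own `MODELS.get(...)` lookup. Below is a minimal sketch of the pattern, with a toy registry standing in for `lmdeploy.model.MODELS` (the class names and the 4096 value are illustrative, not lmdeploy's real definitions):

```python
# Toy registry standing in for lmdeploy.model.MODELS; names and values
# are illustrative, not lmdeploy's real definitions.
class BaseModel:
    """Chat-model template; session_len is the max context window."""

    def __init__(self, session_len: int = 2048, **kwargs):
        self.session_len = session_len
        self.stop_words = None


MODELS = {'llama2': type('Llama2', (BaseModel,), {})}


class TurboMindSketch:
    """Mirrors the commit: the engine owns the one template instance."""

    def __init__(self, model_name: str, **kwargs):
        self.model = MODELS[model_name](**kwargs)   # built once, here
        self.session_len = self.model.session_len   # the template decides


tm = TurboMindSketch('llama2', session_len=4096)
print(tm.session_len)  # 4096: no hard-coded 2048, no config.ini read
```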
Changes to the `AsyncEngine` serving module: forward the extra kwargs to `TurboMind` and reuse the template it builds, instead of constructing one locally from `MODELS`.

```diff
@@ -6,8 +6,6 @@ import random
 from contextlib import contextmanager
 from typing import List, Literal, Optional
 
-from lmdeploy.model import MODELS, BaseModel
-
 
 @dataclasses.dataclass
 class GenOut:
@@ -36,13 +34,14 @@ class AsyncEngine:
         tokenizer = Tokenizer(tokenizer_model_path)
         self.tm_model = tm.TurboMind(model_path,
                                      eos_id=tokenizer.eos_token_id,
-                                     tp=tp)
+                                     tp=tp,
+                                     **kwargs)
         self.tokenizer = tokenizer
         self.generators = [
             self.tm_model.create_instance() for i in range(instance_num)
         ]
         self.instance_num = instance_num
-        self.model: BaseModel = MODELS.get(self.tm_model.model_name)(**kwargs)
+        self.model = self.tm_model.model
         self.available = [True] * instance_num
         self.starts = [None] * instance_num
         self.steps = {}
```
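For callers, the visible effect is that template keyword arguments now flow through `AsyncEngine` into `TurboMind`. A hypothetical usage sketch follows; the module path, the constructor signature, and the `capability` kwarg are assumptions based on the hunks in this commit, and `./workspace` is a placeholder:

```python
# Hypothetical usage; module path, signature, and kwargs are assumptions
# based on the hunks in this commit. './workspace' is a placeholder.
from lmdeploy.serve.async_engine import AsyncEngine

engine = AsyncEngine(model_path='./workspace',
                     instance_num=32,
                     tp=1,
                     capability='chat')  # forwarded via **kwargs
# Engine and template now agree on a single session length:
assert engine.model.session_len == engine.tm_model.session_len
```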
Changes to the interactive chat CLI (`main`): pass `capability=cap` and the remaining kwargs into `TurboMind`, and reuse its template.

```diff
@@ -4,8 +4,6 @@ import os
 import os.path as osp
 import random
 
-from lmdeploy.model import MODELS
-
 os.environ['TM_LOG_LEVEL'] = 'ERROR'
@@ -90,14 +88,18 @@ def main(model_path,
     tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
     tokenizer = Tokenizer(tokenizer_model_path)
-    tm_model = tm.TurboMind(model_path, eos_id=tokenizer.eos_token_id, tp=tp)
+    tm_model = tm.TurboMind(model_path,
+                            eos_id=tokenizer.eos_token_id,
+                            tp=tp,
+                            capability=cap,
+                            **kwargs)
     generator = tm_model.create_instance()
     nth_round = 1
     step = 0
     seed = random.getrandbits(64)
     model_name = tm_model.model_name
-    model = MODELS.get(model_name)(capability=cap, **kwargs)
+    model = tm_model.model
     print(f'session {session_id}')
     while True:
```
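Note the direction of `capability=cap`: previously the CLI configured its own template while `TurboMind` built another with defaults, so the two could disagree. A toy illustration of the old failure mode (the `Template` class is a stand-in, not lmdeploy code):

```python
# Toy illustration of the pre-commit failure mode; Template is a stand-in.
class Template:
    def __init__(self, session_len: int = 2048, capability: str = 'chat'):
        self.session_len = session_len
        self.capability = capability


cli_model = Template(session_len=8192)  # CLI applied user kwargs...
engine_model = Template()               # ...but the engine used defaults
print(cli_model.session_len, engine_model.session_len)  # 8192 vs 2048
```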
Changes to the `TurboMind` wrapper itself. First, import `BaseModel` for the type annotation, let the constructor accept template kwargs, and drop the hard-coded `session_len` default:

```diff
@@ -13,7 +13,7 @@ import torch
 from torch.nn.utils.rnn import pad_sequence
 
 import lmdeploy
-from lmdeploy.model import MODELS
+from lmdeploy.model import MODELS, BaseModel
 from lmdeploy.tokenizer import Tokenizer
 from lmdeploy.utils import get_logger
@@ -78,7 +78,11 @@ class TurboMind:
         tp (int): tensor parallel
     """
 
-    def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1):
+    def __init__(self,
+                 model_path: str,
+                 eos_id: int = 2,
+                 tp: int = 1,
+                 **kwargs):
         self.eos_id = eos_id
 
         # TODO: support mpi
@@ -88,7 +92,6 @@ class TurboMind:
         # read meta from model path
         assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'
         self.gpu_count = tp
-        self.session_len = 2048
         data_type = 'fp16'
         ini_path = osp.join(model_path, 'triton_models/weights/config.ini')
         with open(ini_path, 'r') as f:
```
Then `session_len` stops being parsed from `config.ini`; the template instance, built once from `MODELS` with the forwarded kwargs, becomes the single source for both `session_len` and `stop_words`:

```diff
@@ -102,18 +105,18 @@ class TurboMind:
             if len(section_name) > 0:
                 tp_cfg = parser.getint(section_name, 'tensor_para_size')
-                self.session_len = parser.getint(section_name, 'session_len')
                 if tp_cfg != 1 and tp_cfg != tp:
                     get_logger('turbomind').info(
                         f'found tp={tp_cfg} in config.ini.')
                     self.gpu_count = tp_cfg
             self.model_name = parser.get(section_name, 'model_name')
             data_type = parser.get(section_name, 'weight_type')
-        model = MODELS.get(self.model_name)()
+        self.model: BaseModel = MODELS.get(self.model_name)(**kwargs)
+        self.session_len = self.model.session_len
         tokenizer_model_path = osp.join(model_path, 'triton_models',
                                         'tokenizer')
         tokenizer = Tokenizer(tokenizer_model_path)
-        self.stop_words = _stop_words(model.stop_words, tokenizer)
+        self.stop_words = _stop_words(self.model.stop_words, tokenizer)
 
         # params
         self.node_id = node_id
```
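The precedence change in isolation, as a self-contained sketch (the `[llama]` section name and the 4096 template value are assumptions; the keys mirror the `config.ini` reads in the hunk above):

```python
# Self-contained sketch of the precedence change. The '[llama]' section
# name and the 4096 template value are assumptions; the keys mirror the
# config.ini reads in the hunk above.
from configparser import ConfigParser


class Llama2Template:
    session_len = 4096  # template-defined context window (assumed value)


parser = ConfigParser()
parser.read_string('[llama]\nmodel_name = llama2\nsession_len = 2048\n')

# Before this commit: trust the converter's config.ini value.
old = parser.getint('llama', 'session_len')   # 2048

# After: the chat template is authoritative; config.ini is ignored here.
new = Llama2Template().session_len            # 4096
print(old, new)
```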
The C++ handle returned by `create_llama_model` is renamed from `model` to `model_comm`, freeing `self.model` for the chat template:

```diff
@@ -122,17 +125,17 @@ class TurboMind:
         # create model
         weight_dir = osp.join(model_path, 'triton_models', 'weights')
-        model = _tm.AbstractTransformerModel.create_llama_model(
+        model_comm = _tm.AbstractTransformerModel.create_llama_model(
             weight_dir, tensor_para_size=self.gpu_count, data_type=data_type)
-        self.model = model
-        self.nccl_params = model.create_nccl_params(self.node_id)
+        self.model_comm = model_comm
+        self.nccl_params = model_comm.create_nccl_params(self.node_id)
         torch.cuda.synchronize()
 
         # create weight
         def _create_weight(device_id):
             with cuda_ctx(device_id):
                 rank = self.node_id * self.gpu_count + device_id
-                model.create_shared_weights(device_id, rank)
+                model_comm.create_shared_weights(device_id, rank)
 
         threads = []
         for device_id in range(self.gpu_count):
```
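After the rename, `TurboMind` carries two deliberately distinct attributes. An illustrative stub of the resulting layout (not the real class):

```python
# Illustrative stub of TurboMind's attribute layout after the rename.
class TurboMindAttrs:
    def __init__(self, template, comm_handle):
        self.model = template          # Python chat template (a BaseModel)
        self.model_comm = comm_handle  # C++ AbstractTransformerModel handle
        self.session_len = template.session_len  # read from the template
```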
`TurboMindInstance` gains type hints and follows the rename when creating its communicator and per-device model instances:

```diff
@@ -161,7 +164,7 @@ class TurboMindInstance:
         cuda_stream_id(int): identity of a cuda stream
     """
 
-    def __init__(self, tm_model, cuda_stream_id=0):
+    def __init__(self, tm_model: TurboMind, cuda_stream_id: int = 0):
         self.tm_model = tm_model
         self.cuda_stream_id = cuda_stream_id
@@ -175,7 +178,7 @@ class TurboMindInstance:
         self.session_len = tm_model.session_len
         self.nccl_params = tm_model.nccl_params
-        self.instance_comm = tm_model.model.create_instance_comm(
+        self.instance_comm = tm_model.model_comm.create_instance_comm(
             self.gpu_count)
 
         # create model instances
@@ -196,7 +199,7 @@ class TurboMindInstance:
     def _create_model_instance(self, device_id, model_insts):
         with cuda_ctx(device_id):
             rank = self.node_id * self.gpu_count + device_id
-            model_inst = self.tm_model.model.create_model_instance(
+            model_inst = self.tm_model.model_comm.create_model_instance(
                 device_id, rank, self.cuda_stream_id, self.nccl_params)
             model_insts[device_id] = model_inst
```
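Both `TurboMind._create_weight` and `TurboMindInstance._create_model_instance` use the same one-thread-per-GPU pattern with the `node_id * gpu_count + device_id` rank formula. A self-contained sketch of that pattern, with `cuda_ctx` and the comm handle abstracted away behind a caller-supplied function:

```python
# Sketch of the thread-per-GPU creation pattern from the hunks above;
# cuda_ctx and the comm handle are abstracted into make_fn.
from threading import Thread


def create_on_all_devices(gpu_count: int, node_id: int, make_fn):
    """Run make_fn(device_id, rank) on one thread per GPU."""
    results = [None] * gpu_count

    def _worker(device_id):
        rank = node_id * gpu_count + device_id  # same rank formula
        results[device_id] = make_fn(device_id, rank)

    threads = [Thread(target=_worker, args=(i,)) for i in range(gpu_count)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return results


# e.g. with two GPUs on node 0:
print(create_on_all_devices(2, 0, lambda dev, rank: (dev, rank)))
```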