Unverified Commit 15d1cc2e authored by AllentDan, committed by GitHub

update turbomind session_len with model.session_len (#634)

parent 994027ff
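
The change in one line: `TurboMind` now derives `session_len` from the chat-model template registered in `MODELS`, rather than from a hard-coded default or `config.ini`. A minimal sketch of the new lookup, assuming a converted workspace whose `config.ini` names a registered template ('llama2' here is illustrative):

```python
from lmdeploy.model import MODELS

# 'llama2' stands in for the model_name that TurboMind reads from config.ini.
model = MODELS.get('llama2')()
print(model.session_len)  # the value TurboMind.session_len now mirrors
```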
@@ -6,8 +6,6 @@ import random
 from contextlib import contextmanager
 from typing import List, Literal, Optional
 
-from lmdeploy.model import MODELS, BaseModel
-
 
 @dataclasses.dataclass
 class GenOut:
@@ -36,13 +34,14 @@ class AsyncEngine:
         tokenizer = Tokenizer(tokenizer_model_path)
         self.tm_model = tm.TurboMind(model_path,
                                      eos_id=tokenizer.eos_token_id,
-                                     tp=tp)
+                                     tp=tp,
+                                     **kwargs)
         self.tokenizer = tokenizer
         self.generators = [
             self.tm_model.create_instance() for i in range(instance_num)
         ]
         self.instance_num = instance_num
-        self.model: BaseModel = MODELS.get(self.tm_model.model_name)(**kwargs)
+        self.model = self.tm_model.model
         self.available = [True] * instance_num
         self.starts = [None] * instance_num
         self.steps = {}
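
With the hunk above, `AsyncEngine` stops instantiating its own chat template and instead forwards `**kwargs` to `TurboMind`, then aliases the template that `TurboMind` built. A construction sketch, assuming the class lives at `lmdeploy.serve.async_engine` and that `./workspace` is a converted TurboMind model directory:

```python
from lmdeploy.serve.async_engine import AsyncEngine

engine = AsyncEngine(model_path='./workspace', instance_num=2, tp=1)
assert engine.model is engine.tm_model.model  # one shared chat template
```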
@@ -4,8 +4,6 @@ import os
 import os.path as osp
 import random
 
-from lmdeploy.model import MODELS
-
 os.environ['TM_LOG_LEVEL'] = 'ERROR'
@@ -90,14 +88,18 @@ def main(model_path,
     tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
     tokenizer = Tokenizer(tokenizer_model_path)
-    tm_model = tm.TurboMind(model_path, eos_id=tokenizer.eos_token_id, tp=tp)
+    tm_model = tm.TurboMind(model_path,
+                            eos_id=tokenizer.eos_token_id,
+                            tp=tp,
+                            capability=cap,
+                            **kwargs)
     generator = tm_model.create_instance()
     nth_round = 1
     step = 0
     seed = random.getrandbits(64)
     model_name = tm_model.model_name
-    model = MODELS.get(model_name)(capability=cap, **kwargs)
+    model = tm_model.model
     print(f'session {session_id}')
     while True:
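
The interactive chat script gets the same treatment: `capability=cap` and the remaining kwargs now flow into `TurboMind`, which owns the template. An illustrative call, assuming this hunk is the `main` entry point of `lmdeploy.turbomind.chat` and that `cap`/`tp` are keyword parameters:

```python
from lmdeploy.turbomind.chat import main

main('./workspace', cap='chat', tp=1)  # './workspace' is a placeholder path
```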
@@ -13,7 +13,7 @@ import torch
 from torch.nn.utils.rnn import pad_sequence
 
 import lmdeploy
-from lmdeploy.model import MODELS
+from lmdeploy.model import MODELS, BaseModel
 from lmdeploy.tokenizer import Tokenizer
 from lmdeploy.utils import get_logger
@@ -78,7 +78,11 @@ class TurboMind:
         tp (int): tensor parallel
     """
 
-    def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1):
+    def __init__(self,
+                 model_path: str,
+                 eos_id: int = 2,
+                 tp: int = 1,
+                 **kwargs):
         self.eos_id = eos_id
 
         # TODO: support mpi
@@ -88,7 +92,6 @@
         # read meta from model path
         assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'
         self.gpu_count = tp
-        self.session_len = 2048
         data_type = 'fp16'
         ini_path = osp.join(model_path, 'triton_models/weights/config.ini')
         with open(ini_path, 'r') as f:
@@ -102,18 +105,18 @@
             if len(section_name) > 0:
                 tp_cfg = parser.getint(section_name, 'tensor_para_size')
-                self.session_len = parser.getint(section_name, 'session_len')
                 if tp_cfg != 1 and tp_cfg != tp:
                     get_logger('turbomind').info(
                         f'found tp={tp_cfg} in config.ini.')
                     self.gpu_count = tp_cfg
             self.model_name = parser.get(section_name, 'model_name')
             data_type = parser.get(section_name, 'weight_type')
 
-        model = MODELS.get(self.model_name)()
+        self.model: BaseModel = MODELS.get(self.model_name)(**kwargs)
+        self.session_len = self.model.session_len
         tokenizer_model_path = osp.join(model_path, 'triton_models',
                                         'tokenizer')
         tokenizer = Tokenizer(tokenizer_model_path)
-        self.stop_words = _stop_words(model.stop_words, tokenizer)
+        self.stop_words = _stop_words(self.model.stop_words, tokenizer)
 
         # params
         self.node_id = node_id
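
This hunk is the heart of the commit: the hard-coded `self.session_len = 2048` and the `config.ini` lookup are gone, and the value is taken from the instantiated template, which `**kwargs` can customize. A standalone sketch of the new resolution order (the helper name is ours, not the library's):

```python
from lmdeploy.model import MODELS

def resolve_session_len(model_name: str, **kwargs) -> int:
    """Mirror the new TurboMind behaviour: the chat template decides."""
    # kwargs may override template fields, e.g. session_len=4096 (assumed
    # to be an accepted BaseModel keyword).
    model = MODELS.get(model_name)(**kwargs)
    return model.session_len
```

In practice, a template whose `session_len` disagrees with the workspace's `config.ini` now wins, and callers can override it per engine instead of editing the ini file.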
@@ -122,17 +125,17 @@
         # create model
         weight_dir = osp.join(model_path, 'triton_models', 'weights')
-        model = _tm.AbstractTransformerModel.create_llama_model(
+        model_comm = _tm.AbstractTransformerModel.create_llama_model(
             weight_dir, tensor_para_size=self.gpu_count, data_type=data_type)
-        self.model = model
-        self.nccl_params = model.create_nccl_params(self.node_id)
+        self.model_comm = model_comm
+        self.nccl_params = model_comm.create_nccl_params(self.node_id)
         torch.cuda.synchronize()
 
         # create weight
         def _create_weight(device_id):
             with cuda_ctx(device_id):
                 rank = self.node_id * self.gpu_count + device_id
-                model.create_shared_weights(device_id, rank)
+                model_comm.create_shared_weights(device_id, rank)
 
         threads = []
         for device_id in range(self.gpu_count):
@@ -161,7 +164,7 @@ class TurboMindInstance:
         cuda_stream_id(int): identity of a cuda stream
     """
 
-    def __init__(self, tm_model, cuda_stream_id=0):
+    def __init__(self, tm_model: TurboMind, cuda_stream_id: int = 0):
         self.tm_model = tm_model
         self.cuda_stream_id = cuda_stream_id
@@ -175,7 +178,7 @@
         self.session_len = tm_model.session_len
 
         self.nccl_params = tm_model.nccl_params
-        self.instance_comm = tm_model.model.create_instance_comm(
+        self.instance_comm = tm_model.model_comm.create_instance_comm(
             self.gpu_count)
 
         # create model instances
@@ -196,7 +199,7 @@
     def _create_model_instance(self, device_id, model_insts):
         with cuda_ctx(device_id):
             rank = self.node_id * self.gpu_count + device_id
-            model_inst = self.tm_model.model.create_model_instance(
+            model_inst = self.tm_model.model_comm.create_model_instance(
                 device_id, rank, self.cuda_stream_id, self.nccl_params)
             model_insts[device_id] = model_inst
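
The remaining hunks are a mechanical rename: the C++ engine handle formerly stored in `self.model` becomes `self.model_comm`, freeing `self.model` for the chat template, and `TurboMindInstance` follows suit. An end-to-end usage sketch under those assumptions, with a placeholder workspace path:

```python
from lmdeploy import turbomind as tm

tm_model = tm.TurboMind('./workspace')  # parses config.ini, builds the template
print(tm_model.session_len)             # == tm_model.model.session_len
generator = tm_model.create_instance()  # instance comm is built from model_comm
```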