Unverified commit 15d1cc2e, authored by AllentDan, committed by GitHub.

update turbomind session_len with model.session_len (#634)

parent 994027ff
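
In one sentence: `TurboMind` stops hard-coding `self.session_len = 2048` (and stops reading it from `config.ini`) and instead instantiates the registered chat-model template once, taking `session_len` and `stop_words` from it; `AsyncEngine` and the chat CLI then reuse that single template instead of doing their own `MODELS.get(...)` lookup. Below is a minimal sketch of the pattern, with a toy registry standing in for `lmdeploy.model.MODELS` (the class names and the 4096 value are illustrative, not lmdeploy's real definitions):

```python
# Toy registry standing in for lmdeploy.model.MODELS; names and values
# are illustrative, not lmdeploy's real definitions.
class BaseModel:
    """Chat-model template; session_len is the max context window."""

    def __init__(self, session_len: int = 2048, **kwargs):
        self.session_len = session_len
        self.stop_words = None


MODELS = {'llama2': type('Llama2', (BaseModel,), {})}


class TurboMindSketch:
    """Mirrors the commit: the engine owns the one template instance."""

    def __init__(self, model_name: str, **kwargs):
        self.model = MODELS[model_name](**kwargs)   # built once, here
        self.session_len = self.model.session_len   # the template decides


tm = TurboMindSketch('llama2', session_len=4096)
print(tm.session_len)  # 4096: no hard-coded 2048, no config.ini read
```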
Changes to the `AsyncEngine` serving module: forward the extra kwargs to `TurboMind` and reuse the template it builds, instead of constructing one locally from `MODELS`.

```diff
@@ -6,8 +6,6 @@ import random
 from contextlib import contextmanager
 from typing import List, Literal, Optional
 
-from lmdeploy.model import MODELS, BaseModel
-
 
 @dataclasses.dataclass
 class GenOut:
@@ -36,13 +34,14 @@ class AsyncEngine:
         tokenizer = Tokenizer(tokenizer_model_path)
         self.tm_model = tm.TurboMind(model_path,
                                      eos_id=tokenizer.eos_token_id,
-                                     tp=tp)
+                                     tp=tp,
+                                     **kwargs)
         self.tokenizer = tokenizer
         self.generators = [
             self.tm_model.create_instance() for i in range(instance_num)
         ]
         self.instance_num = instance_num
-        self.model: BaseModel = MODELS.get(self.tm_model.model_name)(**kwargs)
+        self.model = self.tm_model.model
         self.available = [True] * instance_num
         self.starts = [None] * instance_num
         self.steps = {}
```
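For callers, the visible effect is that template keyword arguments now flow through `AsyncEngine` into `TurboMind`. A hypothetical usage sketch follows; the module path, the constructor signature, and the `capability` kwarg are assumptions based on the hunks in this commit, and `./workspace` is a placeholder:

```python
# Hypothetical usage; module path, signature, and kwargs are assumptions
# based on the hunks in this commit. './workspace' is a placeholder.
from lmdeploy.serve.async_engine import AsyncEngine

engine = AsyncEngine(model_path='./workspace',
                     instance_num=32,
                     tp=1,
                     capability='chat')  # forwarded via **kwargs
# Engine and template now agree on a single session length:
assert engine.model.session_len == engine.tm_model.session_len
```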
Changes to the interactive chat CLI (`main`): pass `capability=cap` and the remaining kwargs into `TurboMind`, and reuse its template.

```diff
@@ -4,8 +4,6 @@ import os
 import os.path as osp
 import random
 
-from lmdeploy.model import MODELS
-
 os.environ['TM_LOG_LEVEL'] = 'ERROR'
@@ -90,14 +88,18 @@ def main(model_path,
     tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
     tokenizer = Tokenizer(tokenizer_model_path)
-    tm_model = tm.TurboMind(model_path, eos_id=tokenizer.eos_token_id, tp=tp)
+    tm_model = tm.TurboMind(model_path,
+                            eos_id=tokenizer.eos_token_id,
+                            tp=tp,
+                            capability=cap,
+                            **kwargs)
     generator = tm_model.create_instance()
     nth_round = 1
     step = 0
     seed = random.getrandbits(64)
     model_name = tm_model.model_name
-    model = MODELS.get(model_name)(capability=cap, **kwargs)
+    model = tm_model.model
     print(f'session {session_id}')
     while True:
```
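Note the direction of `capability=cap`: previously the CLI configured its own template while `TurboMind` built another with defaults, so the two could disagree. A toy illustration of the old failure mode (the `Template` class is a stand-in, not lmdeploy code):

```python
# Toy illustration of the pre-commit failure mode; Template is a stand-in.
class Template:
    def __init__(self, session_len: int = 2048, capability: str = 'chat'):
        self.session_len = session_len
        self.capability = capability


cli_model = Template(session_len=8192)  # CLI applied user kwargs...
engine_model = Template()               # ...but the engine used defaults
print(cli_model.session_len, engine_model.session_len)  # 8192 vs 2048
```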
Changes to the `TurboMind` wrapper itself. First, import `BaseModel` for the type annotation, let the constructor accept template kwargs, and drop the hard-coded `session_len` default:

```diff
@@ -13,7 +13,7 @@ import torch
 from torch.nn.utils.rnn import pad_sequence
 
 import lmdeploy
-from lmdeploy.model import MODELS
+from lmdeploy.model import MODELS, BaseModel
 from lmdeploy.tokenizer import Tokenizer
 from lmdeploy.utils import get_logger
@@ -78,7 +78,11 @@ class TurboMind:
         tp (int): tensor parallel
     """
 
-    def __init__(self, model_path: str, eos_id: int = 2, tp: int = 1):
+    def __init__(self,
+                 model_path: str,
+                 eos_id: int = 2,
+                 tp: int = 1,
+                 **kwargs):
         self.eos_id = eos_id
 
         # TODO: support mpi
@@ -88,7 +92,6 @@ class TurboMind:
         # read meta from model path
         assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'
         self.gpu_count = tp
-        self.session_len = 2048
         data_type = 'fp16'
         ini_path = osp.join(model_path, 'triton_models/weights/config.ini')
         with open(ini_path, 'r') as f:
```
Then `session_len` stops being parsed from `config.ini`; the template instance, built once from `MODELS` with the forwarded kwargs, becomes the single source for both `session_len` and `stop_words`:

```diff
@@ -102,18 +105,18 @@ class TurboMind:
             if len(section_name) > 0:
                 tp_cfg = parser.getint(section_name, 'tensor_para_size')
-                self.session_len = parser.getint(section_name, 'session_len')
                 if tp_cfg != 1 and tp_cfg != tp:
                     get_logger('turbomind').info(
                         f'found tp={tp_cfg} in config.ini.')
                     self.gpu_count = tp_cfg
             self.model_name = parser.get(section_name, 'model_name')
             data_type = parser.get(section_name, 'weight_type')
-        model = MODELS.get(self.model_name)()
+        self.model: BaseModel = MODELS.get(self.model_name)(**kwargs)
+        self.session_len = self.model.session_len
         tokenizer_model_path = osp.join(model_path, 'triton_models',
                                         'tokenizer')
         tokenizer = Tokenizer(tokenizer_model_path)
-        self.stop_words = _stop_words(model.stop_words, tokenizer)
+        self.stop_words = _stop_words(self.model.stop_words, tokenizer)
 
         # params
         self.node_id = node_id
```
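The precedence change in isolation, as a self-contained sketch (the `[llama]` section name and the 4096 template value are assumptions; the keys mirror the `config.ini` reads in the hunk above):

```python
# Self-contained sketch of the precedence change. The '[llama]' section
# name and the 4096 template value are assumptions; the keys mirror the
# config.ini reads in the hunk above.
from configparser import ConfigParser


class Llama2Template:
    session_len = 4096  # template-defined context window (assumed value)


parser = ConfigParser()
parser.read_string('[llama]\nmodel_name = llama2\nsession_len = 2048\n')

# Before this commit: trust the converter's config.ini value.
old = parser.getint('llama', 'session_len')   # 2048

# After: the chat template is authoritative; config.ini is ignored here.
new = Llama2Template().session_len            # 4096
print(old, new)
```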
The C++ handle returned by `create_llama_model` is renamed from `model` to `model_comm`, freeing `self.model` for the chat template:

```diff
@@ -122,17 +125,17 @@ class TurboMind:
         # create model
         weight_dir = osp.join(model_path, 'triton_models', 'weights')
-        model = _tm.AbstractTransformerModel.create_llama_model(
+        model_comm = _tm.AbstractTransformerModel.create_llama_model(
             weight_dir, tensor_para_size=self.gpu_count, data_type=data_type)
-        self.model = model
-        self.nccl_params = model.create_nccl_params(self.node_id)
+        self.model_comm = model_comm
+        self.nccl_params = model_comm.create_nccl_params(self.node_id)
         torch.cuda.synchronize()
 
         # create weight
         def _create_weight(device_id):
             with cuda_ctx(device_id):
                 rank = self.node_id * self.gpu_count + device_id
-                model.create_shared_weights(device_id, rank)
+                model_comm.create_shared_weights(device_id, rank)
 
         threads = []
         for device_id in range(self.gpu_count):
```
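After the rename, `TurboMind` carries two deliberately distinct attributes. An illustrative stub of the resulting layout (not the real class):

```python
# Illustrative stub of TurboMind's attribute layout after the rename.
class TurboMindAttrs:
    def __init__(self, template, comm_handle):
        self.model = template          # Python chat template (a BaseModel)
        self.model_comm = comm_handle  # C++ AbstractTransformerModel handle
        self.session_len = template.session_len  # read from the template
```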
`TurboMindInstance` gains type hints and follows the rename when creating its communicator and per-device model instances:

```diff
@@ -161,7 +164,7 @@ class TurboMindInstance:
         cuda_stream_id(int): identity of a cuda stream
     """
 
-    def __init__(self, tm_model, cuda_stream_id=0):
+    def __init__(self, tm_model: TurboMind, cuda_stream_id: int = 0):
         self.tm_model = tm_model
         self.cuda_stream_id = cuda_stream_id
@@ -175,7 +178,7 @@ class TurboMindInstance:
         self.session_len = tm_model.session_len
         self.nccl_params = tm_model.nccl_params
-        self.instance_comm = tm_model.model.create_instance_comm(
+        self.instance_comm = tm_model.model_comm.create_instance_comm(
             self.gpu_count)
 
         # create model instances
@@ -196,7 +199,7 @@ class TurboMindInstance:
     def _create_model_instance(self, device_id, model_insts):
         with cuda_ctx(device_id):
             rank = self.node_id * self.gpu_count + device_id
-            model_inst = self.tm_model.model.create_model_instance(
+            model_inst = self.tm_model.model_comm.create_model_instance(
                 device_id, rank, self.cuda_stream_id, self.nccl_params)
             model_insts[device_id] = model_inst
```
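Both `TurboMind._create_weight` and `TurboMindInstance._create_model_instance` use the same one-thread-per-GPU pattern with the `node_id * gpu_count + device_id` rank formula. A self-contained sketch of that pattern, with `cuda_ctx` and the comm handle abstracted away behind a caller-supplied function:

```python
# Sketch of the thread-per-GPU creation pattern from the hunks above;
# cuda_ctx and the comm handle are abstracted into make_fn.
from threading import Thread


def create_on_all_devices(gpu_count: int, node_id: int, make_fn):
    """Run make_fn(device_id, rank) on one thread per GPU."""
    results = [None] * gpu_count

    def _worker(device_id):
        rank = node_id * gpu_count + device_id  # same rank formula
        results[device_id] = make_fn(device_id, rank)

    threads = [Thread(target=_worker, args=(i,)) for i in range(gpu_count)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return results


# e.g. with two GPUs on node 0:
print(create_on_all_devices(2, 0, lambda dev, rank: (dev, rank)))
```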