Unverified commit 7cbfe2ea, authored by q.yao and committed by GitHub

Tensor Parallel python api (#82)

* wip

* profile disable tp

* fix profile

* lint

* fix dlpack

* remove comment

* add tp flag

* add session len check

* add eos

* remove tp and session len inputs

* wrap tokenizer

* multithread load weight

* update profile

* refactor tokenizer

* remove pre/post process

* remove mpi4py requirement

* remove

* remove bind

* remove mpi requirement

* check backend_tokenizer
parent 1f88baa5
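The diff below threads tensor parallelism through the Python API: TurboMind now reads tensor_para_size and session_len from the converted workspace's config.ini, creates weights and model instances with one thread per GPU, and a new Tokenizer wrapper replaces the Preprocessor/Postprocessor pair. For orientation, a rough sketch of the resulting user-facing flow follows; the workspace path, model name and prompt are illustrative only, and any stream_infer arguments not shown in this diff are left at their defaults.

from lmdeploy.model import MODELS
from lmdeploy.turbomind import Tokenizer, TurboMind

model_path = './workspace'  # hypothetical output of the deploy/convert step
tokenizer = Tokenizer(f'{model_path}/triton_models/tokenizer')
model = MODELS.get('llama')()  # 'llama' is an illustrative registry key
tm_model = TurboMind(model_path,
                     eos_id=tokenizer.eos_token_id,
                     stop_words=model.stop_words)  # TP degree comes from config.ini
generator = tm_model.create_instance()

prompt = model.get_prompt('hello', True)  # True marks the first round of the session
input_ids = tokenizer.encode(prompt)
for outputs in generator.stream_infer(session_id=1,
                                      input_ids=[input_ids],
                                      request_output_len=512):
    res, tokens = outputs[0]  # token ids and their count, as in the chat script below
    print(tokenizer.decode(res))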
@@ -6,10 +6,9 @@ from threading import Thread
 import fire
 import numpy as np
-from transformers import AutoTokenizer

 from lmdeploy.model import MODELS
-from lmdeploy.turbomind import TurboMind
+from lmdeploy.turbomind import Tokenizer, TurboMind


 def infer(model, session_id: int, input_ids: str, output_seqlen: int,
@@ -42,11 +41,7 @@ def infer(model, session_id: int, input_ids: str, output_seqlen: int,
         que.put((session_id, stats))


-def warmup(model,
-           concurrency: int,
-           session_len: int,
-           output_seqlen: int,
-           warmup_round: int = 4):
+def warmup(model, concurrency: int, output_seqlen: int, warmup_round: int = 4):
     print('start to warmup ...')

     def _infer(model, session_id):
@@ -81,18 +76,16 @@ def warmup(model,
 def main(model_path: str,
          model_name: str,
          concurrency: int = 1,
-         session_len: int = 2056,
          input_seqlen: int = 0,
          output_seqlen: int = 512,
          test_round: int = 10):
     tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_path,
-                                              trust_remote_code=True)
+    tokenizer = Tokenizer(tokenizer_model_path)
     model = MODELS.get(model_name)()
     stop_words = model.stop_words
     tm_model = TurboMind(model_path=model_path, stop_words=stop_words)
-    warmup(tm_model, concurrency, session_len, output_seqlen)
+    warmup(tm_model, concurrency, output_seqlen)

     # make up a prompt that can be tokenized into {input_seqlen} tokens
     prompt = '' if input_seqlen == 0 else 'hi' + ' hi' * (input_seqlen - 1)
...
 # Copyright (c) OpenMMLab. All rights reserved.
-from .tokenizer import Postprocessor, Preprocessor, Tokenizer
+from .tokenizer import Tokenizer
 from .turbomind import TurboMind

-__all__ = ['Postprocessor', 'Preprocessor', 'Tokenizer', 'TurboMind']
+__all__ = ['Tokenizer', 'TurboMind']
@@ -4,10 +4,10 @@ import os.path as osp
 import random

 import fire
-from transformers import AutoTokenizer

 from lmdeploy import turbomind as tm
 from lmdeploy.model import MODELS
+from lmdeploy.turbomind.tokenizer import Tokenizer

 os.environ['TM_LOG_LEVEL'] = 'ERROR'

@@ -39,12 +39,12 @@ def main(model_name, model_path, session_id: int = 1):
         session_id (int): the identical id of a session
     """
     model = MODELS.get(model_name)()
-    tm_model = tm.TurboMind(model_path, stop_words=model.stop_words)
-    generator = tm_model.create_instance()
     tokenizer_model_path = osp.join(model_path, 'triton_models', 'tokenizer')
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_path,
-                                              trust_remote_code=True)
-    model = MODELS.get(model_name)()
+    tokenizer = Tokenizer(tokenizer_model_path)
+    tm_model = tm.TurboMind(model_path,
+                            eos_id=tokenizer.eos_token_id,
+                            stop_words=model.stop_words)
+    generator = tm_model.create_instance()

     nth_round = 1
     step = 0
@@ -56,7 +56,7 @@ def main(model_name, model_path, session_id: int = 1):
             exit(0)
         elif prompt == 'end':
             prompt = model.get_prompt('', nth_round == 1)
-            input_ids = tokenizer.encode(prompt, add_special_tokens=False)
+            input_ids = tokenizer.encode(prompt)
            for outputs in generator.stream_infer(session_id=session_id,
                                                  input_ids=[input_ids],
                                                  request_output_len=512,
@@ -67,10 +67,14 @@ def main(model_name, model_path, session_id: int = 1):
             step = 0
             seed = random.getrandbits(64)
         else:
-            prompt = model.get_prompt(prompt, nth_round == 1)
-            input_ids = tokenizer.encode(prompt, add_special_tokens=False)
             print(f'session {session_id}')
-            print(f'{prompt}', end='', flush=True)
+            if step >= tm_model.session_len:
+                print('WARNING: exceed session max length.'
+                      ' Please end the session.')
+                continue
+            prompt = model.get_prompt(prompt, nth_round == 1)
+            input_ids = tokenizer.encode(prompt)
+            print(f'{prompt} ', end='', flush=True)
             response_size = 0
             for outputs in generator.stream_infer(
                     session_id=session_id,
@@ -89,8 +93,7 @@ def main(model_name, model_path, session_id: int = 1):
                     random_seed=seed if nth_round == 1 else None):
                 res, tokens = outputs[0]
                 # decode res
-                response = tokenizer.decode(
-                    res, skip_special_tokens=True)[response_size:]
+                response = tokenizer.decode(res)[response_size:]
                 response = valid_str(response)
                 print(f'{response}', end='', flush=True)
                 response_size += len(response)
...
 # Copyright (c) OpenMMLab. All rights reserved.
 import os.path as osp
-from typing import Sequence, Union
+from typing import Sequence

 import torch
-from torch.nn.utils.rnn import pad_sequence


-class Tokenizer:
-    """Tokenize prompts or de-tokenize tokens into texts.
+class SentencePieceTokenizer:
+    """Tokenizer of sentencepiece.

     Args:
         model_file (str): the path of the tokenizer model
     """

     def __init__(self, model_file: str):
-        if model_file.endswith('.model'):
-            model_folder = osp.split(model_file)[0]
-        else:
-            model_folder = model_file
-        tokenizer_config_file = osp.join(model_folder, 'tokenizer_config.json')
-        model_file_exists = osp.exists(model_file)
-        config_exists = osp.exists(tokenizer_config_file)
-        use_hf_model = not config_exists or not model_file_exists
-
-        self.use_hf_model = use_hf_model
-        if not self.use_hf_model:
-            from sentencepiece import SentencePieceProcessor
-            self.model = SentencePieceProcessor(model_file=model_file)
-            self.vocab_size = self.model.vocab_size()
-            self.bos_token_id = self.model.bos_id()
-            self.eos_token_id = self.model.eos_id()
-        else:
-            from transformers import AutoTokenizer
-            backend_tokenizer_file = osp.join(model_folder, 'tokenizer.json')
-            if not osp.exists(backend_tokenizer_file) and model_file_exists:
-                print('WARNING: Can not find tokenizer.json. '
-                      'It may take long time to initialize the tokenizer.')
-            self.model = AutoTokenizer.from_pretrained(model_folder)
-            self.vocab_size = self.model.vocab_size
-            self.bos_token_id = self.model.bos_token_id
-            self.eos_token_id = self.model.eos_token_id
-            # save tokenizer.json to reuse
-            if not osp.exists(backend_tokenizer_file) and model_file_exists:
-                self.model.backend_tokenizer.save(backend_tokenizer_file)
+        from sentencepiece import SentencePieceProcessor
+        self.model = SentencePieceProcessor(model_file=model_file)
+
+    @property
+    def vocab_size(self):
+        """vocabulary size."""
+        return self.model.vocab_size()
+
+    @property
+    def bos_token_id(self):
+        """begin of the sentence token id."""
+        return self.model.bos_id()
+
+    @property
+    def eos_token_id(self):
+        """end of the sentence token id."""
+        return self.model.eos_id()

     def encode(self, s: str):
         """Tokenize a prompt.
@@ -53,25 +39,15 @@ class Tokenizer:
         Returns:
             list[int]: token ids
         """
-        if not self.use_hf_model:
-            add_bos = False
-            add_eos = False
-            if s.find('<BOS>') != -1:
-                s = s.replace('<BOS>', '')
-                add_bos = True
-            if s == '<EOS>':
-                s = ''
-                add_eos = True
-            return self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)
-        else:
-            add_special_tokens = False
-            if s.find('<BOS>') != -1:
-                s = s.replace('<BOS>', '<s>')
-            if s == '<EOS>':
-                s = '</s>'
-            if len(s) == 0:
-                add_special_tokens = True
-            return self.model.encode(s, add_special_tokens=add_special_tokens)
+        add_bos = False
+        add_eos = False
+        if s.find('<BOS>') != -1:
+            s = s.replace('<BOS>', '')
+            add_bos = True
+        if s == '<EOS>':
+            s = ''
+            add_eos = True
+        return self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)

     def decode(self, t: Sequence[int]):
         """De-tokenize.
@@ -81,85 +57,132 @@ class Tokenizer:
         Returns:
             str: text of decoding tokens
         """
-        if not self.use_hf_model:
-            return self.model.Decode(t)
-        else:
-            skip_special_tokens = False
-            return self.model.decode(t,
-                                     skip_special_tokens=skip_special_tokens)
-
-
-class Preprocessor:
-    """Tokenize prompts.
-
-    Args:
-        tokenizer (Tokenizer): an instance of tokenizer
-    """
-
-    def __init__(self, tokenizer: Tokenizer):
-        self.tokenizer = tokenizer
-        self.bos_token_id = tokenizer.bos_token_id
-        self.eos_token_id = tokenizer.eos_token_id
-
-    def __call__(self, *args, **kwargs):
-        return self.infer(*args, **kwargs)
-
-    def infer(self, prompts: Union[str, Sequence[str]]) -> tuple:
-        """Tokenize the input prompts.
-
-        Args:
-            prompts(str | Sequence[str]): user's prompt, or a batch prompts
-
-        Returns:
-            Tuple(torch.Tensor, torch.Tensor): prompt's token
-            ids, ids' length and requested output length
-        """
-        if isinstance(prompts, str):
-            _ = [[prompts]]
-        elif isinstance(prompts, Sequence):
-            _ = [[prompt] for prompt in prompts]
-        else:
-            assert 0, f'str or Sequence[str] prompts are expected but got ' \
-                      f'{type(prompts)}'
-
-        start_ids = [
-            torch.IntTensor(self.tokenizer.encode(prompt))
-            for prompt in prompts
-        ]
-        start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])
-        start_ids = pad_sequence(start_ids,
-                                 batch_first=True,
-                                 padding_value=self.eos_token_id)
-        return start_ids, start_lengths
-
-
-class Postprocessor:
-    """De-tokenize token ids.
-
-    Args:
-        tokenizer (Tokenizer): an instance of tokenizer
-    """
-
-    def __init__(self, tokenizer: Tokenizer):
-        self.tokenizer = tokenizer
-        self.bos_token_id = tokenizer.bos_token_id
-        self.eos_token_id = tokenizer.eos_token_id
-
-    def __call__(self, *args, **kwargs):
-        return self.infer(*args, **kwargs)
-
-    def infer(self, output_ids: torch.Tensor, seqlen: torch.Tensor):
-        """De-tokenize tokens for text.
-
-        Args:
-            output_ids(torch.Tensor): tokens' id
-            seqlen(torch.Tensor): sequence length
-
-        Returns:
-            str: decoded tokens
-        """
-        outputs = []
-        for tokens, _len in zip(output_ids, seqlen):
-            output = self.tokenizer.decode(tokens[:_len])
-            outputs.append(output)
-        return outputs
+        if isinstance(t, torch.Tensor):
+            t = t.tolist()
+        return self.model.Decode(t)
+
+
+class HuggingFaceTokenizer:
+    """Tokenizer of huggingface.
+
+    Args:
+        model_dir (str): the directory of the tokenizer model
+    """
+
+    def __init__(self, model_dir: str):
+        from transformers import AutoTokenizer
+        model_file = osp.join(model_dir, 'tokenizer.model')
+        backend_tokenizer_file = osp.join(model_dir, 'tokenizer.json')
+        model_file_exists = osp.exists(model_file)
+        if not osp.exists(backend_tokenizer_file) and model_file_exists:
+            print('WARNING: Can not find tokenizer.json. '
+                  'It may take long time to initialize the tokenizer.')
+        self.model = AutoTokenizer.from_pretrained(model_dir,
+                                                   trust_remote_code=True)
+        # save tokenizer.json to reuse
+        if not osp.exists(backend_tokenizer_file) and model_file_exists:
+            if hasattr(self.model, 'backend_tokenizer'):
+                self.model.backend_tokenizer.save(backend_tokenizer_file)
+
+    @property
+    def vocab_size(self):
+        """vocabulary size."""
+        return self.model.vocab_size
+
+    @property
+    def bos_token_id(self):
+        """begin of the sentence token id."""
+        return self.model.bos_token_id
+
+    @property
+    def eos_token_id(self):
+        """end of the sentence token id."""
+        return self.model.eos_token_id
+
+    def encode(self, s: str):
+        """Tokenize a prompt.
+
+        Args:
+            s (str): a prompt
+        Returns:
+            list[int]: token ids
+        """
+        add_special_tokens = False
+        if s.find('<BOS>') != -1:
+            s = s.replace('<BOS>', '<s>')
+        if s == '<EOS>':
+            s = '</s>'
+        if len(s) == 0:
+            add_special_tokens = True
+        return self.model.encode(s, add_special_tokens=add_special_tokens)
+
+    def decode(self, t: Sequence[int]):
+        """De-tokenize.
+
+        Args:
+            t (List[int]): a list of token ids
+        Returns:
+            str: text of decoding tokens
+        """
+        skip_special_tokens = True
+        return self.model.decode(t, skip_special_tokens=skip_special_tokens)
+
+
+class Tokenizer:
+    """Tokenize prompts or de-tokenize tokens into texts.
+
+    Args:
+        model_file (str): the path of the tokenizer model
+    """
+
+    def __init__(self, model_file: str):
+        if model_file.endswith('.model'):
+            model_folder = osp.split(model_file)[0]
+        else:
+            model_folder = model_file
+            model_file = osp.join(model_folder, 'tokenizer.model')
+        tokenizer_config_file = osp.join(model_folder, 'tokenizer_config.json')
+
+        model_file_exists = osp.exists(model_file)
+        config_exists = osp.exists(tokenizer_config_file)
+        use_hf_model = config_exists or not model_file_exists
+
+        if not use_hf_model:
+            self.model = SentencePieceTokenizer(model_file)
+        else:
+            self.model = HuggingFaceTokenizer(model_folder)
+
+    @property
+    def vocab_size(self):
+        """vocabulary size."""
+        return self.model.vocab_size
+
+    @property
+    def bos_token_id(self):
+        """begin of the sentence token id."""
+        return self.model.bos_token_id
+
+    @property
+    def eos_token_id(self):
+        """end of the sentence token id."""
+        return self.model.eos_token_id
+
+    def encode(self, s: str):
+        """Tokenize a prompt.
+
+        Args:
+            s (str): a prompt
+        Returns:
+            list[int]: token ids
+        """
+        return self.model.encode(s)
+
+    def decode(self, t: Sequence[int]):
+        """De-tokenize.
+
+        Args:
+            t (List[int]): a list of token ids
+        Returns:
+            str: text of decoding tokens
+        """
+        return self.model.decode(t)
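The SentencePiece and Hugging Face backends above share the same <BOS>/<EOS> convention, which the Tokenizer facade forwards unchanged. A minimal sketch of that behaviour, assuming a converted workspace at an illustrative path:

from lmdeploy.turbomind import Tokenizer

tok = Tokenizer('./workspace/triton_models/tokenizer')  # hypothetical path
with_bos = tok.encode('<BOS>hello')  # '<BOS>' prefix requests a BOS token before the text
plain = tok.encode('hello')          # ordinary text gets no special tokens
eos_only = tok.encode('<EOS>')       # the bare string '<EOS>' maps to a single EOS token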
 # Copyright (c) OpenMMLab. All rights reserved.
 import os.path as osp
 import sys
+from configparser import ConfigParser
+from contextlib import contextmanager
 from queue import Queue
 from threading import Thread
 from typing import Iterable, List
@@ -53,47 +55,80 @@ def _tm_dict_to_torch_dict(tm_dict: _tm.TensorMap):
     return ret


+@contextmanager
+def cuda_ctx(device_id):
+    old_device = torch.cuda.current_device()
+    torch.cuda.set_device(device_id)
+    yield
+    torch.cuda.set_device(old_device)
+
+
 class TurboMind:
     """LMDeploy's inference engine.

     Args:
         model_path (str): the path of turbomind's model
         data_type (str): the data type
-        session_len (int): the max length of a session
         eos_id (int): eos token id
         stop_words (List[int]): token ids of stop-words
-        device_id (int): the id of a gpu card
-        node_id (int): the id of a node
-        device_num (int): the number of gpu cards
-        node_num (int): the number of node
     """

     def __init__(self,
                  model_path: str,
                  data_type: str = 'fp16',
-                 session_len: int = 2048,
                  eos_id: int = 2,
-                 stop_words: List[int] = None,
-                 device_id: int = 0,
-                 node_id: int = 0,
-                 device_num: int = 1,
-                 node_num: int = 1):
+                 stop_words: List[int] = None):
         self.eos_id = eos_id

-        # create model instance
+        # TODO: support mpi
+        node_id = 0
+        node_num = 1
+
+        # read meta from model path
+        self.gpu_count = 1
+        self.session_len = 2048
+        ini_path = osp.join(model_path, 'triton_models/weights/config.ini')
+        with open(ini_path, 'r') as f:
+            parser = ConfigParser()
+            parser.read_file(f)
+            section_name = ''
+            if 'turbomind' in parser:
+                section_name = 'turbomind'
+            elif 'llama' in parser:
+                section_name = 'llama'
+
+            if len(section_name) > 0:
+                self.gpu_count = parser.getint(section_name,
+                                               'tensor_para_size')
+                self.session_len = parser.getint(section_name, 'session_len')
+
+        # params
         self.node_id = node_id
         self.node_num = node_num
-        self.gpu_count = device_num
-        self.device_id = device_id
         self.world_size = self.node_num * self.gpu_count
-        self.rank = self.node_id * self.gpu_count + self.device_id
-        self.session_len = session_len

+        # create model
         weight_dir = osp.join(model_path, 'triton_models', 'weights')
         model = _tm.AbstractTransformerModel.create_llama_model(
             weight_dir, tensor_para_size=self.gpu_count, data_type=data_type)
-        model.create_shared_weights(self.device_id, self.rank)
         self.model = model
+        self.nccl_params = model.create_nccl_params(self.node_id)
+        torch.cuda.synchronize()
+
+        # create weight
+        def _create_weight(device_id):
+            with cuda_ctx(device_id):
+                rank = self.node_id * self.gpu_count + device_id
+                model.create_shared_weights(device_id, rank)
+
+        threads = []
+        for device_id in range(self.gpu_count):
+            t = Thread(target=_create_weight, args=(device_id, ))
+            t.start()
+            threads.append(t)
+        for t in threads:
+            t.join()

         self.stop_words = _stop_words(stop_words)

     def create_instance(self, cuda_stream_id=0):
@@ -117,40 +152,57 @@ class TurboMindInstance:

     def __init__(self, tm_model, cuda_stream_id=0):
         self.tm_model = tm_model
-        self.device_id = tm_model.device_id
-        self.rank = tm_model.rank
+        self.cuda_stream_id = cuda_stream_id
+
+        self.node_id = tm_model.node_id
+        self.gpu_count = tm_model.gpu_count
+
         self.stop_words = tm_model.stop_words
         self.eos_id = tm_model.eos_id
         self.session_len = tm_model.session_len
-        self.cuda_stream_id = cuda_stream_id

-        # create instance
-        model = tm_model.model
-        nccl_params = model.create_nccl_params(tm_model.node_id)
-        custom_comms = model.create_custom_comms(tm_model.world_size)
-        instance_comm = model.create_instance_comm(tm_model.gpu_count)
+        self.nccl_params = tm_model.nccl_params
+        self.instance_comm = tm_model.model.create_instance_comm(
+            self.gpu_count)

-        model_inst = model.create_model_instance(self.device_id, self.rank,
-                                                 self.cuda_stream_id,
-                                                 nccl_params, custom_comms[0])
-        # model_inst.register_callback(self._forward_callback)
-        self.model_inst = model_inst
-        self.instance_comm = instance_comm
+        # create model instances
+        model_insts = [None] * self.gpu_count
+        threads = []
+        for device_id in range(self.gpu_count):
+            t = Thread(target=self._create_model_instance,
+                       args=(device_id, model_insts))
+            t.start()
+            threads.append(t)
+        for t in threads:
+            t.join()
+        self.model_insts = model_insts

         self.que = Queue()
-        self.thread = None
+        self.threads = [None] * self.gpu_count
+
+    def _create_model_instance(self, device_id, model_insts):
+        with cuda_ctx(device_id):
+            rank = self.node_id * self.gpu_count + device_id
+            model_inst = self.tm_model.model.create_model_instance(
+                device_id, rank, self.cuda_stream_id, self.nccl_params)
+            model_insts[device_id] = model_inst

     def _forward_callback(self, result, ctx):
         self.que.put((False, result))

     def _forward_thread(self, inputs):

-        def _func():
-            output = self.model_inst.forward(inputs, self.instance_comm)
-            self.que.put((True, output))
+        def _func(device_id, enque_output):
+            with cuda_ctx(device_id):
+                output = self.model_insts[device_id].forward(
+                    inputs, self.instance_comm)
+                if enque_output:
+                    self.que.put((True, output))

-        self.thread = Thread(target=_func)
-        self.thread.start()
+        for device_id in range(self.gpu_count):
+            t = Thread(target=_func, args=(device_id, device_id == 0))
+            t.start()
+            self.threads[device_id] = t

     def stream_infer(self,
                      session_id,
@@ -190,7 +242,7 @@ class TurboMindInstance:
             stream_output (bool): indicator for stream output
         """
         if stream_output:
-            self.model_inst.register_callback(self._forward_callback)
+            self.model_insts[0].register_callback(self._forward_callback)

         if len(input_ids) == 0:
             input_ids = []
@@ -281,10 +333,11 @@ class TurboMindInstance:
                        for output, l in zip(output_ids, sequence_length)]

             if finish:
+                for t in self.threads:
+                    t.join()
                 while self.que.qsize() > 0:
                     self.que.get()
-                self.thread.join()
                 break

         if stream_output:
-            self.model_inst.unregister_callback()
+            self.model_insts[0].unregister_callback()
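Note that the tensor-parallel degree is no longer a constructor argument: TurboMind.__init__ above reads tensor_para_size and session_len from the workspace's triton_models/weights/config.ini, checking for a [turbomind] section first and falling back to [llama]. An illustrative fragment of such a file (values are placeholders; other keys omitted):

[llama]
tensor_para_size = 2
session_len = 2048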
@@ -14,7 +14,7 @@ endif()

 pybind11_add_module(${PROJECT_NAME} bind.cpp)
 target_link_libraries(${PROJECT_NAME} PRIVATE TransformerTritonBackend
-                                              LlamaTritonBackend custom_ar_comm memory_utils)
+                                              LlamaTritonBackend)
 target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_14)
 set_target_properties(${PROJECT_NAME} PROPERTIES
...
#include "src/turbomind/python/dlpack.h" #include "src/turbomind/python/dlpack.h"
#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h" #include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
#include "src/turbomind/triton_backend/transformer_triton_backend.hpp" #include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
#include "src/turbomind/utils/nccl_utils.h"
#include <cuda_runtime.h>
#include <memory> #include <memory>
#include <pybind11/functional.h> #include <pybind11/functional.h>
#include <pybind11/pybind11.h> #include <pybind11/pybind11.h>
...@@ -26,7 +28,14 @@ std::shared_ptr<T> make_shared_nodel(T data) ...@@ -26,7 +28,14 @@ std::shared_ptr<T> make_shared_nodel(T data)
DLDevice getDLDevice(triton::Tensor& tensor) DLDevice getDLDevice(triton::Tensor& tensor)
{ {
DLDevice device{.device_id = 0}; int device_id = 0;
if (tensor.where == triton::MEMORY_GPU) {
cudaPointerAttributes ptr_attr;
cudaPointerGetAttributes(&ptr_attr, tensor.data);
device_id = ptr_attr.device;
}
DLDevice device{.device_id = device_id};
switch (tensor.where) { switch (tensor.where) {
case triton::MEMORY_CPU: case triton::MEMORY_CPU:
...@@ -204,7 +213,6 @@ std::shared_ptr<triton::Tensor> DLManagedTensorToTritonTensor(DLManagedTensor* t ...@@ -204,7 +213,6 @@ std::shared_ptr<triton::Tensor> DLManagedTensorToTritonTensor(DLManagedTensor* t
PYBIND11_MODULE(_turbomind, m) PYBIND11_MODULE(_turbomind, m)
{ {
// nccl param // nccl param
py::class_<ft::NcclParam>(m, "NcclParam") py::class_<ft::NcclParam>(m, "NcclParam")
.def(py::init<int, int>(), "rank"_a = 0, "world_size"_a = 1) .def(py::init<int, int>(), "rank"_a = 0, "world_size"_a = 1)
...@@ -320,7 +328,6 @@ PYBIND11_MODULE(_turbomind, m) ...@@ -320,7 +328,6 @@ PYBIND11_MODULE(_turbomind, m)
// transformer model // transformer model
py::class_<AbstractTransformerModel, std::shared_ptr<AbstractTransformerModel>>(m, "AbstractTransformerModel") py::class_<AbstractTransformerModel, std::shared_ptr<AbstractTransformerModel>>(m, "AbstractTransformerModel")
// .def_static("create_llama_model", &AbstractTransformerModel::createLlamaModel, "model_dir"_a)
.def_static( .def_static(
"create_llama_model", "create_llama_model",
[](std::string model_dir, [](std::string model_dir,
...@@ -349,7 +356,7 @@ PYBIND11_MODULE(_turbomind, m) ...@@ -349,7 +356,7 @@ PYBIND11_MODULE(_turbomind, m)
"multi_node"_a = false) "multi_node"_a = false)
.def( .def(
"create_custom_comms", "create_custom_comms",
[](std::shared_ptr<AbstractTransformerModel>& model, int world_size) { [](AbstractTransformerModel* model, int world_size) {
std::vector<std::shared_ptr<ft::AbstractCustomComm>> ret; std::vector<std::shared_ptr<ft::AbstractCustomComm>> ret;
model->createCustomComms(&ret, world_size); model->createCustomComms(&ret, world_size);
return ret; return ret;
...@@ -358,7 +365,7 @@ PYBIND11_MODULE(_turbomind, m) ...@@ -358,7 +365,7 @@ PYBIND11_MODULE(_turbomind, m)
.def("create_instance_comm", &AbstractTransformerModel::createInstanceComm, "size"_a) .def("create_instance_comm", &AbstractTransformerModel::createInstanceComm, "size"_a)
.def( .def(
"create_model_instance", "create_model_instance",
[](std::shared_ptr<AbstractTransformerModel>& model, [](AbstractTransformerModel* model,
int deviceId, int deviceId,
int rank, int rank,
long stream_id, long stream_id,
...@@ -367,12 +374,17 @@ PYBIND11_MODULE(_turbomind, m) ...@@ -367,12 +374,17 @@ PYBIND11_MODULE(_turbomind, m)
cudaStream_t stream = reinterpret_cast<cudaStream_t>(stream_id); cudaStream_t stream = reinterpret_cast<cudaStream_t>(stream_id);
return model->createModelInstance(deviceId, rank, stream, nccl_params, custom_all_reduce_comm); return model->createModelInstance(deviceId, rank, stream, nccl_params, custom_all_reduce_comm);
}, },
py::call_guard<py::gil_scoped_release>(),
"device_id"_a, "device_id"_a,
"rank"_a, "rank"_a,
"stream"_a, "stream"_a,
"nccl_params"_a, "nccl_params"_a,
"custom_all_reduce_comm"_a = nullptr) "custom_all_reduce_comm"_a = nullptr)
.def("create_shared_weights", &AbstractTransformerModel::createSharedWeights, "device_id"_a, "rank"_a) .def("create_shared_weights",
&AbstractTransformerModel::createSharedWeights,
py::call_guard<py::gil_scoped_release>(),
"device_id"_a,
"rank"_a)
.def("__str__", &AbstractTransformerModel::toString) .def("__str__", &AbstractTransformerModel::toString)
.def("__repr__", &AbstractTransformerModel::toString) .def("__repr__", &AbstractTransformerModel::toString)
.def("get_tensor_para_size", &AbstractTransformerModel::getTensorParaSize) .def("get_tensor_para_size", &AbstractTransformerModel::getTensorParaSize)
......
@@ -283,7 +283,7 @@ export(PACKAGE TritonTurboMindBackend)
 # limitations under the License.

 add_library(TransformerTritonBackend SHARED transformer_triton_backend.cpp)
-target_link_libraries(TransformerTritonBackend PRIVATE nccl_utils mpi_utils)
+target_link_libraries(TransformerTritonBackend PRIVATE nccl_utils)
 install(TARGETS TransformerTritonBackend DESTINATION ${CMAKE_INSTALL_LIBDIR})

 add_subdirectory(llama)
@@ -39,9 +39,6 @@ AbstractTransformerModel::createNcclParams(const int node_id, const int device_i
                 ft::ftNcclGetUniqueId(nccl_ids[i]);
             }
         }
-        for (size_t i = 0; i < nccl_ids.size(); i++) {
-            ft::mpi::bcast(&nccl_ids[i], sizeof(nccl_ids[i]), ft::mpi::MPI_TYPE_BYTE, 0, ft::mpi::COMM_WORLD);
-        }
     }

     std::vector<ft::NcclParam> tensor_para_params(local_comm_size);
...
@@ -29,7 +29,6 @@
 #include "src/turbomind/utils/Tensor.h"
 #include "src/turbomind/utils/custom_ar_comm.h"
 #include "src/turbomind/utils/instance_comm.h"
-#include "src/turbomind/utils/mpi_utils.h"
 #include "src/turbomind/utils/nccl_utils.h"

 namespace ft = turbomind;
...
@@ -64,7 +64,7 @@ add_library(nccl_utils STATIC nccl_utils.cc)
 set_property(TARGET nccl_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
 set_property(TARGET nccl_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
 if (BUILD_MULTI_GPU)
-    target_link_libraries(nccl_utils PUBLIC ${NCCL_LIBRARIES} mpi_utils logger)
+    target_link_libraries(nccl_utils PUBLIC ${NCCL_LIBRARIES} logger)
 endif()

 add_library(cublasINT8MMWrapper STATIC cublasINT8MMWrapper.cc)
...
@@ -306,118 +306,6 @@ void ftNcclParamDestroy(NcclParam& param)
#endif
}
void ftNcclInitialize(NcclParam& tensor_para,
NcclParam& pipeline_para,
const int tensor_para_size,
const int pipeline_para_size)
{
TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
// Initialize nccl communication grid of tensor and pipeline parallel groups.
#ifndef BUILD_MULTI_GPU
FT_CHECK_WITH_INFO(tensor_para_size == 1,
fmtstr("tensor_para_size=%d although BUILD_MULTI_GPU is disabled. "
"Please use the cmake flag -DBUILD_MULTI_GPU=ON if you want "
"to use tensor/pipeline parallelism.",
tensor_para_size));
FT_CHECK_WITH_INFO(pipeline_para_size == 1,
fmtstr("pipeline_para_size=%d although BUILD_MULTI_GPU is disabled. "
"Please use the cmake flag -DBUILD_MULTI_GPU=ON if you want "
"to use tensor/pipeline parallelism.",
pipeline_para_size));
tensor_para.rank_ = 0;
tensor_para.world_size_ = tensor_para_size;
pipeline_para.rank_ = 0;
pipeline_para.world_size_ = pipeline_para_size;
#else
// Initialize a nccl communicator.
if (tensor_para.nccl_comm_ != nullptr && pipeline_para.nccl_comm_ != nullptr) {
TM_LOG_WARNING("NcclParam is already initialized. Skip NCCL initialization.");
return;
}
FT_CHECK(tensor_para.nccl_comm_ == nullptr);
FT_CHECK(pipeline_para.nccl_comm_ == nullptr);
FT_CHECK(tensor_para_size > 0);
FT_CHECK(pipeline_para_size > 0);
if (tensor_para_size == 1 && pipeline_para_size == 1) {
TM_LOG_WARNING("Skip NCCL initialization since requested tensor/pipeline parallel sizes are equals to 1.");
tensor_para.rank_ = 0;
tensor_para.world_size_ = tensor_para_size;
pipeline_para.rank_ = 0;
pipeline_para.world_size_ = pipeline_para_size;
return;
}
int mpi_initialized;
MPICHECK(MPI_Initialized(&mpi_initialized));
FT_CHECK_WITH_INFO(mpi_initialized, "Fail to nccl initialization because MPI is not initialized.");
int rank, world_size;
MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &rank));
MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &world_size));
FT_CHECK_WITH_INFO(tensor_para_size * pipeline_para_size <= world_size,
fmtstr("tensor_para_size (%d) * pipeline_para_size (%d) should equal to the world size (%d).",
tensor_para_size,
pipeline_para_size,
world_size));
// Convert WORLD communicator into 2D grid (k * n) communicator.
// row = a tensor parallel group, col = a pipeline parallel group.
MPI_Comm grid_comm, tp_comm, pp_comm;
int dims[2] = {pipeline_para_size, tensor_para_size};
int periods[2] = {0, 0};
MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &grid_comm);
// Split 2D communicator into rows and cols.
int tp_remain_dims[2] = {false, true};
int pp_remain_dims[2] = {true, false};
MPI_Cart_sub(grid_comm, tp_remain_dims, &tp_comm);
MPI_Cart_sub(grid_comm, pp_remain_dims, &pp_comm);
int tp_rank, pp_rank;
MPI_Comm_rank(tp_comm, &tp_rank);
MPI_Comm_rank(pp_comm, &pp_rank);
ncclUniqueId tp_uid;
ncclUniqueId pp_uid;
// The root of each group creates a nccl uid.
if (tp_rank == 0) {
TM_LOG_DEBUG("rank %d pp rank %d creates nccl uid.", rank, tp_rank);
NCCLCHECK(ncclGetUniqueId(&tp_uid));
}
if (pp_rank == 0) {
TM_LOG_DEBUG("rank %d pp rank %d creates nccl uid.", rank, pp_rank);
NCCLCHECK(ncclGetUniqueId(&pp_uid));
}
// Broadcast nccl uid to share the same nccl uid across gpus in the same group.
TM_LOG_DEBUG("Broadcast nccl uid to the others in the same parallel groups.");
MPI_Bcast(&tp_uid, sizeof(tp_uid), MPI_BYTE, 0, tp_comm);
MPI_Bcast(&pp_uid, sizeof(pp_uid), MPI_BYTE, 0, pp_comm);
TM_LOG_DEBUG("Initialize NCCL communicators.");
ncclComm_t tp_nccl_comm, pp_nccl_comm;
NCCLCHECK(ncclCommInitRank(&tp_nccl_comm, tensor_para_size, tp_uid, tp_rank));
NCCLCHECK(ncclCommInitRank(&pp_nccl_comm, pipeline_para_size, pp_uid, pp_rank));
tensor_para.world_size_ = tensor_para_size;
tensor_para.rank_ = tp_rank;
tensor_para.nccl_uid_ = tp_uid;
tensor_para.nccl_comm_ = tp_nccl_comm;
pipeline_para.world_size_ = pipeline_para_size;
pipeline_para.rank_ = pp_rank;
pipeline_para.nccl_uid_ = pp_uid;
pipeline_para.nccl_comm_ = pp_nccl_comm;
TM_LOG_INFO("NCCL initialized rank=%d world_size=%d tensor_para=%s pipeline_para=%s",
rank,
world_size,
tensor_para.toString().c_str(),
pipeline_para.toString().c_str());
#endif
TM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
}
static std::atomic<int>& ncclGroupCount()
{
    static std::atomic<int> value{};
...
@@ -18,11 +18,9 @@
 #include "src/turbomind/utils/cuda_utils.h"
 #include "src/turbomind/utils/logger.h"
-#include "src/turbomind/utils/mpi_utils.h"

 #include <cuda_runtime.h>
 #ifdef BUILD_MULTI_GPU
-#include <mpi.h>
 #include <nccl.h>
 #endif
 #include <stdio.h>
@@ -118,11 +116,6 @@ void ftNcclGetUniqueId(NcclUid& uid);
 void ftNcclCommInitRank(NcclParam& param, const int rank, const int world_size, const NcclUid uid);
 void ftNcclParamDestroy(NcclParam& param);

-void ftNcclInitialize(NcclParam& tensor_para,
-                      NcclParam& pipeline_para,
-                      const int tensor_para_size,
-                      const int pipeline_para_size);
-
 int ftNcclNextGroupId();
 int ftNcclGroupCount();
...
-from lmdeploy.turbomind.tokenizer import Postprocessor, Preprocessor, Tokenizer
+from lmdeploy.turbomind.tokenizer import Tokenizer


 def main():
     tokenizer = Tokenizer('huggyllama/llama-7b')
-    preprocessor = Preprocessor(tokenizer)
-    postprocessor = Postprocessor(tokenizer)

     prompts = ['cest la vie', '上帝已死']
-    tokens = preprocessor(prompts)
-    print(tokens)
-
-    decode_prompts = postprocessor(*tokens)
-    print(decode_prompts)
+    for prompt in prompts:
+        tokens = tokenizer.encode(prompt)
+        output = tokenizer.decode(tokens)
+        print(output)


 if __name__ == '__main__':
...