Python ffi (#34)

* wip * wip * example finish * fix include and namespace * wtf * install lib * batchize * update cmake install * multithread * fix comment * fix * add mmengine * bind llamamodel --------- Co-authored-by: grimoire <yaoqian@pjlab.org.cn>

Python ffi (#34)
* wip * wip * example finish * fix include and namespace * wtf * install lib * batchize * update cmake install * multithread * fix comment * fix * add mmengine * bind llamamodel --------- Co-authored-by: grimoire <yaoqian@pjlab.org.cn>
4fd6e710 · q.yao · GitHub · 5ea40abf · 4fd6e710 · 4fd6e710
Unverified Commit 4fd6e710 authored Jul 05, 2023 by q.yao Committed by GitHub Jul 05, 2023
15 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -5,4 +5,7 @@ __pycache__/
 workspace/
 .cache
 *build*/
+lmdeploy/lib/
+dist/
 examples/cpp/llama/*.csv
+
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,6 +39,7 @@ endif()
 if(NOT USE_TRITONSERVER_DATATYPE)
  option(USE_TRITONSERVER_DATATYPE "Build triton backend for triton server" OFF)
 endif()
+option(BUILD_PY_FFI "Build python ffi" ON)

 include(FetchContent)

@@ -388,6 +389,13 @@ install(
    ${INSTALL_CONFIGDIR}
 )

+# install python api (the _turbomind target only exists when BUILD_PY_FFI is ON)
+if(BUILD_PY_FFI)
+    install(TARGETS _turbomind DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/lib)
+endif()
+install(TARGETS TransformerTritonBackend DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/lib)
+
+
 export(
  EXPORT
    transformer-shared-targets

--- a/MANIFEST.in
+++ b/MANIFEST.in
+
+include lmdeploy/lib/*.so
+include lmdeploy/lib/*.so*
+include lmdeploy/lib/*.dll
+include lmdeploy/lib/*.pyd
--- a/benchmark/profile_generation.py
+++ b/benchmark/profile_generation.py
-import multiprocessing as mp
+# import multiprocessing as mp
+from threading import Thread
+from queue import Queue
 import time

 import fire
 import numpy as np

-from lmdeploy.serve.turbomind.chatbot import Chatbot
+from lmdeploy.turbomind import TurboMind
+from lmdeploy.model import MODELS
+from transformers import AutoTokenizer


-def infer(chatbot, session_id: int, prompt: str, output_seqlen: int,
-          test_round: int, que: mp.Queue):
+def infer(model, session_id: int, input_ids: list, output_seqlen: int,
+          test_round: int, que: Queue):
+    """Benchmark one session and push its per-round statistics to ``que``.
+
+    Args:
+        model: object exposing ``create_instance()``; the returned instance
+            must provide ``stream_infer``.
+        session_id (int): id of the session to run.
+        input_ids (list): token ids of the prompt.
+        output_seqlen (int): number of tokens requested per round.
+        test_round (int): number of rounds to run.
+        que (Queue): receives ``(session_id, stats)`` where every entry of
+            ``stats`` is ``[first_token_latency, token, token_latency]``.
+    """
+    chatbot = model.create_instance()
    stats = []
    for i in range(test_round):
-        timestamps = []
-        tokens = []
        start = time.perf_counter()
-        for status, res, token in chatbot.stream_infer(
+        # seed both series with the starting point so that the differences
+        # taken below are measured relative to the beginning of the round
+        timestamps = [start]
+        tokens = [0]
+        for outputs in chatbot.stream_infer(
                session_id,
-                prompt,
+                input_ids,
                request_output_len=output_seqlen,
                sequence_start=True,
-                sequence_end=True):
+                sequence_end=True,
+                ignore_eos=True):
+            # only the running token count is needed for the statistics
+            _, token = outputs[0]
            timestamps.append(time.perf_counter())
            tokens.append(token)

-        first_token_latency = timestamps[0] - start
+        # TODO: ignore first token
+        first_token_latency = timestamps[1] - start
        token_latency = timestamps[-1] - timestamps[0]
        token = tokens[-1] - tokens[0]
        stats.append([first_token_latency, token, token_latency])
-        chatbot.reset_session()
    que.put((session_id, stats))


-def warmup(tritonserver_addr: str,
-           model_name: str,
+def warmup(model,
           concurrency: int,
           session_len: int,
           output_seqlen: int,
           warmup_round: int = 4):
+    """Run ``warmup_round`` dummy generations on ``concurrency`` threads.
+
+    Every thread creates its own instance from ``model`` and generates
+    ``output_seqlen`` tokens from the single input token ``[1]``.
+    ``session_len`` is accepted for interface compatibility but not used.
+    """
    print('start to warmup ...')

-    def _infer(_chatbot, session_id):
+    def _infer(model, session_id):
+        chatbot = model.create_instance()
        for _ in range(warmup_round):
-            for _, _, _ in chatbot.stream_infer(
+            for _ in chatbot.stream_infer(
                    session_id,
-                    prompt='',
+                    input_ids=[1],
                    request_output_len=output_seqlen,
                    sequence_start=True,
-                    sequence_end=True):
+                    sequence_end=True,
+                    ignore_eos=True):
                continue
-            chatbot.reset_session()

    _start = time.perf_counter()
-    chatbots = [
-        Chatbot(tritonserver_addr=tritonserver_addr,
-                model_name=model_name,
-                session_len=session_len,
-                ignore_eos=True,
-                profile_generation=True) for _ in range(concurrency)
-    ]
    procs = []
-    for i, chatbot in enumerate(chatbots):
-        proc = mp.Process(target=_infer, args=(chatbot, i + 1))
+    for i in range(concurrency):
+        proc = Thread(target=_infer, args=(model, i + 1))
        procs.append(proc)
        proc.start()
-    for proc in procs:
-        proc.join()
+
+    try:
+        for proc in procs:
+            proc.join()
+    except Exception:
+        # threading.Thread has no stop(); running workers cannot be
+        # cancelled, so only report the failure through the exit status
+        exit(1)
    _end = time.perf_counter()
    print(f'end warmup, elapsed time: {round(_end - _start, 2)}s')


-def main(tritonserver_addr: str,
+def main(model_path: str,
         model_name: str,
+         tokenlizer: str,
         concurrency: int = 1,
         session_len: int = 2056,
         input_seqlen: int = 0,
         output_seqlen: int = 512,
         test_round: int = 10):
-    warmup(tritonserver_addr, model_name, concurrency, session_len,
+    # `tokenlizer` is the name or path handed to AutoTokenizer.from_pretrained
+    tokenizer = AutoTokenizer.from_pretrained(tokenlizer)
+    model = MODELS.get(model_name)()
+    stop_words = model.stop_words
+    tm_model = TurboMind(model_path=model_path, stop_words=stop_words)
+
+    warmup(tm_model, concurrency, session_len,
           output_seqlen)

    # make up a prompt that can be tokenized into {input_seqlen} tokens
    prompt = '' if input_seqlen == 0 else 'hi' + ' hi' * (input_seqlen - 1)
-    que = mp.Queue()
+    input_ids = tokenizer.encode(prompt)
+    que = Queue()
    procs = []
    _start = time.perf_counter()
+
+    # TODO: update to the multithread version
    for i in range(concurrency):
-        chatbot = Chatbot(tritonserver_addr=tritonserver_addr,
-                          model_name=model_name,
-                          session_len=session_len,
-                          ignore_eos=True,
-                          profile_generation=True)
-        proc = mp.Process(target=infer,
-                          args=(chatbot, i + 1, prompt, output_seqlen,
+        proc = Thread(target=infer,
+                          args=(tm_model, i + 1, input_ids, output_seqlen,
                                test_round, que))
        procs.append(proc)
        proc.start()
-    for proc in procs:
-        proc.join()
+
+    try:
+        for proc in procs:
+            proc.join()
+    except Exception:
+        # threading.Thread has no stop(); running workers cannot be
+        # cancelled, so only report the failure through the exit status
+        exit(1)
    _end = time.perf_counter()
    elapsed_time = _end - _start


--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -5,6 +5,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
    && rm -rf /var/lib/apt/lists/*

 RUN python3 -m pip install torch==1.13.1+cu117 torchvision==0.14.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
-RUN python3 -m pip install sentencepiece cmake transformers protobuf==3.20.3
+RUN python3 -m pip install sentencepiece cmake transformers protobuf==3.20.3 pybind11 mmengine

 ENV NCCL_LAUNCH_MODE=GROUP
--- a/examples/python/chat_example.py
+++ b/examples/python/chat_example.py
+import fire
+from lmdeploy import turbomind as tm
+from lmdeploy.model import MODELS
+from transformers import AutoTokenizer
+import random
+
+
+def input_prompt():
+    """Read lines from stdin until an empty line is entered.
+
+    Returns:
+        str: the entered lines joined by newline characters.
+    """
+    print('\ndouble enter to end input >>> ', end='')
+    lines = []
+    while True:
+        line = input()
+        if line == '':  # an empty line terminates the input
+            break
+        lines.append(line)
+    return '\n'.join(lines)
+
+
+def main(model_name, model_path, tokenizer_model_path, session_id: int = 1):
+    """Interactive chat loop on top of the TurboMind python binding.
+
+    Args:
+        model_name: key used to look up the chat template in ``MODELS``.
+        model_path: path handed to ``tm.TurboMind``.
+        tokenizer_model_path: path handed to ``AutoTokenizer``.
+        session_id (int): id of the chat session.
+    """
+    engine = tm.TurboMind(model_path)
+    generator = engine.create_instance()
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_model_path)
+    chat_template = MODELS.get(model_name)()
+
+    nth_round = 1
+    step = 0
+    seed = random.getrandbits(64)
+
+    while True:
+        prompt = input_prompt()
+        if prompt == 'exit':
+            exit(0)
+        # 'end' is accepted but currently triggers no action
+        if prompt != 'end':
+            first_round = nth_round == 1
+            prompt = chat_template.get_prompt(prompt, first_round)
+            input_ids = tokenizer.encode(prompt, add_special_tokens=False)
+            # the random seed is only supplied in the first round
+            round_seed = seed if first_round else None
+            results = generator.stream_infer(session_id=session_id,
+                                             input_ids=[input_ids],
+                                             request_output_len=512,
+                                             sequence_start=first_round,
+                                             sequence_end=False,
+                                             step=step,
+                                             stop=False,
+                                             top_k=40,
+                                             top_p=0.8,
+                                             temperature=0.8,
+                                             repetition_penalty=1.05,
+                                             ignore_eos=False,
+                                             random_seed=round_seed)
+            for outputs in results:
+                res, tokens = outputs[0]
+                # decode only the ids from position `step` onwards
+                response = tokenizer.decode(res[step:],
+                                            skip_special_tokens=True)
+                print(f'session {session_id}, {tokens}, {response}')
+                # update step
+                step = tokens - 1
+
+        nth_round += 1
+
+
+if __name__ == '__main__':
+    fire.Fire(main)
--- a/generate.sh
+++ b/generate.sh
@@ -4,6 +4,7 @@ cmake .. \
    -DCMAKE_BUILD_TYPE=RelWithDebInfo \
    -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
    -DCMAKE_INSTALL_PREFIX=./install \
+    -DBUILD_PY_FFI=ON \
    -DBUILD_MULTI_GPU=ON \
    -DBUILD_CUTLASS_MOE=OFF \
    -DBUILD_CUTLASS_MIXED_GEMM=OFF \

--- a/lmdeploy/turbomind/__init__.py
+++ b/lmdeploy/turbomind/__init__.py
+# Copyright (c) OpenMMLab. All rights reserved.
+from .tokenizer import Tokenizer, Preprocessor, Postprocessor
+
+from .turbomind import TurboMind
\ No newline at end of file
--- a/lmdeploy/turbomind/tokenizer.py
+++ b/lmdeploy/turbomind/tokenizer.py
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+from typing import Sequence, Optional, Union
+from torch.nn.utils.rnn import pad_sequence
+import torch
+
+class Tokenizer:
+    """Tokenizer backed by SentencePiece or a HuggingFace ``AutoTokenizer``.
+
+    SentencePiece is used only when both ``model_file`` and a
+    ``tokenizer_config.json`` next to it exist; otherwise the HuggingFace
+    tokenizer of the model folder is loaded.
+
+    Args:
+        model_file (str): path of a ``*.model`` file, or of the folder that
+            holds the tokenizer files.
+    """
+
+    def __init__(self, model_file: str):
+        is_model_file = model_file.endswith('.model')
+        model_folder = osp.split(model_file)[0] if is_model_file \
+            else model_file
+        tokenizer_config_file = osp.join(model_folder, 'tokenizer_config.json')
+
+        model_file_exists = osp.exists(model_file)
+        config_exists = osp.exists(tokenizer_config_file)
+        self.use_hf_model = not (config_exists and model_file_exists)
+
+        if self.use_hf_model:
+            from transformers import AutoTokenizer
+            backend_tokenizer_file = osp.join(model_folder, 'tokenizer.json')
+
+            def _backend_file_missing():
+                # evaluated on every call, before and after loading
+                return model_file_exists and \
+                    not osp.exists(backend_tokenizer_file)
+
+            if _backend_file_missing():
+                print('WARNING: Can not find tokenizer.json. '
+                      'It may take long time to initialize the tokenizer.')
+            self.model = AutoTokenizer.from_pretrained(model_folder)
+            self.vocab_size = self.model.vocab_size
+            self.bos_token_id = self.model.bos_token_id
+            self.eos_token_id = self.model.eos_token_id
+            # save tokenizer.json to reuse
+            if _backend_file_missing():
+                self.model.backend_tokenizer.save(backend_tokenizer_file)
+        else:
+            from sentencepiece import SentencePieceProcessor
+            self.model = SentencePieceProcessor(model_file=model_file)
+            self.vocab_size = self.model.vocab_size()
+            self.bos_token_id = self.model.bos_id()
+            self.eos_token_id = self.model.eos_id()
+
+    def encode(self, s: str):
+        """Tokenize ``s``.
+
+        The markers ``<BOS>`` (anywhere in ``s``) and ``<EOS>`` (when it is
+        the whole remaining string) are translated for the active backend.
+        """
+        if self.use_hf_model:
+            if '<BOS>' in s:
+                s = s.replace('<BOS>', '<s>')
+            if s == '<EOS>':
+                s = '</s>'
+            # special tokens are only added for an empty string
+            return self.model.encode(s, add_special_tokens=(len(s) == 0))
+
+        add_bos = '<BOS>' in s
+        if add_bos:
+            s = s.replace('<BOS>', '')
+        add_eos = s == '<EOS>'
+        if add_eos:
+            s = ''
+        return self.model.Encode(s, add_bos=add_bos, add_eos=add_eos)
+
+    def decode(self, t: Sequence[int]):
+        """De-tokenize the token ids ``t``; special tokens are kept."""
+        if self.use_hf_model:
+            return self.model.decode(t, skip_special_tokens=False)
+        return self.model.Decode(t)
+
+class Preprocessor:
+    """Turn prompts into a padded batch of token ids.
+
+    Args:
+        tokenizer (Tokenizer): tokenizer used to encode the prompts.
+    """
+
+    def __init__(self, tokenizer: Tokenizer):
+        self.tokenizer = tokenizer
+        self.bos_token_id = tokenizer.bos_token_id
+        self.eos_token_id = tokenizer.eos_token_id
+
+    def __call__(self, *args, **kwargs):
+        return self.infer(*args, **kwargs)
+
+    def infer(self, prompts: Union[str, Sequence[str]]) -> tuple:
+        """Tokenize the input prompts.
+
+        Args:
+            prompts(str | Sequence[str]): user's prompt, or a batch prompts
+
+        Returns:
+            Tuple(torch.Tensor, torch.Tensor): the prompts' token ids,
+            right-padded with ``eos_token_id`` to the longest prompt, and
+            the unpadded length of every prompt as a column of shape
+            (number of prompts, 1)
+        """
+        # a str is itself a Sequence, so it has to be checked first;
+        # wrapping it keeps a single prompt from being split into characters
+        if isinstance(prompts, str):
+            prompts = [prompts]
+        elif not isinstance(prompts, Sequence):
+            assert 0, f'str or Sequence[str] prompts are expected but got ' \
+                      f'{type(prompts)}'
+
+        start_ids = [
+            torch.IntTensor(self.tokenizer.encode(prompt))
+            for prompt in prompts
+        ]
+        start_lengths = torch.IntTensor([[len(ids)] for ids in start_ids])
+        start_ids = pad_sequence(start_ids,
+                                 batch_first=True,
+                                 padding_value=self.eos_token_id)
+        return start_ids, start_lengths
+
+
+class Postprocessor:
+    """Turn batches of generated token ids back into text.
+
+    Args:
+        tokenizer (Tokenizer): tokenizer used to decode the token ids.
+    """
+
+    def __init__(self, tokenizer: Tokenizer):
+        self.tokenizer = tokenizer
+        self.bos_token_id = tokenizer.bos_token_id
+        self.eos_token_id = tokenizer.eos_token_id
+
+    def __call__(self, *args, **kwargs):
+        return self.infer(*args, **kwargs)
+
+    def infer(self, output_ids: torch.Tensor, seqlen: torch.Tensor):
+        """De-tokenize tokens for text.
+
+        Args:
+            output_ids(torch.Tensor): tokens' id, one row per sequence
+            seqlen(torch.Tensor): number of valid tokens of every row
+
+        Returns:
+            list: the decoded text of every row, truncated to its length
+        """
+        return [
+            self.tokenizer.decode(row[:valid_len])
+            for row, valid_len in zip(output_ids, seqlen)
+        ]
\ No newline at end of file
--- a/lmdeploy/turbomind/turbomind.py
+++ b/lmdeploy/turbomind/turbomind.py
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Iterable
+import sys
+import os.path as osp
+import torch
+import numpy as np
+import lmdeploy
+from lmdeploy.model import MODELS
+from .tokenizer import Tokenizer, Preprocessor, Postprocessor
+from torch.nn.utils.rnn import pad_sequence
+
+# TODO: find another way import _turbomind
+lmdeploy_dir = osp.split(lmdeploy.__file__)[0]
+sys.path.append(osp.join(lmdeploy_dir, 'lib'))
+import _turbomind as _tm
+
+
+def _stop_words(stop_words: List[int]):
+    """Pack stop word ids into the array layout passed on to the engine.
+
+    Args:
+        stop_words (List[int]): one token id per stop word, or None.
+
+    Returns:
+        np.ndarray | None: int32 array of shape (1, 2, len(stop_words))
+        holding the ids in the first row and the offsets 1..n in the
+        second one; None when ``stop_words`` is None.
+    """
+    if stop_words is None:
+        return None
+    is_int_list = isinstance(stop_words, List) and \
+        all(isinstance(elem, int) for elem in stop_words)
+    assert is_int_list, \
+        f'stop_words must be a list but got {type(stop_words)}'
+    # each id in stop_words represents a stop word
+    # refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for
+    # detailed explanation about fastertransformer's stop_words
+    offsets = range(1, len(stop_words) + 1)
+    packed = np.array([[stop_words, offsets]])
+    return packed.astype(np.int32)
+
+
+def _np_dict_to_tm_dict(np_dict: dict):
+    """Build a ``_tm.TensorMap`` with every value passed through
+    ``_tm.from_dlpack``."""
+    tensor_map = _tm.TensorMap()
+    for name in np_dict:
+        tensor_map[name] = _tm.from_dlpack(np_dict[name])
+
+    return tensor_map
+
+
+def _tm_dict_to_torch_dict(tm_dict: _tm.TensorMap):
+    """Convert a ``_tm.TensorMap`` into a dict of torch tensors.
+
+    UINT32 tensors are viewed as INT32 before the conversion
+    (presumably because torch cannot import uint32 - TODO confirm).
+    """
+    torch_dict = {}
+    for name, tensor in tm_dict.items():
+        needs_view = tensor.type == _tm.DataType.TYPE_UINT32
+        source = tensor.view(_tm.DataType.TYPE_INT32) if needs_view \
+            else tensor
+        torch_dict[name] = torch.from_dlpack(source)
+
+    return torch_dict
+
+
+class TurboMind:
+    """Python handle of a llama model loaded through ``_turbomind``.
+
+    Args:
+        model_path (str): folder that contains ``triton_models/weights``.
+        data_type (str): data type forwarded to the model factory.
+        session_len (int): session length used by the created instances.
+        eos_id (int): id of the end-of-sequence token.
+        stop_words (List[int]): token ids of the stop words, or None.
+        device_id (int): device index on this node.
+        node_id (int): index of this node.
+        device_num (int): devices per node, used as tensor parallel size.
+        node_num (int): number of nodes.
+    """
+
+    def __init__(self,
+                 model_path: str,
+                 data_type: str = 'fp16',
+                 session_len: int = 2048,
+                 eos_id: int = 2,
+                 stop_words: List[int] = None,
+                 device_id: int = 0,
+                 node_id: int = 0,
+                 device_num: int = 1,
+                 node_num: int = 1):
+        self.eos_id = eos_id
+        self.session_len = session_len
+
+        # topology of the deployment
+        self.node_id, self.node_num = node_id, node_num
+        self.device_id, self.gpu_count = device_id, device_num
+        self.world_size = node_num * device_num
+        self.rank = node_id * device_num + device_id
+
+        # create model instance
+        weight_dir = osp.join(model_path, 'triton_models', 'weights')
+        llama = _tm.AbstractTransformerModel.create_llama_model(
+            weight_dir, tensor_para_size=self.gpu_count, data_type=data_type)
+        llama.create_shared_weights(self.device_id, self.rank)
+        self.model = llama
+        self.stop_words = _stop_words(stop_words)
+
+    def create_instance(self, stream=0):
+        """Create a ``TurboMindInstance`` bound to this model."""
+        return TurboMindInstance(self, stream)
+
+
+class TurboMindInstance:
+    """Inference instance created from a ``TurboMind`` model.
+
+    Args:
+        tm_model (TurboMind): the model this instance belongs to.
+        stream: stream handle forwarded to ``create_model_instance``.
+    """
+
+    def __init__(self, tm_model, stream=0):
+        self.tm_model = tm_model
+
+        self.device_id = tm_model.device_id
+        self.rank = tm_model.rank
+        self.stop_words = tm_model.stop_words
+        self.eos_id = tm_model.eos_id
+        self.session_len = tm_model.session_len
+        self.stream = stream
+
+        # create instance
+        model = tm_model.model
+        nccl_params = model.create_nccl_params(tm_model.node_id)
+        custom_comms = model.create_custom_comms(tm_model.world_size)
+        instance_comm = model.create_instance_comm(tm_model.gpu_count)
+
+        model_inst = model.create_model_instance(self.device_id, self.rank,
+                                                 self.stream, nccl_params,
+                                                 custom_comms[0])
+        self.model_inst = model_inst
+        self.instance_comm = instance_comm
+
+    def stream_infer(self,
+                     session_id,
+                     input_ids,
+                     request_output_len: int = 512,
+                     sequence_start: bool = True,
+                     sequence_end: bool = False,
+                     step=1,
+                     stop=False,
+                     top_p=0.8,
+                     top_k=40,
+                     temperature=0.8,
+                     repetition_penalty=1.05,
+                     ignore_eos=False,
+                     random_seed=None):
+        """Run one forward call and return the generated token ids.
+
+        Args:
+            session_id (int | list): one session id per sequence.
+            input_ids (list): token ids of one sequence, or a list of such
+                lists for a batch. An empty list stands for a single empty
+                sequence.
+            request_output_len (int): number of tokens to generate.
+            sequence_start (bool): sets the START flag of the request.
+            sequence_end (bool): sets the END flag of the request.
+            step: scalar, or one value per sequence.
+            stop (bool): sets the STOP flag of the request.
+            top_p, top_k, temperature, repetition_penalty: sampling
+                parameters, scalar or one value per sequence.
+            ignore_eos (bool): when True the stop words are dropped and the
+                eos token is passed as a bad word instead.
+            random_seed: optional seed, scalar or one value per sequence.
+
+        Returns:
+            list: one single-item list per sequence holding the tuple
+            ``(output_ids[:length], length)``. Despite the name the result
+            is not streamed yet.
+        """
+
+        # an empty request is a batch made of one empty sequence; without
+        # this the `input_ids[0]` probe below raises IndexError
+        if len(input_ids) == 0:
+            input_ids = [[]]
+        if isinstance(input_ids[0], int):
+            input_ids = [input_ids]
+
+        batch_size = len(input_ids)
+
+        def _broadcast_np(data, dtype, shape=(batch_size, )):
+            # per-sequence values are passed through unchanged, scalars are
+            # expanded to one entry per sequence
+            if isinstance(data, Iterable):
+                assert len(data) == batch_size
+                return data
+
+            return np.full(shape, data, dtype=dtype)
+
+        input_ids = [torch.IntTensor(ids) for ids in input_ids]
+        input_lengths = torch.IntTensor([len(ids) for ids in input_ids])
+        # right-pad every sequence with eos to the longest one in the batch
+        input_ids = pad_sequence(
+            input_ids, batch_first=True, padding_value=self.eos_id)
+        input_lengths = input_lengths.detach().cpu().numpy()
+
+        if isinstance(session_id, int):
+            session_id = [session_id]
+        assert len(session_id) == batch_size
+
+        inputs = dict(
+            input_ids=input_ids,
+            input_lengths=input_lengths,
+            request_output_len=np.full(
+                input_lengths.shape, request_output_len, dtype=np.uint32),
+            runtime_top_k=_broadcast_np(top_k, np.uint32),
+            runtime_top_p=_broadcast_np(top_p, np.float32),
+            temperature=_broadcast_np(temperature, np.float32),
+            repetition_penalty=_broadcast_np(repetition_penalty, np.float32),
+            step=_broadcast_np(step, np.int32),
+
+            # session input
+            session_len=self.session_len *
+            np.ones([
+                batch_size,
+            ], dtype=np.uint32),
+            START=_broadcast_np((1 if sequence_start else 0), np.int32),
+            END=_broadcast_np((1 if sequence_end else 0), np.int32),
+            CORRID=np.array(session_id, dtype=np.uint64),
+            STOP=_broadcast_np((1 if stop else 0), np.int32))
+
+        if ignore_eos:
+            stop_words = None
+            bad_words = torch.tensor([[[self.eos_id], [1]]], dtype=torch.int32)
+        else:
+            stop_words = self.stop_words
+            bad_words = None
+
+        if stop_words is not None:
+            inputs['stop_words_list'] = stop_words
+        if bad_words is not None:
+            inputs['bad_words_list'] = bad_words
+
+        if random_seed is not None:
+            inputs['random_seed'] = _broadcast_np(random_seed, np.uint64)
+        tm_inputs = _np_dict_to_tm_dict(inputs)
+        tm_outputs = self.model_inst.forward(tm_inputs, self.instance_comm)
+
+        outputs = _tm_dict_to_torch_dict(tm_outputs)
+
+        # TODO: Add stream output
+        # index 0 of the second axis is taken for every sequence
+        output_ids = outputs['output_ids'][:, 0, :]
+        sequence_length = outputs['sequence_length'].long()[:, 0]
+        return [[(output[:length], length.item())]
+                for output, length in zip(output_ids, sequence_length)]
--- a/src/turbomind/CMakeLists.txt
+++ b/src/turbomind/CMakeLists.txt
@@ -19,4 +19,7 @@ add_subdirectory(models)
 if(BUILD_PYT)
    add_subdirectory(th_op)
 endif()
+if(BUILD_PY_FFI)
+    add_subdirectory(python)
+endif()
 add_subdirectory(triton_backend)
--- a/src/turbomind/python/CMakeLists.txt
+++ b/src/turbomind/python/CMakeLists.txt
+# Copyright (c) OpenMMLab. All rights reserved.
+
+cmake_minimum_required(VERSION 3.8)
+project(_turbomind)
+
+find_package(pybind11 CONFIG)
+if(NOT pybind11_FOUND)
+    # fall back to the cmake dir reported by the pip-installed pybind11
+    execute_process(COMMAND "pybind11-config" "--cmakedir"
+                    RESULT_VARIABLE _COMMAND_SUCCESS
+                    OUTPUT_VARIABLE pybind11_DIR
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+    # fail here with a clear message rather than later on an unknown
+    # pybind11_add_module command
+    find_package(pybind11 CONFIG REQUIRED)
+endif()
+
+pybind11_add_module(${PROJECT_NAME} bind.cpp)
+target_link_libraries(${PROJECT_NAME} PRIVATE TransformerTritonBackend 
+    LlamaTritonBackend custom_ar_comm memory_utils)
+target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_14)
+
+set_target_properties(${PROJECT_NAME} PROPERTIES
+        BUILD_RPATH "\$ORIGIN"
+        INSTALL_RPATH "\$ORIGIN")
--- a/src/turbomind/python/bind.cpp
+++ b/src/turbomind/python/bind.cpp
+#include "src/turbomind/python/dlpack.h"
+#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
+#include "src/turbomind/triton_backend/llama/LlamaTritonModel.h"
+#include <memory>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <pybind11/stl_bind.h>
+
+namespace py = pybind11;
+namespace ft = turbomind;
+using namespace pybind11::literals;
+
+// prepare to bind container
+using TensorVector = std::vector<triton::Tensor>;
+PYBIND11_MAKE_OPAQUE(TensorVector);
+using TensorMap = std::unordered_map<std::string, triton::Tensor>;
+PYBIND11_MAKE_OPAQUE(TensorMap);
+static const char kDlTensorCapsuleName[] = "dltensor";
+
+// Wrap `data` in a shared_ptr.
+// `data` is received by value, so handing out its address with a no-op
+// deleter would return a pointer that dangles as soon as this function
+// returns. The shared_ptr therefore owns a copy of `data` instead.
+template<typename T>
+std::shared_ptr<T> make_shared_nodel(T data)
+{
+    return std::make_shared<T>(data);
+}
+
+// Map the memory location of a triton tensor to a DLPack device.
+// device_id is always 0; device_type is only assigned for the three
+// memory types handled below.
+DLDevice getDLDevice(triton::Tensor& tensor)
+{
+    DLDevice device{.device_id = 0};
+
+    switch (tensor.where) {
+        case triton::MEMORY_CPU:
+            device.device_type = DLDeviceType::kDLCPU;
+            break;
+        case triton::MEMORY_CPU_PINNED:
+            device.device_type = DLDeviceType::kDLCUDAHost;
+            // without this break pinned host memory fell through and was
+            // reported as kDLCUDA
+            break;
+        case triton::MEMORY_GPU:
+            device.device_type = DLDeviceType::kDLCUDA;
+            break;
+        default:
+            break;
+    }
+
+    return device;
+}
+
+// Describe a triton tensor as a DLManagedTensor without copying anything.
+// The result refers to the data and shape storage of `tensor`, has no
+// manager context and a deleter that does nothing.
+std::unique_ptr<DLManagedTensor> TritonTensorToDLManagedTensor(triton::Tensor& tensor)
+{
+    DLDevice device = getDLDevice(tensor);
+
+    DLDataType data_type{.lanes = 1};
+    // sets the type code and the bit width in one go
+    auto set_dtype = [&data_type](DLDataTypeCode code, uint8_t bits) {
+        data_type.code = code;
+        data_type.bits = bits;
+    };
+
+    switch (tensor.type) {
+        case triton::TYPE_BOOL:
+            set_dtype(DLDataTypeCode::kDLBool, 8);
+            break;
+        case triton::TYPE_UINT8:
+            set_dtype(DLDataTypeCode::kDLUInt, 8);
+            break;
+        case triton::TYPE_UINT16:
+            set_dtype(DLDataTypeCode::kDLUInt, 16);
+            break;
+        case triton::TYPE_UINT32:
+            set_dtype(DLDataTypeCode::kDLUInt, 32);
+            break;
+        case triton::TYPE_UINT64:
+            set_dtype(DLDataTypeCode::kDLUInt, 64);
+            break;
+        case triton::TYPE_INT8:
+        case triton::TYPE_BYTES:
+            set_dtype(DLDataTypeCode::kDLInt, 8);
+            break;
+        case triton::TYPE_INT16:
+            set_dtype(DLDataTypeCode::kDLInt, 16);
+            break;
+        case triton::TYPE_INT32:
+            set_dtype(DLDataTypeCode::kDLInt, 32);
+            break;
+        case triton::TYPE_INT64:
+            set_dtype(DLDataTypeCode::kDLInt, 64);
+            break;
+        case triton::TYPE_FP16:
+            set_dtype(DLDataTypeCode::kDLFloat, 16);
+            break;
+        case triton::TYPE_FP32:
+            set_dtype(DLDataTypeCode::kDLFloat, 32);
+            break;
+        case triton::TYPE_FP64:
+            set_dtype(DLDataTypeCode::kDLFloat, 64);
+            break;
+        case triton::TYPE_BF16:
+            set_dtype(DLDataTypeCode::kDLBfloat, 16);
+            break;
+        default:
+            // any other type keeps the code and bits initialised above
+            break;
+    }
+
+    // the shape array of the triton tensor is reused in place
+    auto* shape_ptr = reinterpret_cast<int64_t*>(const_cast<size_t*>(tensor.shape.data()));
+    DLTensor dl_tensor{.data        = const_cast<void*>(tensor.data),
+                       .device      = device,
+                       .ndim        = (int32_t)(tensor.shape.size()),
+                       .dtype       = data_type,
+                       .shape       = shape_ptr,
+                       .strides     = (int64_t*)(nullptr),
+                       .byte_offset = 0};
+
+    auto* managed =
+        new DLManagedTensor{.dl_tensor = dl_tensor, .manager_ctx = nullptr, .deleter = [](DLManagedTensor*) {}};
+    return std::unique_ptr<DLManagedTensor>(managed);
+}
+
+// Map a DLPack device to the triton memory type.
+// Every device type other than CUDA host and CUDA maps to MEMORY_CPU.
+triton::MemoryType getMemoryType(DLDevice device)
+{
+    const auto device_type = device.device_type;
+    if (device_type == DLDeviceType::kDLCUDAHost) {
+        return triton::MemoryType::MEMORY_CPU_PINNED;
+    }
+    if (device_type == DLDeviceType::kDLCUDA) {
+        return triton::MemoryType::MEMORY_GPU;
+    }
+    // kDLCPU and all remaining device types
+    return triton::MemoryType::MEMORY_CPU;
+}
+
+// Map a DLPack data type (code + bit width) to the triton data type.
+// Combinations without a counterpart yield TYPE_INVALID; the number of
+// lanes is not inspected.
+triton::DataType getDataType(DLDataType data_type)
+{
+    const auto bits = data_type.bits;
+
+    switch (data_type.code) {
+        case DLDataTypeCode::kDLUInt:
+            return bits == 8  ? triton::TYPE_UINT8 :
+                   bits == 16 ? triton::TYPE_UINT16 :
+                   bits == 32 ? triton::TYPE_UINT32 :
+                   bits == 64 ? triton::TYPE_UINT64 :
+                                triton::TYPE_INVALID;
+        case DLDataTypeCode::kDLInt:
+            return bits == 8  ? triton::TYPE_INT8 :
+                   bits == 16 ? triton::TYPE_INT16 :
+                   bits == 32 ? triton::TYPE_INT32 :
+                   bits == 64 ? triton::TYPE_INT64 :
+                                triton::TYPE_INVALID;
+        case DLDataTypeCode::kDLFloat:
+            return bits == 16 ? triton::TYPE_FP16 :
+                   bits == 32 ? triton::TYPE_FP32 :
+                   bits == 64 ? triton::TYPE_FP64 :
+                                triton::TYPE_INVALID;
+        case DLDataTypeCode::kDLBfloat:
+            return bits == 16 ? triton::TYPE_BF16 : triton::TYPE_INVALID;
+        case DLDataTypeCode::kDLBool:
+            // the bit width of a bool is not checked
+            return triton::TYPE_BOOL;
+        default:
+            return triton::TYPE_INVALID;
+    }
+}
+
+// Build a triton tensor that refers to the data of a DLPack tensor.
+// The shape is copied, the data pointer is passed on as is; strides and
+// byte_offset of the DLPack tensor are not consulted.
+std::shared_ptr<triton::Tensor> DLManagedTensorToTritonTensor(DLManagedTensor* tensor)
+{
+    auto& src = tensor->dl_tensor;
+
+    const auto memory_type = getMemoryType(src.device);
+    const auto element_type = getDataType(src.dtype);
+    assert(src.ndim > 0);
+
+    const auto* dims_begin = src.shape;
+    const auto* dims_end = dims_begin + src.ndim;
+    std::vector<size_t> dims(dims_begin, dims_end);
+
+    return std::make_shared<triton::Tensor>(memory_type, element_type, dims, src.data);
+}
+
+// Python extension module `_turbomind`. Registers, in order: the communicator
+// handle types, the Triton DataType/MemoryType enums, the Tensor class with its
+// DLPack hooks, the TensorMap container and the transformer model / model
+// instance classes.
+PYBIND11_MODULE(_turbomind, m)
+{
+
+    // NCCL parameters: constructible from Python as NcclParam(rank=0, world_size=1);
+    // str() forwards to ft::NcclParam::toString.
+    py::class_<ft::NcclParam>(m, "NcclParam")
+        .def(py::init<int, int>(), "rank"_a = 0, "world_size"_a = 1)
+        .def("__str__", &ft::NcclParam::toString);
+
+    // Custom communicator: registered with a shared_ptr holder and no methods, so
+    // Python can only pass these objects around (see create_custom_comms below).
+    py::class_<ft::AbstractCustomComm, std::shared_ptr<ft::AbstractCustomComm>>(m, "AbstractCustomComm");
+
+    // Instance communicator: likewise exposed without any methods or constructor.
+    py::class_<ft::AbstractInstanceComm>(m, "AbstractInstanceComm");
+
+    // Element data types; the Python names mirror the triton::DataType enumerators.
+    py::enum_<triton::DataType>(m, "DataType")
+        .value("TYPE_INVALID", triton::DataType::TYPE_INVALID)
+        .value("TYPE_BOOL", triton::DataType::TYPE_BOOL)
+        .value("TYPE_UINT8", triton::DataType::TYPE_UINT8)
+        .value("TYPE_UINT16", triton::DataType::TYPE_UINT16)
+        .value("TYPE_UINT32", triton::DataType::TYPE_UINT32)
+        .value("TYPE_UINT64", triton::DataType::TYPE_UINT64)
+        .value("TYPE_INT8", triton::DataType::TYPE_INT8)
+        .value("TYPE_INT16", triton::DataType::TYPE_INT16)
+        .value("TYPE_INT32", triton::DataType::TYPE_INT32)
+        .value("TYPE_INT64", triton::DataType::TYPE_INT64)
+        .value("TYPE_FP16", triton::DataType::TYPE_FP16)
+        .value("TYPE_FP32", triton::DataType::TYPE_FP32)
+        .value("TYPE_FP64", triton::DataType::TYPE_FP64)
+        .value("TYPE_BYTES", triton::DataType::TYPE_BYTES)
+        .value("TYPE_BF16", triton::DataType::TYPE_BF16);
+
+    // Memory location of a tensor's buffer (host, pinned host or GPU).
+    py::enum_<triton::MemoryType>(m, "MemoryType")
+        .value("MEMORY_CPU", triton::MemoryType::MEMORY_CPU)
+        .value("MEMORY_CPU_PINNED", triton::MemoryType::MEMORY_CPU_PINNED)
+        .value("MEMORY_GPU", triton::MemoryType::MEMORY_GPU);
+
+    // Tensor: held by shared_ptr; `where`, `type`, `shape` and `data` are exposed
+    // as read-only attributes.
+    py::class_<triton::Tensor, std::shared_ptr<triton::Tensor>>(m, "Tensor")
+        .def_readonly("where", &triton::Tensor::where)
+        .def_readonly("type", &triton::Tensor::type)
+        .def_readonly("shape", &triton::Tensor::shape)
+        .def_readonly("data", &triton::Tensor::data)
+        // Tensor(where, type, shape, data): `data` is a raw address passed as an
+        // integer and reinterpreted as a pointer without any validation.
+        // NOTE(review): presumably the Tensor does not take ownership of that
+        // memory - confirm against triton::Tensor.
+        .def(py::init([](const triton::MemoryType   where,
+                         const triton::DataType     type,
+                         const std::vector<size_t>& shape,
+                         const long                 data) {
+            auto data_ptr = reinterpret_cast<void*>(data);
+            return new triton::Tensor(where, type, shape, data_ptr);
+        }))
+        // view(new_type): new Tensor over the same `where`/`shape`/`data` with only
+        // the dtype replaced. No size compatibility check is done here.
+        .def(
+            "view",
+            [](triton::Tensor* self, triton::DataType new_type) {
+                return new triton::Tensor(self->where, new_type, self->shape, self->data);
+            },
+            "new_type"_a)
+        // view(new_shape): new Tensor over the same `where`/`type`/`data` with only
+        // the shape replaced. No element-count check is done here.
+        .def(
+            "view",
+            [](triton::Tensor* self, std::vector<size_t> new_shape) {
+                return new triton::Tensor(self->where, self->type, new_shape, self->data);
+            },
+            "new_shape"_a)
+        // __dlpack__(stream=0): exports the tensor as a DLManagedTensor wrapped in
+        // a capsule named kDlTensorCapsuleName. `stream` is accepted but not used
+        // by this lambda.
+        // NOTE(review): the py::capsule wrapper is allocated with `new` and
+        // returned by pointer; confirm that this does not leak the wrapper
+        // (returning the capsule by value is the usual pybind11 form).
+        .def(
+            "__dlpack__",
+            [](triton::Tensor* self, long stream) {
+                auto tensor_ptr = TritonTensorToDLManagedTensor(*self);
+                // Ownership of the DLManagedTensor moves into the capsule; the
+                // capsule destructor below is responsible for calling its deleter.
+                return new py::capsule(tensor_ptr.release(), kDlTensorCapsuleName, [](PyObject* obj) {
+                    DLManagedTensor* dlmt =
+                        static_cast<DLManagedTensor*>(PyCapsule_GetPointer(obj, kDlTensorCapsuleName));
+                    if (dlmt) {
+                        dlmt->deleter(dlmt);
+                    }
+                    else {
+                        // The tensor has been deleted. Clear any error from
+                        // PyCapsule_GetPointer.
+                        PyErr_Clear();
+                    }
+                });
+            },
+            "stream"_a = 0)
+        // __dlpack_device__(): returns (device_type, device_id) as a tuple of ints,
+        // taken from getDLDevice.
+        .def("__dlpack_device__", [](triton::Tensor* self) {
+            auto device = getDLDevice(*self);
+            return std::tuple<int, int>(int(device.device_type), device.device_id);
+        });
+    // from_dlpack(dl_managed_tensor): calls `__dlpack__()` (no arguments) on the
+    // given object, reads the DLManagedTensor out of the returned capsule and
+    // converts it to a triton Tensor.
+    // NOTE(review): the capsule is neither renamed nor kept alive by the result
+    // here; confirm that the source object must outlive the returned Tensor.
+    m.def(
+        "from_dlpack",
+        [](py::object obj) {
+            py::capsule      cap = obj.attr("__dlpack__")();
+            DLManagedTensor* dlmt =
+                static_cast<DLManagedTensor*>(PyCapsule_GetPointer(cap.ptr(), kDlTensorCapsuleName));
+            auto ret = DLManagedTensorToTritonTensor(dlmt);
+            return ret;
+        },
+        "dl_managed_tensor"_a);
+
+    // TensorMap is exposed as a dict-like container (py::bind_map) held by
+    // shared_ptr, so it can be handed to `forward` below.
+    py::bind_map<TensorMap, std::shared_ptr<TensorMap>>(m, "TensorMap");
+    // Transformer model instance: only `forward(input_tensors, inst_comm=None)` is
+    // exposed. The GIL is released for the duration of the call via
+    // py::call_guard<py::gil_scoped_release>.
+    py::class_<AbstractTransformerModelInstance>(m, "AbstractTransformerModelInstance")
+        .def(
+            "forward",
+            [](AbstractTransformerModelInstance* model, std::shared_ptr<TensorMap> input_tensors, ft::AbstractInstanceComm* inst_comm) {
+                return model->forward(input_tensors, inst_comm);
+            }, py::call_guard<py::gil_scoped_release>(),
+            "input_tensors"_a,
+            "inst_comm"_a = nullptr);
+
+    // Transformer model: factory plus helpers for creating communicators, model
+    // instances and shared weights.
+    py::class_<AbstractTransformerModel, std::shared_ptr<AbstractTransformerModel>>(m, "AbstractTransformerModel")
+        // .def_static("create_llama_model", &AbstractTransformerModel::createLlamaModel, "model_dir"_a)
+        // create_llama_model(model_dir, tensor_para_size=1, pipeline_para_size=1,
+        //                    enable_custom_all_reduce=0, data_type="half"):
+        // "half" or "fp16" builds LlamaTritonModel<half>; every other string
+        // (including unrecognised ones) falls through to LlamaTritonModel<float>.
+        .def_static("create_llama_model", [](std::string model_dir,
+                                             size_t      tensor_para_size,
+                                             size_t      pipeline_para_size,
+                                             int         enable_custom_all_reduce,
+                                             std::string data_type) -> std::shared_ptr<AbstractTransformerModel> {
+
+            if (data_type == "half" || data_type == "fp16") {
+                return std::make_shared<LlamaTritonModel<half>>(tensor_para_size,
+                                                                pipeline_para_size,
+                                                                enable_custom_all_reduce,
+                                                                model_dir);
+            }else {
+                return std::make_shared<LlamaTritonModel<float>>(tensor_para_size,
+                                                                pipeline_para_size,
+                                                                enable_custom_all_reduce,
+                                                                model_dir);
+            }
+        }, "model_dir"_a,
+            "tensor_para_size"_a=1,
+            "pipeline_para_size"_a=1,
+            "enable_custom_all_reduce"_a=0,
+            "data_type"_a="half")
+        // create_nccl_params(node_id, device_id_start=0, multi_node=False)
+        .def("create_nccl_params",
+             &AbstractTransformerModel::createNcclParams,
+             "node_id"_a,
+             "device_id_start"_a = 0,
+             "multi_node"_a      = false)
+        // create_custom_comms(world_size): adapts the C++ out-parameter API so the
+        // communicators are returned to Python as a list.
+        .def(
+            "create_custom_comms",
+            [](std::shared_ptr<AbstractTransformerModel>& model, int world_size) {
+                std::vector<std::shared_ptr<ft::AbstractCustomComm>> ret;
+                model->createCustomComms(&ret, world_size);
+                return ret;
+            },
+            "world_size"_a)
+        .def("create_instance_comm", &AbstractTransformerModel::createInstanceComm, "size"_a)
+        // create_model_instance(device_id, rank, stream, nccl_params,
+        //                       custom_all_reduce_comm=None): `stream` is an integer
+        // handle that is reinterpreted as a cudaStream_t without validation.
+        .def(
+            "create_model_instance",
+            [](std::shared_ptr<AbstractTransformerModel>&                        model,
+               int                                                               deviceId,
+               int                                                               rank,
+               long                                                              stream_id,
+               std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params,
+               std::shared_ptr<ft::AbstractCustomComm>                           custom_all_reduce_comm = nullptr) {
+                cudaStream_t stream = reinterpret_cast<cudaStream_t>(stream_id);
+                return model->createModelInstance(deviceId, rank, stream, nccl_params, custom_all_reduce_comm);
+            },
+            "device_id"_a,
+            "rank"_a,
+            "stream"_a,
+            "nccl_params"_a,
+            "custom_all_reduce_comm"_a = nullptr)
+        .def("create_shared_weights", &AbstractTransformerModel::createSharedWeights, "device_id"_a, "rank"_a)
+        // str() and repr() both forward to AbstractTransformerModel::toString.
+        .def("__str__", &AbstractTransformerModel::toString)
+        .def("__repr__", &AbstractTransformerModel::toString)
+        .def("get_tensor_para_size", &AbstractTransformerModel::getTensorParaSize)
+        .def("get_pipeline_para_size", &AbstractTransformerModel::getPipelineParaSize);
+}
\ No newline at end of file
--- a/src/turbomind/python/dlpack.h
+++ b/src/turbomind/python/dlpack.h
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file dlpack.h
+ * \brief The common header of DLPack.
+ */
+#ifndef DLPACK_DLPACK_H_
+#define DLPACK_DLPACK_H_
+
+/**
+ * \brief Compatibility with C++
+ */
+#ifdef __cplusplus
+#define DLPACK_EXTERN_C extern "C"
+#else
+#define DLPACK_EXTERN_C
+#endif
+
+/*! \brief The current major version of dlpack */
+#define DLPACK_MAJOR_VERSION 1
+
+/*! \brief The current minor version of dlpack */
+#define DLPACK_MINOR_VERSION 0
+
+/*! \brief DLPACK_DLL prefix for windows */
+#ifdef _WIN32
+#ifdef DLPACK_EXPORTS
+#define DLPACK_DLL __declspec(dllexport)
+#else
+#define DLPACK_DLL __declspec(dllimport)
+#endif
+#else
+#define DLPACK_DLL
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!
+ * \brief The DLPack version.
+ *
+ * A change in major version indicates that we have changed the
+ * data layout of the ABI - DLManagedTensorVersioned.
+ *
+ * A change in minor version indicates that we have added new
+ * code, such as a new device type, but the ABI is kept the same.
+ *
+ * If an obtained DLPack tensor has a major version that disagrees
+ * with the version number specified in this header file
+ * (i.e. major != DLPACK_MAJOR_VERSION), the consumer must call the deleter
+ * (and it is safe to do so). It is not safe to access any other fields
+ * as the memory layout will have changed.
+ *
+ * In the case of a minor version mismatch, the tensor can be safely used as
+ * long as the consumer knows how to interpret all fields. Minor version
+ * updates indicate the addition of enumeration values.
+ *
+ * The version implemented by this header is given by the
+ * DLPACK_MAJOR_VERSION and DLPACK_MINOR_VERSION macros defined above.
+ */
+typedef struct {
+    /*! \brief DLPack major version. */
+    uint32_t major;
+    /*! \brief DLPack minor version. */
+    uint32_t minor;
+} DLPackVersion;
+
+/*!
+ * \brief The device type in DLDevice.
+ *
+ * When compiled as C++ the enum is given a fixed int32_t underlying type;
+ * plain C uses an ordinary enum.
+ */
+#ifdef __cplusplus
+typedef enum: int32_t {
+#else
+typedef enum {
+#endif
+    /*! \brief CPU device */
+    kDLCPU = 1,
+    /*! \brief CUDA GPU device */
+    kDLCUDA = 2,
+    /*!
+     * \brief Pinned CUDA CPU memory by cudaMallocHost
+     */
+    kDLCUDAHost = 3,
+    /*! \brief OpenCL devices. */
+    kDLOpenCL = 4,
+    /*! \brief Vulkan buffer for next generation graphics. */
+    kDLVulkan = 7,
+    /*! \brief Metal for Apple GPU. */
+    kDLMetal = 8,
+    /*! \brief Verilog simulator buffer */
+    kDLVPI = 9,
+    /*! \brief ROCm GPUs for AMD GPUs */
+    kDLROCM = 10,
+    /*!
+     * \brief Pinned ROCm CPU memory allocated by hipMallocHost
+     */
+    kDLROCMHost = 11,
+    /*!
+     * \brief Reserved extension device type,
+     * used to quickly test an extension device.
+     * The semantics can differ depending on the implementation.
+     */
+    kDLExtDev = 12,
+    /*!
+     * \brief CUDA managed/unified memory allocated by cudaMallocManaged
+     */
+    kDLCUDAManaged = 13,
+    /*!
+     * \brief Unified shared memory allocated on a oneAPI non-partitioned
+     * device. Call to oneAPI runtime is required to determine the device
+     * type, the USM allocation type and the sycl context it is bound to.
+     *
+     */
+    kDLOneAPI = 14,
+    /*! \brief GPU support for next generation WebGPU standard. */
+    kDLWebGPU = 15,
+    /*! \brief Qualcomm Hexagon DSP */
+    kDLHexagon = 16,
+} DLDeviceType;
+
+/*!
+ * \brief A device (type plus index) on which a tensor or operator lives.
+ */
+typedef struct {
+    /*! \brief The device type used in the device. */
+    DLDeviceType device_type;
+    /*!
+     * \brief The device index.
+     * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0.
+     */
+    int32_t device_id;
+} DLDevice;
+
+/*!
+ * \brief The type code options for the `code` field of DLDataType.
+ */
+typedef enum {
+    /*! \brief signed integer */
+    kDLInt = 0U,
+    /*! \brief unsigned integer */
+    kDLUInt = 1U,
+    /*! \brief IEEE floating point */
+    kDLFloat = 2U,
+    /*!
+     * \brief Opaque handle type, reserved for testing purposes.
+     * Frameworks need to agree on the handle data type for the exchange to be well-defined.
+     */
+    kDLOpaqueHandle = 3U,
+    /*! \brief bfloat16 */
+    kDLBfloat = 4U,
+    /*!
+     * \brief complex number
+     * (C/C++/Python layout: compact struct per complex number)
+     */
+    kDLComplex = 5U,
+    /*! \brief boolean */
+    kDLBool = 6U,
+} DLDataTypeCode;
+
+/*!
+ * \brief The data type the tensor can hold. The data type is assumed to follow the
+ * native endianness. An explicit error message should be raised when attempting to
+ * export an array with non-native endianness.
+ *
+ *  Examples
+ *   - float: type_code = 2, bits = 32, lanes = 1
+ *   - float4(vectorized 4 float): type_code = 2, bits = 32, lanes = 4
+ *   - int8: type_code = 0, bits = 8, lanes = 1
+ *   - std::complex<float>: type_code = 5, bits = 64, lanes = 1
+ *   - bool: type_code = 6, bits = 8, lanes = 1 (as per common array library convention, the underlying storage size of
+ * bool is 8 bits)
+ */
+typedef struct {
+    /*!
+     * \brief Type code of base types.
+     * We keep it uint8_t instead of DLDataTypeCode for minimal memory
+     * footprint, but the value should be one of the DLDataTypeCode enum values.
+     */
+    uint8_t code;
+    /*!
+     * \brief Number of bits, common choices are 8, 16, 32.
+     */
+    uint8_t bits;
+    /*! \brief Number of lanes in the type, used for vector types. */
+    uint16_t lanes;
+} DLDataType;
+
+/*!
+ * \brief Plain C Tensor object, does not manage memory.
+ */
+typedef struct {
+    /*!
+     * \brief The data pointer points to the allocated data. This will be CUDA
+     * device pointer or cl_mem handle in OpenCL. It may be opaque on some device
+     * types. This pointer is always aligned to 256 bytes as in CUDA. The
+     * `byte_offset` field should be used to point to the beginning of the data.
+     *
+     * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow,
+     * TVM, perhaps others) do not adhere to this 256 byte alignment requirement
+     * on CPU/CUDA/ROCm, and always use `byte_offset=0`.  This must be fixed
+     * (after which this note will be updated); at the moment it is recommended
+     * to not rely on the data pointer being correctly aligned.
+     *
+     * For given DLTensor, the size of memory required to store the contents of
+     * data is calculated as follows:
+     *
+     * \code{.c}
+     * static inline size_t GetDataSize(const DLTensor* t) {
+     *   size_t size = 1;
+     *   for (tvm_index_t i = 0; i < t->ndim; ++i) {
+     *     size *= t->shape[i];
+     *   }
+     *   size *= (t->dtype.bits * t->dtype.lanes + 7) / 8;
+     *   return size;
+     * }
+     * \endcode
+     */
+    void* data;
+    /*! \brief The device of the tensor */
+    DLDevice device;
+    /*! \brief Number of dimensions */
+    int32_t ndim;
+    /*! \brief The data type of the pointer */
+    DLDataType dtype;
+    /*! \brief The shape of the tensor */
+    int64_t* shape;
+    /*!
+     * \brief strides of the tensor (in number of elements, not bytes)
+     *  can be NULL, indicating tensor is compact and row-majored.
+     */
+    int64_t* strides;
+    /*! \brief The offset in bytes to the beginning pointer to data */
+    uint64_t byte_offset;
+} DLTensor;
+
+/*!
+ * \brief C Tensor object, manage memory of DLTensor. This data structure is
+ *  intended to facilitate the borrowing of DLTensor by another framework. It is
+ *  not meant to transfer the tensor. When the borrowing framework doesn't need
+ *  the tensor, it should call the deleter to notify the host that the resource
+ *  is no longer needed.
+ *
+ * \note This data structure is used as Legacy DLManagedTensor
+ *       in DLPack exchange and is deprecated after DLPack v0.8
+ *       Use DLManagedTensorVersioned instead.
+ *       This data structure may get renamed or deleted in future versions.
+ *
+ * \sa DLManagedTensorVersioned
+ */
+typedef struct DLManagedTensor {
+    /*! \brief DLTensor which is being memory managed */
+    DLTensor dl_tensor;
+    /*! \brief the context of the original host framework in which the
+     *   DLManagedTensor is used. It can also be NULL.
+     */
+    void* manager_ctx;
+    /*!
+     * \brief Destructor - this should be called
+     * to destruct the manager_ctx  which backs the DLManagedTensor. It can be
+     * NULL if there is no way for the caller to provide a reasonable destructor.
+     * The destructor deletes the argument self as well.
+     */
+    void (*deleter)(struct DLManagedTensor* self);
+} DLManagedTensor;
+
+// bit masks used in the DLManagedTensorVersioned
+
+/*! \brief bit mask to indicate that the tensor is read only. */
+#define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL)
+
+/*!
+ * \brief A versioned and managed C Tensor object, manage memory of DLTensor.
+ *
+ * This data structure is intended to facilitate the borrowing of DLTensor by
+ * another framework. It is not meant to transfer the tensor. When the borrowing
+ * framework doesn't need the tensor, it should call the deleter to notify the
+ * host that the resource is no longer needed.
+ *
+ * \note This is the current standard DLPack exchange data structure.
+ */
+struct DLManagedTensorVersioned {
+    /*!
+     * \brief The API and ABI version of the current managed Tensor
+     */
+    DLPackVersion version;
+    /*!
+     * \brief the context of the original host framework.
+     *
+     * Stores the context in which the DLManagedTensorVersioned is used in the
+     * framework. It can also be NULL.
+     */
+    void* manager_ctx;
+    /*!
+     * \brief Destructor.
+     *
+     * This should be called to destruct manager_ctx which holds the DLManagedTensorVersioned.
+     * It can be NULL if there is no way for the caller to provide a reasonable
+     * destructor. The destructor deletes the argument self as well.
+     */
+    void (*deleter)(struct DLManagedTensorVersioned* self);
+    /*!
+     * \brief Additional bitmask flags information about the tensor.
+     *
+     * By default the flags should be set to 0.
+     *
+     * \note Future ABI changes should keep everything until this field
+     *       stable, to ensure that deleter can be correctly called.
+     *
+     * \sa DLPACK_FLAG_BITMASK_READ_ONLY
+     */
+    uint64_t flags;
+    /*! \brief DLTensor which is being memory managed */
+    DLTensor dl_tensor;
+};
+
+#ifdef __cplusplus
+}  // DLPACK_EXTERN_C
+#endif
+#endif  // DLPACK_DLPACK_H_
\ No newline at end of file
--- a/tests/python/test_tokenizer.py
+++ b/tests/python/test_tokenizer.py
+from lmdeploy.turbomind.tokenizer import Tokenizer, Preprocessor, Postprocessor
+
+def main():
+    tokenizer = Tokenizer('huggyllama/llama-7b')
+    preprocessor = Preprocessor(tokenizer)
+    postprocessor = Postprocessor(tokenizer)
+
+    prompts = ['cest la vie', '上帝已死']
+    tokens = preprocessor(prompts)
+    print(tokens)
+
+    decode_prompts = postprocessor(*tokens)
+    print(decode_prompts)
+
+if __name__ == '__main__':
+    main()