Commit d7117b95 authored by zhouxiang

Sync 0.2.6 code

parent 5f83e392
......@@ -17,6 +17,13 @@ target_link_libraries(${PROJECT_NAME} PRIVATE TransformerTritonBackend
LlamaTritonBackend)
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_14)
set(_INSTALL_CUDA_RPATH
"\$ORIGIN"
"\$ORIGIN/../../nvidia/nccl/lib/"
"\$ORIGIN/../../nvidia/cuda_runtime/lib/"
"\$ORIGIN/../../nvidia/cublas/lib/"
)
set_target_properties(${PROJECT_NAME} PROPERTIES
BUILD_RPATH "\$ORIGIN"
INSTALL_RPATH "\$ORIGIN;\$ORIGIN/../../nvidia/nccl/lib/")
BUILD_RPATH "\$ORIGIN"
INSTALL_RPATH "${_INSTALL_CUDA_RPATH}"
)
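
The hunk above replaces the single hard-coded nccl entry with the _INSTALL_CUDA_RPATH list, so the installed extension can also resolve the cuda_runtime and cublas pip wheels through $ORIGIN-relative paths. One way to check what actually got baked into the installed library is to dump its dynamic section; a minimal sketch follows (the .so path is an assumption, point it at your own build artifact):

# Sketch: print the RPATH/RUNPATH entries of the installed extension.
import subprocess

lib = 'lmdeploy/lib/_turbomind.cpython-310-x86_64-linux-gnu.so'  # assumed path
dyn = subprocess.run(['readelf', '-d', lib], capture_output=True, text=True)
for line in dyn.stdout.splitlines():
    if 'RPATH' in line or 'RUNPATH' in line:
        # expect $ORIGIN plus ../../nvidia/{nccl,cuda_runtime,cublas}/lib/
        print(line.strip())
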
......@@ -47,7 +47,6 @@ std::shared_ptr<AbstractTransformerModel> AbstractTransformerModel::createLlamaM
reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0),
model_dir);
}
#ifdef ENABLE_BF16
else if (data_type == "bf16") {
#ifdef ENABLE_BF16
return std::make_shared<LlamaTritonModel<__nv_bfloat16>>(
......@@ -60,7 +59,6 @@ std::shared_ptr<AbstractTransformerModel> AbstractTransformerModel::createLlamaM
ft::FT_CHECK(false);
#endif
}
#endif
else {
return std::make_shared<LlamaTritonModel<float>>(
reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"),
......@@ -146,7 +144,17 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
enable_custom_all_reduce_(enable_custom_all_reduce)
{
INIReader reader;
FT_CHECK_WITH_INFO((config.empty() ^ model_dir.empty()), "invalid init options");
FT_CHECK_WITH_INFO(!(config.empty() && model_dir.empty()), "invalid init options");
if (!model_dir.empty()) {
model_dir_ = model_dir;
const std::string inifile{model_dir + "/config.ini"};
reader = INIReader(inifile);
if (reader.ParseError() < 0) {
TM_LOG_ERROR("[ERROR] Can't load %s", inifile.c_str());
ft::FT_CHECK(false);
}
}
if (!config.empty()) {
std::FILE* tmpf = std::tmpfile();
......@@ -159,16 +167,6 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
}
}
if (!model_dir.empty()) {
model_dir_ = model_dir;
const std::string inifile{model_dir + "/config.ini"};
reader = INIReader(inifile);
if (reader.ParseError() < 0) {
TM_LOG_ERROR("[ERROR] Can't load %s", inifile.c_str());
ft::FT_CHECK(false);
}
}
model_name_ = reader.Get("llama", "model_name");
head_num_ = reader.GetInteger("llama", "head_num");
kv_head_num_ = reader.GetInteger("llama", "kv_head_num", 0);
......
......@@ -67,7 +67,8 @@ add_library(mpi_utils STATIC mpi_utils.cc)
#set_property(TARGET mpi_utils PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET mpi_utils PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
if (BUILD_MULTI_GPU)
target_link_libraries(mpi_utils PUBLIC mpi logger)
#target_link_libraries(mpi_utils PUBLIC mpi logger)
target_link_libraries(mpi_utils PUBLIC ${MPI_CXX_LIBRARIES} logger)
endif()
add_library(nccl_utils STATIC nccl_utils.cc)
......
......@@ -386,6 +386,7 @@ protected:
T* d_bias_;
int* d_output_ids_;
int* d_input_lengths_;
int* d_penalty_workspace_;
float* d_repetition_penalties_;
......@@ -410,6 +411,8 @@ protected:
d_bias_ = reinterpret_cast<T*>(allocator->malloc(sizeof(T) * vocab_size_padded_));
d_output_ids_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * sequence_length_ * batch_size_));
d_input_lengths_ = reinterpret_cast<int*>(allocator->malloc(sizeof(int) * batch_size_));
d_penalty_workspace_ =
reinterpret_cast<int*>(allocator->malloc((sizeof(int) + sizeof(float)) * batch_size_ * step_));
cudaAutoCpy(d_logits_, h_logits_, batch_size_ * vocab_size_padded_, stream);
cudaAutoCpy(d_bias_, h_bias_, vocab_size_padded_, stream);
......@@ -501,6 +504,7 @@ public:
else {
invokeBatchApplyRepetitionPenalty(d_logits_,
d_repetition_penalties_,
d_penalty_workspace_,
d_output_ids_,
batch_size_,
batch_size_,
......@@ -559,6 +563,7 @@ public:
cudaAutoCpy(d_logits_batch, h_logits_, batch_size_ * vocab_size_padded_, stream);
invokeBatchApplyRepetitionPenalty(d_logits_batch,
d_repetition_penalties_,
d_penalty_workspace_,
d_output_ids_,
batch_size_,
batch_size_,
......
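
The new d_penalty_workspace_ above gives invokeBatchApplyRepetitionPenalty (sizeof(int) + sizeof(float)) * batch * step bytes of scratch, presumably to stage generated token ids and their logits while the penalty is applied. For reference, here is a minimal NumPy sketch of the CTRL-style repetition penalty the kernel computes; it illustrates the math only and is not the CUDA code:

import numpy as np

def apply_repetition_penalty(logits, penalties, output_ids, step):
    """logits: [batch, vocab]; penalties: [batch]; output_ids: [step, batch]."""
    out = logits.copy()
    for b in range(out.shape[0]):
        # penalize every previously generated id exactly once
        for tok in {int(output_ids[t, b]) for t in range(step)}:
            val = out[b, tok]
            out[b, tok] = val / penalties[b] if val > 0 else val * penalties[b]
    return out
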
import os
import numpy as np
import torch
from transformers import AutoTokenizer
from lmdeploy.pytorch.decode import Engine, decode_single
from lmdeploy.pytorch.model import accel_model, init_model
def _test_decode_dist(model_path, prompt):
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'right'
inputs = tokenizer(prompt)
input_ids = inputs.input_ids
engine = Engine(model_path, tokenizer=tokenizer)
probs = engine.decode(input_ids, sort=False, max_bs=1, pad=True)
return probs
def _test_decode_single(model_path, prompt):
model, tokenizer = init_model(model_path)
model = accel_model(model)
model = model.eval()
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'right'
inputs = tokenizer(prompt, return_tensors='pt', padding=True)
input_ids = inputs.input_ids.cuda()
attention_mask = inputs.attention_mask.cuda()
probs: torch.Tensor = decode_single(model, input_ids, attention_mask)
return probs.numpy()
def test_compare(output_outliers=True):
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'
os.environ['MKL_THREADING_LAYER'] = 'GNU'
# https://github.com/pytorch/pytorch/issues/37377#issuecomment-629529611
model_path = 'llama2/huggingface/llama-2-7b'
prompts = [
'I believe the meaning of life is to find your gift. The purpose of life is to give it away.', # noqa: E501
'Simply put, the theory of relativity states that ',
'Building a website can be done in 10 simple steps:'
]
p_single = _test_decode_single(model_path, prompts)
p_dist = _test_decode_dist(model_path, prompts)
rtol = 2.0e-2
atol = 2.0e-2
if output_outliers:
np.set_printoptions(linewidth=150, edgeitems=5)
failed = (abs(p_dist - p_single) > atol + rtol * abs(p_single))
idx = failed.nonzero()
print(f'Num outliers: {len(idx[0])}')
print(p_dist[idx])
print(p_single[idx])
assert np.allclose(p_dist, p_single, rtol=rtol, atol=atol)
import unittest
import torch
from lmdeploy.pytorch.dist import (get_rank, master_only,
master_only_and_broadcast_general,
master_only_and_broadcast_tensor)
class SimpleTest(unittest.TestCase):
@master_only
def fake_input(self):
print(f'Evaluate fake input 1 on {get_rank()}')
return 'master only or none'
@master_only_and_broadcast_general
def fake_input21(self):
print(f'Evaluate fake input 21 (str) on {get_rank()}')
return 'master only and_broadcast'
@master_only_and_broadcast_general
def fake_input22(self):
print(f'Evaluate fake input 22 (cpu tensor) on {get_rank()}')
return torch.tensor([6, 66, 666])
@master_only_and_broadcast_tensor
def fake_input3(self):
print(f'Evaluate fake input 3 (gpu tensor) on {get_rank()}')
return torch.tensor([6, 66, 666]).cuda()
def test(self):
torch.distributed.init_process_group(backend='nccl')
rank = get_rank()
# unittest discards --local_rank, so set the device manually
torch.cuda.set_device(rank)
in1 = self.fake_input()
in21 = self.fake_input21()
in22 = self.fake_input22()
in3 = self.fake_input3(dtype=torch.long, size=(1, 3))
if rank == 0:
self.assertEqual(in1, 'master only or none')
else:
self.assertEqual(in1, None)
self.assertEqual(in21, 'master only and_broadcast')
self.assertTrue(torch.allclose(in22, torch.tensor([6, 66, 666])))
self.assertFalse(torch.allclose(in3.cpu(), torch.tensor([6, 6, 666])))
self.assertTrue(torch.allclose(in3.cpu(), torch.tensor([6, 66, 666])))
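
The decorators exercised here follow a master-compute-then-broadcast pattern: only rank 0 evaluates the wrapped function and the result is broadcast to the other ranks, which is why fake_input3 is called with dtype and size so that non-master ranks can pre-allocate a receive buffer. A rough sketch of that pattern, given only as an assumption for illustration and not lmdeploy's actual implementation:

import functools

import torch
import torch.distributed as dist

def master_only_and_broadcast_tensor(func):
    """Evaluate on rank 0 only, then broadcast the CUDA tensor to every rank."""

    @functools.wraps(func)
    def wrapper(*args, dtype, size, **kwargs):
        if dist.get_rank() == 0:
            result = func(*args, **kwargs)              # returns a CUDA tensor
        else:
            result = torch.empty(size, dtype=dtype, device='cuda')
        dist.broadcast(result, src=0)                   # everyone gets rank 0's value
        return result

    return wrapper
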
from lmdeploy.pytorch.model import accel_model, init_model
def test_init_model():
cprint = lambda x: print(f'\033[92m{x}\033[0m') # noqa: E731
# Test llama2-7b
for model_path in ['llama2/huggingface/llama-2-7b', 'internlm-7b']:
model, tokenizer = init_model(model_path)
assert tokenizer.is_fast
cprint('llama2 on CPU')
print(model)
model1 = accel_model(model)
cprint('llama2 on GPU')
print(model1)
cprint('llama2 with kernel injection')
model2 = accel_model(model, accel='deepspeed')
assert 'DeepSpeedSelfAttention' in repr(model2)
assert 'DeepSpeedMLP' in repr(model2)
from lmdeploy.pytorch.utils import BasicStreamer, TerminalIO
def test_terminal_io(monkeypatch):
import io
tio = TerminalIO()
inputs = 'hello\n\n'
# inputs = 'hello\n\n\x1B[A\n\n'
monkeypatch.setattr('sys.stdin', io.StringIO(inputs))
string = tio.input()
tio.output(string)
assert string == 'hello'
# string = tio.input()
# tio.output(string)
# assert string == 'hello'
def test_basic_streamer():
output = []
def decode_func(value):
return value + 10
def output_func(value):
output.append(value)
streamer = BasicStreamer(decode_func, output_func)
for i in range(10):
streamer.put(i)
if i == 5:
streamer.end()
streamer.end()
assert output == [11, 12, 13, 14, 15, '\n', 17, 18, 19, '\n']
output.clear()
streamer = BasicStreamer(decode_func, output_func, skip_prompt=False)
for i in range(10):
streamer.put(i)
if i == 5:
streamer.end()
streamer.end()
assert output == [10, 11, 12, 13, 14, 15, '\n', 16, 17, 18, 19, '\n']
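
The assertions above pin down the streamer contract: with skip_prompt=True the first value after construction or after end() is swallowed, every other value is decoded and forwarded, and end() emits a newline and re-arms the prompt skip. A stand-in that satisfies exactly these assertions (a sketch, not the real BasicStreamer) could look like this:

class SketchStreamer:
    """Minimal stand-in matching the behaviour asserted in test_basic_streamer."""

    def __init__(self, decode_func, output_func, skip_prompt=True):
        self.decode = decode_func
        self.output = output_func
        self.skip_prompt = skip_prompt
        self._awaiting_prompt = skip_prompt

    def put(self, value):
        if self._awaiting_prompt:
            self._awaiting_prompt = False   # drop the prompt token
            return
        self.output(self.decode(value))

    def end(self):
        self.output('\n')                   # sequence finished
        self._awaiting_prompt = self.skip_prompt
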
import inspect
def compare_func(class_method, function):
"""Compare if a class method has same arguments as a function."""
argspec_cls = inspect.getfullargspec(class_method)
argspec_func = inspect.getfullargspec(function)
assert argspec_cls.args[1:] == argspec_func.args
assert argspec_cls.defaults == argspec_func.defaults
assert argspec_cls.annotations == argspec_func.annotations
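
The args[1:] slice drops the implicit self that getfullargspec reports for methods; a tiny illustration with hypothetical Demo/convert names:

import inspect

class Demo:
    def convert(self, path: str, dst: str = 'workspace'):
        ...

def convert(path: str, dst: str = 'workspace'):
    ...

assert inspect.getfullargspec(Demo.convert).args == ['self', 'path', 'dst']
assert inspect.getfullargspec(convert).args == ['path', 'dst']
assert inspect.getfullargspec(Demo.convert).defaults == ('workspace',)
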
def test_cli():
from lmdeploy.cli.cli import CLI
from lmdeploy.serve.turbomind.deploy import main as convert
compare_func(CLI.convert, convert)
def test_subcli_chat():
from lmdeploy.cli.chat import SubCliChat
from lmdeploy.pytorch.chat import main as run_torch_model
from lmdeploy.turbomind.chat import main as run_turbomind_model
compare_func(SubCliChat.torch, run_torch_model)
compare_func(SubCliChat.turbomind, run_turbomind_model)
def test_subcli_lite():
from lmdeploy.cli.lite import SubCliLite
from lmdeploy.lite.apis.auto_awq import auto_awq
from lmdeploy.lite.apis.calibrate import calibrate
from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams
compare_func(SubCliLite.auto_awq, auto_awq)
compare_func(SubCliLite.calibrate, calibrate)
compare_func(SubCliLite.kv_qparams, run_kv_qparams)
def test_subcli_serve():
from lmdeploy.cli.serve import SubCliServe
from lmdeploy.serve.client import main as run_triton_client
from lmdeploy.serve.gradio.app import run as run_gradio
from lmdeploy.serve.openai.api_client import main as run_api_client
from lmdeploy.serve.openai.api_server import serve as run_api_server
compare_func(SubCliServe.gradio, run_gradio)
compare_func(SubCliServe.api_server, run_api_server)
compare_func(SubCliServe.api_client, run_api_client)
compare_func(SubCliServe.triton_client, run_triton_client)
import pytest
from lmdeploy.model import MODELS, SamplingParam
from lmdeploy.model import MODELS, best_match_model
@pytest.mark.parametrize(
'model_path_and_name',
[('internlm/internlm-chat-7b', ['internlm']),
('internlm/internlm2-1_8b', ['base']),
('models--internlm--internlm-chat-7b/snapshots/1234567', ['internlm']),
('Qwen/Qwen-7B-Chat', ['qwen']),
('codellama/CodeLlama-7b-hf', ['codellama']),
('upstage/SOLAR-0-70b', ['solar', 'solar-70b']),
('meta-llama/Llama-2-7b-chat-hf', ['llama2']),
('THUDM/chatglm2-6b', ['chatglm']),
('01-ai/Yi-6B-200k', ['yi', 'yi-200k']), ('01-ai/Yi-34B-Chat', ['yi']),
('01-ai/Yi-6B-Chat', ['yi', 'yi-chat']),
('WizardLM/WizardLM-70B-V1.0', ['wizardlm']),
('codellama/CodeLlama-34b-Instruct-hf', ['codellama']),
('tiiuae/falcon-7b', ['falcon']), ('workspace', [None])])
@pytest.mark.parametrize('suffix', ['', '-w4', '-4bit', '-16bit'])
def test_best_match_model(model_path_and_name, suffix):
if model_path_and_name[0] == 'internlm/internlm2-1_8b' and suffix:
return  # internlm/internlm2-1_8b with any suffix would be deduced as None
deduced_name = best_match_model(model_path_and_name[0] + suffix)
if deduced_name is not None:
assert deduced_name in model_path_and_name[
1], f'expect {model_path_and_name[1]}, but got {deduced_name}'
else:
assert deduced_name in model_path_and_name[
1], f'expect {model_path_and_name[1]}, but got {deduced_name}'
@pytest.mark.parametrize('model_name',
['llama2', 'base', 'yi', 'qwen-7b', 'vicuna'])
@pytest.mark.parametrize('meta_instruction', ['[fake meta_instruction]'])
def test_model_config(model_name, meta_instruction):
from lmdeploy.model import ChatTemplateConfig
chat_template = ChatTemplateConfig(
model_name, meta_instruction=meta_instruction).chat_template
prompt = chat_template.get_prompt('')
if model_name == 'base':
assert prompt == ''
else:
assert meta_instruction in prompt
def test_base_model():
......@@ -21,8 +63,6 @@ def test_vicuna():
model = MODELS.get('vicuna')(capability='completion')
assert model.get_prompt(prompt, sequence_start=True) == prompt
assert model.get_prompt(prompt, sequence_start=False) == prompt
assert model.stop_words is None
assert model.system is not None
model = MODELS.get('vicuna')(capability='chat',
system='Provide answers in Python')
......@@ -34,7 +74,7 @@ def test_vicuna():
_prompt = None
with pytest.raises(AssertionError):
_prompt = model.get_prompt(prompt, sequence_start=True)
assert _prompt is None
assert _prompt is None
def test_internlm_chat():
......@@ -43,7 +83,7 @@ def test_internlm_chat():
assert model.get_prompt(prompt, sequence_start=True) == prompt
assert model.get_prompt(prompt, sequence_start=False) == prompt
assert model.stop_words is not None
assert model.system == ''
assert model.system == '<|System|>:'
assert model.session_len == 2048
model = MODELS.get('internlm-chat-7b')(capability='chat',
......@@ -56,7 +96,7 @@ def test_internlm_chat():
_prompt = None
with pytest.raises(AssertionError):
_prompt = model.get_prompt(prompt, sequence_start=True)
assert _prompt is None
assert _prompt is None
model = MODELS.get('internlm-chat-7b-8k')()
assert model.session_len == 8192
......@@ -68,7 +108,6 @@ def test_baichuan():
assert model.get_prompt(prompt, sequence_start=True) == prompt
assert model.get_prompt(prompt, sequence_start=False) == prompt
assert model.stop_words is None
assert model.repetition_penalty == 1.1
model = MODELS.get('baichuan-7b')(capability='chat')
_prompt = model.get_prompt(prompt, sequence_start=True)
......@@ -81,19 +120,19 @@ def test_llama2():
assert model.get_prompt(prompt, sequence_start=True) == prompt
assert model.get_prompt(prompt, sequence_start=False) == prompt
assert model.stop_words is None
assert model.default_sys_prompt is not None
assert model.meta_instruction is not None
model = MODELS.get('llama2')(capability='chat',
system='Provide answers in Python')
meta_instruction='Provide answers in Python')
assert model.get_prompt(prompt, sequence_start=True) != prompt
assert model.get_prompt(prompt, sequence_start=False) != prompt
assert model.default_sys_prompt == 'Provide answers in Python'
assert model.meta_instruction == 'Provide answers in Python'
model = MODELS.get('llama2')(capability='voice')
_prompt = None
with pytest.raises(AssertionError):
_prompt = model.get_prompt(prompt, sequence_start=True)
assert _prompt is None
assert _prompt is None
def test_qwen():
......@@ -111,7 +150,7 @@ def test_qwen():
_prompt = None
with pytest.raises(AssertionError):
_prompt = model.get_prompt(prompt, sequence_start=True)
assert _prompt is None
assert _prompt is None
def test_codellama_completion():
......@@ -167,39 +206,3 @@ def test_codellama_others():
with pytest.raises(AssertionError):
model = MODELS.get('codellama')(capability='java')
assert model is None
def test_sampling_param():
model = MODELS.get('llama')()
default_sampling_param = SamplingParam()
assert model.sampling_param == default_sampling_param
model = MODELS.get('llama')(top_p=0.1, top_k=10)
assert model.sampling_param.top_p == 0.1 and \
model.sampling_param.top_k == 10
assert model.sampling_param.temperature == 0.8 and \
model.sampling_param.repetition_penalty == 1.0
model = MODELS.get('codellama')(capability='completion')
assert model.sampling_param.top_p == 0.9 and \
model.sampling_param.top_k is None and \
model.sampling_param.temperature == 0.2 and \
model.sampling_param.repetition_penalty == 1.0
model = MODELS.get('codellama')(capability='chat')
assert model.sampling_param.top_p == 0.95 and \
model.sampling_param.top_k is None and \
model.sampling_param.temperature == 0.2 and \
model.sampling_param.repetition_penalty == 1.0
model = MODELS.get('codellama')(capability='infilling')
assert model.sampling_param.top_p == 0.9 and \
model.sampling_param.top_k is None and \
model.sampling_param.temperature == 0.0 and \
model.sampling_param.repetition_penalty == 1.0
model = MODELS.get('codellama')(capability='python')
assert model.sampling_param.top_p == 0.9 and \
model.sampling_param.top_k is None and \
model.sampling_param.temperature == 0.2 and \
model.sampling_param.repetition_penalty == 1.0
import random
import pytest
from lmdeploy.tokenizer import HuggingFaceTokenizer
from lmdeploy.tokenizer import DetokenizeState, HuggingFaceTokenizer
@pytest.mark.parametrize('model_path', [
'internlm/internlm-chat-7b', 'Qwen/Qwen-7B-Chat',
'baichuan-inc/Baichuan2-7B-Chat', 'upstage/SOLAR-0-70b-16bit',
'baichuan-inc/Baichuan-7B', 'codellama/CodeLlama-7b-hf',
'upstage/SOLAR-0-70b-16bit',
'THUDM/chatglm2-6b', '01-ai/Yi-6B-200k', '01-ai/Yi-34B-Chat',
'01-ai/Yi-6B-Chat', 'WizardLM/WizardLM-70B-V1.0',
'codellama/CodeLlama-34b-Instruct-hf', 'tiiuae/falcon-7b'
])
@pytest.mark.parametrize('input', [
'hi, this is a test 😆😆! ' * 5, '為什麼我還在用繁體字 😆😆 gg! ' * 5,
' License at\n#\n#' + ' ' * 100 + 'ht', ' '
])
@pytest.mark.parametrize(
'input', ['hi, this is a test 😆😆! ' * 5, '為什麼我還在用繁體字 😆😆 gg! ' * 5])
def test_tokenizer(model_path, input):
@pytest.mark.parametrize('interval', [1, 3])
@pytest.mark.parametrize('skip_special_tokens', [True, False])
def test_tokenizer(model_path, input, interval, skip_special_tokens):
tokenizer = HuggingFaceTokenizer(model_path)
encoded = tokenizer.encode(input)
encoded = tokenizer.encode(input, False, add_special_tokens=False)
output = ''
offset = 0
for i in range(1, len(encoded) + 1):
decoded = tokenizer.decode(encoded[:i], offset)
if decoded.endswith('�'):
continue
state = DetokenizeState()
for i in range(0, len(encoded), interval):
offset = i + interval
if offset < len(encoded):
# lmdeploy may decode nothing when concurrency is high
if random.randint(1, 10) < 4:
offset -= interval
decoded, state = tokenizer.detokenize_incrementally(
encoded[:offset], state, skip_special_tokens)
output += decoded
offset = i
assert input == output, 'input string should equal the output after encode-decode'
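
Incremental detokenization has to carry state because multi-byte characters (and sentencepiece whitespace) can be split across tokens: a prefix may temporarily decode to '�', and the newly emitted text must be computed as a diff against what was already yielded. A plain-transformers sketch of the idea (lmdeploy's DetokenizeState additionally avoids re-decoding the whole prefix on every step):

from transformers import AutoTokenizer

def stream_decode(tokenizer, token_ids):
    """Yield only the newly completed text as tokens arrive one by one."""
    emitted = ''
    for i in range(1, len(token_ids) + 1):
        text = tokenizer.decode(token_ids[:i])
        if text.endswith('\ufffd'):
            continue                      # incomplete UTF-8, wait for more tokens
        new_text = text[len(emitted):]
        emitted = text
        if new_text:
            yield new_text

# usage (downloads a tokenizer):
# tok = AutoTokenizer.from_pretrained('internlm/internlm-chat-7b', trust_remote_code=True)
# print(''.join(stream_decode(tok, tok.encode('為什麼 😆', add_special_tokens=False))))
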
@pytest.mark.parametrize('model_path', [
'internlm/internlm-chat-7b', 'Qwen/Qwen-7B-Chat',
'baichuan-inc/Baichuan2-7B-Chat', 'codellama/CodeLlama-7b-hf',
'upstage/SOLAR-0-70b-16bit'
])
@pytest.mark.parametrize('stop_words', ['.', ' ', '?', ''])
def test_tokenizer_with_stop_words(model_path, stop_words):
tokenizer = HuggingFaceTokenizer(model_path)
indexes = tokenizer.indexes_containing_token(stop_words)
assert indexes is not None
def test_qwen_vl_decode_special():
from lmdeploy.tokenizer import Tokenizer
tok = Tokenizer('Qwen/Qwen-VL-Chat')
try:
tok.decode([151857])
assert (0)
except Exception as e:
assert str(e) == 'Unclosed image token'